From a0f9a0d350d5abe5070d34f6f5b6bc7044ef071f Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Sun, 27 Feb 2022 01:52:33 +0300 Subject: [PATCH 01/55] safekeeper to console call fix (#1333) (#1334) --- .circleci/ansible/deploy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/ansible/deploy.yaml b/.circleci/ansible/deploy.yaml index 06385aa0d3..c95524a8a5 100644 --- a/.circleci/ansible/deploy.yaml +++ b/.circleci/ansible/deploy.yaml @@ -169,6 +169,6 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ hostvars.localhost.zenith.console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID + curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID tags: - safekeeper From c7c1e19667e7c2ff4b699d28c314ef7cce2770b0 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 1 Mar 2022 13:52:49 +0200 Subject: [PATCH 02/55] Use more generics, less dyn --- pageserver/src/basebackup.rs | 12 +++--- pageserver/src/branches.rs | 17 ++++---- pageserver/src/http/routes.rs | 4 +- pageserver/src/import_datadir.rs | 15 ++++--- pageserver/src/layered_repository.rs | 16 ++++--- pageserver/src/lib.rs | 6 +++ pageserver/src/page_service.rs | 18 ++++---- pageserver/src/repository.rs | 36 ++++++++-------- pageserver/src/tenant_mgr.rs | 11 ++--- pageserver/src/tenant_threads.rs | 1 + pageserver/src/walingest.rs | 63 +++++++++++++++------------- pageserver/src/walreceiver.rs | 6 ++- 12 files changed, 113 insertions(+), 92 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 1ee48eb2fc..396e93acc1 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -29,9 +29,9 @@ use zenith_utils::lsn::Lsn; /// This is short-living object only for the time of tarball creation, /// created mostly to avoid passing a lot of parameters between various functions /// used for constructing tarball. -pub struct Basebackup<'a> { +pub struct Basebackup<'a, T> { ar: Builder<&'a mut dyn Write>, - timeline: &'a Arc<dyn Timeline>, + timeline: &'a Arc<T>, pub lsn: Lsn, prev_record_lsn: Lsn, } @@ -43,12 +43,14 @@ pub struct Basebackup<'a> { // * When working without safekeepers. In this situation it is important to match the lsn // we are taking basebackup on with the lsn that is used in pageserver's walreceiver // to start the replication. -impl<'a> Basebackup<'a> { +impl<'a, T> Basebackup<'a, T> +where T: Timeline, +{ pub fn new( write: &'a mut dyn Write, - timeline: &'a Arc<dyn Timeline>, + timeline: &'a Arc<T>, req_lsn: Option<Lsn>, - ) -> Result<Basebackup<'a>> { + ) -> Result<Basebackup<'a, T>> { // Compute postgres doesn't have any previous WAL files, but the first // record that it's going to write needs to include the LSN of the // previous record (xl_prev).
We include prev_record_lsn in the diff --git a/pageserver/src/branches.rs b/pageserver/src/branches.rs index 43f27af5ea..8b27762ed5 100644 --- a/pageserver/src/branches.rs +++ b/pageserver/src/branches.rs @@ -25,6 +25,7 @@ use crate::CheckpointConfig; use crate::{config::PageServerConf, repository::Repository}; use crate::{import_datadir, LOG_FILE_NAME}; use crate::{repository::RepositoryTimeline, tenant_mgr}; +use crate::repository::Timeline; #[derive(Serialize, Deserialize, Clone)] pub struct BranchInfo { @@ -39,9 +40,9 @@ pub struct BranchInfo { } impl BranchInfo { - pub fn from_path>( - path: T, - repo: &Arc, + pub fn from_path>( + path: P, + repo: &R, include_non_incremental_logical_size: bool, ) -> Result { let path = path.as_ref(); @@ -129,7 +130,7 @@ pub fn create_repo( conf: &'static PageServerConf, tenantid: ZTenantId, wal_redo_manager: Arc, -) -> Result> { +) -> Result> { let repo_dir = conf.tenant_path(&tenantid); if repo_dir.exists() { bail!("repo for {} already exists", tenantid) @@ -211,11 +212,11 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> { // - run initdb to init temporary instance and get bootstrap data // - after initialization complete, remove the temp dir. // -fn bootstrap_timeline( +fn bootstrap_timeline( conf: &'static PageServerConf, tenantid: ZTenantId, tli: ZTimelineId, - repo: &dyn Repository, + repo: &R, ) -> Result<()> { let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered(); @@ -234,7 +235,7 @@ fn bootstrap_timeline( let timeline = repo.create_empty_timeline(tli, lsn)?; import_datadir::import_timeline_from_postgres_datadir( &pgdata_path, - timeline.writer().as_ref(), + &*timeline, lsn, )?; timeline.checkpoint(CheckpointConfig::Forced)?; @@ -284,7 +285,7 @@ pub(crate) fn get_branches( })?; BranchInfo::from_path( dir_entry.path(), - &repo, + repo.as_ref(), include_non_incremental_logical_size, ) }) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 26d473efaf..f98978c7c8 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -26,7 +26,7 @@ use super::models::BranchCreateRequest; use super::models::StatusResponse; use super::models::TenantCreateRequest; use crate::branches::BranchInfo; -use crate::repository::RepositoryTimeline; +use crate::repository::{Repository, RepositoryTimeline, Timeline}; use crate::repository::TimelineSyncState; use crate::{branches, config::PageServerConf, tenant_mgr, ZTenantId}; @@ -138,7 +138,7 @@ async fn branch_detail_handler(request: Request) -> Result, let response_data = tokio::task::spawn_blocking(move || { let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered(); let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - BranchInfo::from_path(path, &repo, include_non_incremental_logical_size) + BranchInfo::from_path(path, repo.as_ref(), include_non_incremental_logical_size) }) .await .map_err(ApiError::from_err)??; diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index e317118bb5..dd3d6e7029 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -27,13 +27,16 @@ use zenith_utils::lsn::Lsn; /// This is currently only used to import a cluster freshly created by initdb. /// The code that deals with the checkpoint would not work right if the /// cluster was not shut down cleanly. 
-pub fn import_timeline_from_postgres_datadir( +pub fn import_timeline_from_postgres_datadir( path: &Path, - writer: &dyn TimelineWriter, + timeline: &T, lsn: Lsn, ) -> Result<()> { let mut pg_control: Option = None; + let writer_box = timeline.writer(); + let writer = writer_box.as_ref(); + // Scan 'global' for direntry in fs::read_dir(path.join("global"))? { let direntry = direntry?; @@ -141,6 +144,7 @@ pub fn import_timeline_from_postgres_datadir( // *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'. import_wal( &path.join("pg_wal"), + timeline, writer, Lsn(pg_control.checkPointCopy.redo), lsn, @@ -310,8 +314,9 @@ fn import_slru_file( /// Scan PostgreSQL WAL files in given directory and load all records between /// 'startpoint' and 'endpoint' into the repository. -fn import_wal( +fn import_wal( walpath: &Path, + timeline: &T, writer: &dyn TimelineWriter, startpoint: Lsn, endpoint: Lsn, @@ -322,7 +327,7 @@ fn import_wal( let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE); let mut last_lsn = startpoint; - let mut walingest = WalIngest::new(writer.deref(), startpoint)?; + let mut walingest = WalIngest::new(timeline, startpoint)?; while last_lsn <= endpoint { // FIXME: assume postgresql tli 1 for now @@ -355,7 +360,7 @@ fn import_wal( let mut nrecords = 0; while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { - walingest.ingest_record(writer, recdata, lsn)?; + walingest.ingest_record(timeline, writer, recdata, lsn)?; last_lsn = lsn; nrecords += 1; diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 975b2f5d2b..909477b722 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -136,7 +136,9 @@ pub struct LayeredRepository { /// Public interface impl Repository for LayeredRepository { - fn get_timeline(&self, timelineid: ZTimelineId) -> Result { + type Timeline = LayeredTimeline; + + fn get_timeline(&self, timelineid: ZTimelineId) -> Result> { let mut timelines = self.timelines.lock().unwrap(); Ok( match self.get_or_init_timeline(timelineid, &mut timelines)? { @@ -156,7 +158,7 @@ impl Repository for LayeredRepository { &self, timelineid: ZTimelineId, initdb_lsn: Lsn, - ) -> Result> { + ) -> Result> { let mut timelines = self.timelines.lock().unwrap(); // Create the timeline directory, and write initial metadata to file. @@ -1073,10 +1075,6 @@ impl Timeline for LayeredTimeline { _write_guard: self.write_lock.lock().unwrap(), }) } - - fn upgrade_to_layered_timeline(&self) -> &crate::layered_repository::LayeredTimeline { - self - } } impl LayeredTimeline { @@ -2143,20 +2141,20 @@ impl LayeredTimeline { } } -struct LayeredTimelineWriter<'a> { +pub struct LayeredTimelineWriter<'a> { tl: &'a LayeredTimeline, _write_guard: MutexGuard<'a, ()>, } impl Deref for LayeredTimelineWriter<'_> { - type Target = dyn Timeline; + type Target = LayeredTimeline; fn deref(&self) -> &Self::Target { self.tl } } -impl<'a> TimelineWriter for LayeredTimelineWriter<'a> { +impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> { fn put_wal_record( &self, lsn: Lsn, diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 3a68f56187..a8a878c448 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -22,6 +22,8 @@ use lazy_static::lazy_static; use zenith_metrics::{register_int_gauge_vec, IntGaugeVec}; use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use layered_repository::{LayeredRepository, LayeredTimeline}; + lazy_static! 
{ static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!( "pageserver_live_connections_count", @@ -43,3 +45,7 @@ pub enum CheckpointConfig { // Flush all in-memory data and reconstruct all page images Forced, } + +pub type RepositoryImpl = LayeredRepository; +pub type TimelineImpl = LayeredTimeline; + diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 7dc3c8c752..bf20cfb0db 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -33,7 +33,7 @@ use zenith_utils::zid::{ZTenantId, ZTimelineId}; use crate::basebackup; use crate::config::PageServerConf; use crate::relish::*; -use crate::repository::Timeline; +use crate::repository::{Repository, Timeline}; use crate::tenant_mgr; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; @@ -395,8 +395,8 @@ impl PageServerHandler { /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. - fn wait_or_get_last_lsn( - timeline: &dyn Timeline, + fn wait_or_get_last_lsn( + timeline: &T, mut lsn: Lsn, latest: bool, latest_gc_cutoff_lsn: &RwLockReadGuard, @@ -443,9 +443,9 @@ impl PageServerHandler { Ok(lsn) } - fn handle_get_rel_exists_request( + fn handle_get_rel_exists_request( &self, - timeline: &dyn Timeline, + timeline: &T, req: &PagestreamExistsRequest, ) -> Result { let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered(); @@ -461,9 +461,9 @@ impl PageServerHandler { })) } - fn handle_get_nblocks_request( + fn handle_get_nblocks_request( &self, - timeline: &dyn Timeline, + timeline: &T, req: &PagestreamNblocksRequest, ) -> Result { let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered(); @@ -482,9 +482,9 @@ impl PageServerHandler { })) } - fn handle_get_page_at_lsn_request( + fn handle_get_page_at_lsn_request( &self, - timeline: &dyn Timeline, + timeline: &T, req: &PagestreamGetPageRequest, ) -> Result { let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn) diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 6142953a58..49aa04ea7c 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -6,7 +6,7 @@ use bytes::Bytes; use postgres_ffi::{MultiXactId, MultiXactOffset, TransactionId}; use serde::{Deserialize, Serialize}; use std::collections::HashSet; -use std::ops::{AddAssign, Deref}; +use std::ops::AddAssign; use std::sync::{Arc, RwLockReadGuard}; use std::time::Duration; use zenith_utils::lsn::{Lsn, RecordLsn}; @@ -19,6 +19,8 @@ pub type BlockNumber = u32; /// A repository corresponds to one .zenith directory. One repository holds multiple /// timelines, forked off from the same initial call to 'initdb'. pub trait Repository: Send + Sync { + type Timeline: Timeline; + fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>; /// Updates timeline based on the new sync state, received from the remote storage synchronization. @@ -34,7 +36,7 @@ pub trait Repository: Send + Sync { fn get_timeline_state(&self, timeline_id: ZTimelineId) -> Option; /// Get Timeline handle for given zenith timeline ID. - fn get_timeline(&self, timelineid: ZTimelineId) -> Result; + fn get_timeline(&self, timelineid: ZTimelineId) -> Result>; /// Create a new, empty timeline. 
The caller is responsible for loading data into it /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it. @@ -42,7 +44,7 @@ pub trait Repository: Send + Sync { &self, timelineid: ZTimelineId, initdb_lsn: Lsn, - ) -> Result>; + ) -> Result>; /// Branch a timeline fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>; @@ -69,10 +71,10 @@ pub trait Repository: Send + Sync { } /// A timeline, that belongs to the current repository. -pub enum RepositoryTimeline { +pub enum RepositoryTimeline { /// Timeline, with its files present locally in pageserver's working directory. /// Loaded into pageserver's memory and ready to be used. - Local(Arc), + Local(Arc), /// Timeline, found on the pageserver's remote storage, but not yet downloaded locally. Remote { id: ZTimelineId, @@ -81,8 +83,8 @@ pub enum RepositoryTimeline { }, } -impl RepositoryTimeline { - pub fn local_timeline(&self) -> Option> { +impl RepositoryTimeline { + pub fn local_timeline(&self) -> Option> { if let Self::Local(local_timeline) = self { Some(Arc::clone(local_timeline)) } else { @@ -217,7 +219,6 @@ pub trait Timeline: Send + Sync { // // These are called by the WAL receiver to digest WAL records. //------------------------------------------------------------------------------ - /// Atomically get both last and prev. fn get_last_record_rlsn(&self) -> RecordLsn; @@ -229,6 +230,10 @@ pub trait Timeline: Send + Sync { fn get_disk_consistent_lsn(&self) -> Lsn; /// Mutate the timeline with a [`TimelineWriter`]. + /// + /// FIXME: This ought to return &'a TimelineWriter, where TimelineWriter + /// is a generic type in this trait. But that doesn't currently work in + /// Rust: https://rust-lang.github.io/rfcs/1598-generic_associated_types.html fn writer<'a>(&'a self) -> Box; /// @@ -255,16 +260,13 @@ pub trait Timeline: Send + Sync { /// Does the same as get_current_logical_size but counted on demand. /// Used in tests to ensure that incremental and non incremental variants match. fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result; - - /// An escape hatch to allow "casting" a generic Timeline to LayeredTimeline. - fn upgrade_to_layered_timeline(&self) -> &crate::layered_repository::LayeredTimeline; } /// Various functions to mutate the timeline. // TODO Currently, Deref is used to allow easy access to read methods from this trait. // This is probably considered a bad practice in Rust and should be fixed eventually, // but will cause large code changes. 
-pub trait TimelineWriter: Deref { +pub trait TimelineWriter<'a> { /// Put a new page version that can be constructed from a WAL record /// /// This will implicitly extend the relation, if the page is beyond the @@ -395,15 +397,15 @@ pub mod repo_harness { Ok(Self { conf, tenant_id }) } - pub fn load(&self) -> Box { + pub fn load(&self) -> LayeredRepository { let walredo_mgr = Arc::new(TestRedoManager); - Box::new(LayeredRepository::new( + LayeredRepository::new( self.conf, walredo_mgr, self.tenant_id, false, - )) + ) } pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf { @@ -467,7 +469,7 @@ mod tests { forknum: 0, }); - fn assert_current_logical_size(timeline: &Arc, lsn: Lsn) { + fn assert_current_logical_size(timeline: &Arc, lsn: Lsn) { let incremental = timeline.get_current_logical_size(); let non_incremental = timeline .get_current_logical_size_non_incremental(lsn) @@ -915,7 +917,7 @@ mod tests { Ok(()) } - fn make_some_layers(tline: &Arc, start_lsn: Lsn) -> Result<()> { + fn make_some_layers(tline: &Arc, start_lsn: Lsn) -> Result<()> { let mut lsn = start_lsn; { let writer = tline.writer(); diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index d60b5fefd3..c0b54278cd 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -2,9 +2,10 @@ //! page server. use crate::branches; +use crate::{RepositoryImpl, TimelineImpl}; use crate::config::PageServerConf; use crate::layered_repository::LayeredRepository; -use crate::repository::{Repository, Timeline, TimelineSyncState}; +use crate::repository::{Repository, TimelineSyncState}; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; use crate::walredo::PostgresRedoManager; @@ -24,7 +25,7 @@ lazy_static! { struct Tenant { state: TenantState, - repo: Arc, + repo: Arc, } #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] @@ -78,7 +79,7 @@ pub fn set_timeline_states( let walredo_mgr = PostgresRedoManager::new(conf, tenant_id); // Set up an object repository, for actual data storage. - let repo: Arc = Arc::new(LayeredRepository::new( + let repo: Arc = Arc::new(LayeredRepository::new( conf, Arc::new(walredo_mgr), tenant_id, @@ -248,7 +249,7 @@ pub fn activate_tenant(conf: &'static PageServerConf, tenantid: ZTenantId) -> Re Ok(()) } -pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result> { +pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result> { let m = access_tenants(); let tenant = m .get(&tenantid) @@ -260,7 +261,7 @@ pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result Result> { +) -> Result> { get_repository_for_tenant(tenantid)? .get_timeline(timelineid)? .local_timeline() diff --git a/pageserver/src/tenant_threads.rs b/pageserver/src/tenant_threads.rs index 062af9f1ad..673a92b80d 100644 --- a/pageserver/src/tenant_threads.rs +++ b/pageserver/src/tenant_threads.rs @@ -1,6 +1,7 @@ //! This module contains functions to serve per-tenant background processes, //! 
such as checkpointer and GC use crate::config::PageServerConf; +use crate::repository::Repository; use crate::tenant_mgr; use crate::tenant_mgr::TenantState; use crate::CheckpointConfig; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 1962c9bbd3..615a9960fe 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -46,7 +46,7 @@ pub struct WalIngest { } impl WalIngest { - pub fn new(timeline: &dyn Timeline, startpoint: Lsn) -> Result { + pub fn new(timeline: &T, startpoint: Lsn) -> Result { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. let checkpoint_bytes = timeline.get_page_at_lsn(RelishTag::Checkpoint, 0, startpoint)?; @@ -66,9 +66,10 @@ impl WalIngest { /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the /// relations/pages that the record affects. /// - pub fn ingest_record( + pub fn ingest_record( &mut self, - timeline: &dyn TimelineWriter, + timeline: &T, + writer: &dyn TimelineWriter, recdata: Bytes, lsn: Lsn, ) -> Result<()> { @@ -86,7 +87,7 @@ impl WalIngest { if decoded.xl_rmid == pg_constants::RM_HEAP_ID || decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - self.ingest_heapam_record(&mut buf, timeline, lsn, &mut decoded)?; + self.ingest_heapam_record(&mut buf, writer, lsn, &mut decoded)?; } // Handle other special record types if decoded.xl_rmid == pg_constants::RM_SMGR_ID @@ -94,13 +95,13 @@ impl WalIngest { == pg_constants::XLOG_SMGR_TRUNCATE { let truncate = XlSmgrTruncate::decode(&mut buf); - self.ingest_xlog_smgr_truncate(timeline, lsn, &truncate)?; + self.ingest_xlog_smgr_truncate(writer, lsn, &truncate)?; } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_CREATE { let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(timeline, lsn, &createdb)?; + self.ingest_xlog_dbase_create(timeline, writer, lsn, &createdb)?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_DROP { @@ -113,7 +114,7 @@ impl WalIngest { for tablespace_id in dropdb.tablespace_ids { let rels = timeline.list_rels(tablespace_id, dropdb.db_id, req_lsn)?; for rel in rels { - timeline.drop_relish(rel, lsn)?; + writer.drop_relish(rel, lsn)?; } trace!( "Drop FileNodeMap {}, {} at lsn {}", @@ -121,7 +122,7 @@ impl WalIngest { dropdb.db_id, lsn ); - timeline.drop_relish( + writer.drop_relish( RelishTag::FileNodeMap { spcnode: tablespace_id, dbnode: dropdb.db_id, @@ -138,7 +139,7 @@ impl WalIngest { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - timeline.put_page_image( + writer.put_page_image( RelishTag::Slru { slru: SlruKind::Clog, segno, @@ -150,7 +151,7 @@ impl WalIngest { } else { assert!(info == pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(&mut buf); - self.ingest_clog_truncate_record(timeline, lsn, &xlrec)?; + self.ingest_clog_truncate_record(timeline, writer, lsn, &xlrec)?; } } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; @@ -158,7 +159,7 @@ impl WalIngest { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); self.ingest_xact_record( - timeline, + writer, lsn, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, @@ -169,7 +170,7 @@ impl WalIngest { 
let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); self.ingest_xact_record( - timeline, + writer, lsn, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, @@ -181,14 +182,14 @@ impl WalIngest { parsed_xact.xid, lsn ); - timeline.drop_relish( + writer.drop_relish( RelishTag::TwoPhase { xid: parsed_xact.xid, }, lsn, )?; } else if info == pg_constants::XLOG_XACT_PREPARE { - timeline.put_page_image( + writer.put_page_image( RelishTag::TwoPhase { xid: decoded.xl_xid, }, @@ -204,7 +205,7 @@ impl WalIngest { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - timeline.put_page_image( + writer.put_page_image( RelishTag::Slru { slru: SlruKind::MultiXactOffsets, segno, @@ -217,7 +218,7 @@ impl WalIngest { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - timeline.put_page_image( + writer.put_page_image( RelishTag::Slru { slru: SlruKind::MultiXactMembers, segno, @@ -228,14 +229,14 @@ impl WalIngest { )?; } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { let xlrec = XlMultiXactCreate::decode(&mut buf); - self.ingest_multixact_create_record(timeline, lsn, &xlrec)?; + self.ingest_multixact_create_record(writer, lsn, &xlrec)?; } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { let xlrec = XlMultiXactTruncate::decode(&mut buf); - self.ingest_multixact_truncate_record(timeline, lsn, &xlrec)?; + self.ingest_multixact_truncate_record(writer, lsn, &xlrec)?; } } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(timeline, lsn, &xlrec, &decoded)?; + self.ingest_relmap_page(writer, lsn, &xlrec, &decoded)?; } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_NEXTOID { @@ -270,20 +271,20 @@ impl WalIngest { // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. for blk in decoded.blocks.iter() { - self.ingest_decoded_block(timeline, lsn, &decoded, blk)?; + self.ingest_decoded_block(writer, lsn, &decoded, blk)?; } // If checkpoint data was updated, store the new version in the repository if self.checkpoint_modified { let new_checkpoint_bytes = self.checkpoint.encode(); - timeline.put_page_image(RelishTag::Checkpoint, 0, lsn, new_checkpoint_bytes)?; + writer.put_page_image(RelishTag::Checkpoint, 0, lsn, new_checkpoint_bytes)?; self.checkpoint_modified = false; } // Now that this record has been fully handled, including updating the // checkpoint data, let the repository know that it is up-to-date to this LSN - timeline.advance_last_record_lsn(lsn); + writer.advance_last_record_lsn(lsn); Ok(()) } @@ -465,9 +466,10 @@ impl WalIngest { } /// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record. 
- fn ingest_xlog_dbase_create( + fn ingest_xlog_dbase_create<T: Timeline>( &mut self, - timeline: &dyn TimelineWriter, + timeline: &T, + writer: &dyn TimelineWriter, lsn: Lsn, rec: &XlCreateDatabase, ) -> Result<()> { @@ -508,13 +510,13 @@ impl WalIngest { debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel); - timeline.put_page_image(RelishTag::Relation(dst_rel), blknum, lsn, content)?; + writer.put_page_image(RelishTag::Relation(dst_rel), blknum, lsn, content)?; num_blocks_copied += 1; } if nblocks == 0 { // make sure we have some trace of the relation, even if it's empty - timeline.put_truncation(RelishTag::Relation(dst_rel), lsn, 0)?; + writer.put_truncation(RelishTag::Relation(dst_rel), lsn, 0)?; } num_rels_copied += 1; @@ -532,7 +534,7 @@ impl WalIngest { spcnode: tablespace_id, dbnode: db_id, }; - timeline.put_page_image(new_tag, 0, lsn, img)?; + writer.put_page_image(new_tag, 0, lsn, img)?; break; } } @@ -680,9 +682,10 @@ impl WalIngest { Ok(()) } - fn ingest_clog_truncate_record( + fn ingest_clog_truncate_record<T: Timeline>( &mut self, - timeline: &dyn TimelineWriter, + timeline: &T, + writer: &dyn TimelineWriter, lsn: Lsn, xlrec: &XlClogTruncate, ) -> Result<()> { @@ -732,7 +735,7 @@ impl WalIngest { if slru == SlruKind::Clog { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; if slru_may_delete_clogsegment(segpage, xlrec.pageno) { - timeline.drop_relish(RelishTag::Slru { slru, segno }, lsn)?; + writer.drop_relish(RelishTag::Slru { slru, segno }, lsn)?; trace!("Drop CLOG segment {:>04X} at lsn {}", segno, lsn); } } diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 6fff1d062d..abf6bace22 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -6,6 +6,8 @@ //! We keep one WAL receiver active per timeline. use crate::config::PageServerConf; +use crate::repository::Repository; +use crate::repository::Timeline; use crate::tenant_mgr; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; @@ -250,11 +252,11 @@ fn walreceiver_main( // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. Without this alignment we are - // at risk of hittind a deadlock. + // at risk of hitting a deadlock. assert!(lsn.is_aligned()); let writer = timeline.writer(); - walingest.ingest_record(writer.as_ref(), recdata, lsn)?; + walingest.ingest_record(&*timeline, writer.as_ref(), recdata, lsn)?; fail_point!("walreceiver-after-ingest"); From 6127b6638bb78fce23df353e29c0ff917dc4da33 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 7 Mar 2022 14:04:46 +0200 Subject: [PATCH 03/55] Major storage format rewrite Major changes and new concepts: Simplify Repository to a value-store ------------------------------------ Move the responsibility of tracking relation metadata, like which relations exist and what their sizes are, from Repository to a new module, pgdatadir_mapping.rs. The interface to Repository now consists of simple key-value PUT/GET operations. It's still not any old key-value store though. A Repository is still responsible for handling branching, and every GET operation comes with an LSN. Key --- The key to the Repository key-value store is a Key struct, which consists of a few integer fields. It's wide enough to store a full RelFileNode, fork and block number, and to distinguish those from metadata keys. See pgdatadir_mapping.rs for how relation blocks and metadata keys are mapped to the Key struct.
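For orientation, here is a minimal sketch of the shape such a key can take. The field names, the discriminator value, and the helper function below are illustrative assumptions, not necessarily the exact definitions in this patch; the authoritative versions live in repository.rs and pgdatadir_mapping.rs:

    // Sketch only: a fixed-width key with room for a full block address
    // (spcnode, dbnode, relnode, forknum, blknum), plus a leading
    // discriminator byte that keeps metadata keys in a separate part of
    // the key space from relation data.
    pub struct Key {
        pub field1: u8,  // key-space discriminator (assumed: 0x00 = relation data)
        pub field2: u32, // spcnode
        pub field3: u32, // dbnode
        pub field4: u32, // relnode
        pub field5: u8,  // forknum
        pub field6: u32, // blknum
    }

    // Sketch: mapping one relation block to a key. RelTag is the existing
    // struct from relish.rs; the helper name here is hypothetical.
    pub fn rel_block_to_key(rel: RelTag, blknum: u32) -> Key {
        Key {
            field1: 0x00,
            field2: rel.spcnode,
            field3: rel.dbnode,
            field4: rel.relnode,
            field5: rel.forknum,
            field6: blknum,
        }
    }

Because a Key is just a fixed tuple of integers, keys have a total order, which is what lets a layer file cover an arbitrary contiguous Key range, as described in the next section.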
Store arbitrary key-ranges in the layer files --------------------------------------------- The concept of a "segment" is gone. Each layer file can store an arbitrary range of Keys. TODO: - Deleting keys, to reclaim space. This isn't visible to Postgres: dropping or truncating a relation works as you would expect if you look at it from the compute node. If you drop a relation, for example, the relation is removed from the metadata entry, so that it appears to be gone. However, the layered repository implementation never reclaims the storage. - Tracking "logical database size", for disk space quotas. That ought to be reimplemented now in pgdatadir_mapping.rs, or perhaps in walingest.rs. - LSM compaction. The logic for checkpointing and creating image layers is very dumb. AFAIK the *read* code could now deal with a full-fledged LSM tree consisting of the delta and image layers. But there's no code to take a bunch of delta layers and compact them, and the heuristics for when to create image layers are pretty dumb. - The code to track the layers is inefficient. All layers are just stored in a vector, and whenever we need to find a layer, we do a linear search in it. --- Cargo.lock | 1 + pageserver/Cargo.toml | 1 + pageserver/src/basebackup.rs | 77 +- pageserver/src/branches.rs | 43 +- pageserver/src/http/routes.rs | 3 +- pageserver/src/import_datadir.rs | 200 +- pageserver/src/layered_repository.rs | 1910 +++++++---------- .../src/layered_repository/delta_layer.rs | 700 +++--- pageserver/src/layered_repository/filename.rs | 299 +-- .../layered_repository/global_layer_map.rs | 142 -- .../src/layered_repository/image_layer.rs | 348 ++- .../src/layered_repository/inmemory_layer.rs | 711 ++---- .../src/layered_repository/interval_tree.rs | 468 ---- .../src/layered_repository/layer_map.rs | 698 +++--- .../src/layered_repository/storage_layer.rs | 126 +- pageserver/src/layered_repository/utils.rs | 48 + pageserver/src/lib.rs | 6 +- pageserver/src/page_service.rs | 115 +- pageserver/src/pgdatadir_mapping.rs | 1173 ++++++++++ pageserver/src/remote_storage/local_fs.rs | 2 +- pageserver/src/remote_storage/storage_sync.rs | 2 +- pageserver/src/repository.rs | 1085 +++------- pageserver/src/tenant_mgr.rs | 36 +- pageserver/src/tenant_threads.rs | 9 + pageserver/src/walingest.rs | 865 ++++++-- pageserver/src/walreceiver.rs | 4 +- pageserver/src/walrecord.rs | 64 +- pageserver/src/walredo.rs | 166 +- postgres_ffi/src/pg_constants.rs | 4 +- test_runner/fixtures/utils.py | 5 +- 30 files changed, 4483 insertions(+), 4828 deletions(-) delete mode 100644 pageserver/src/layered_repository/global_layer_map.rs delete mode 100644 pageserver/src/layered_repository/interval_tree.rs create mode 100644 pageserver/src/layered_repository/utils.rs create mode 100644 pageserver/src/pgdatadir_mapping.rs diff --git a/Cargo.lock b/Cargo.lock index ad38a41d91..8b422f0481 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1307,6 +1307,7 @@ dependencies = [ "hex-literal", "humantime", "hyper", + "itertools", "lazy_static", "log", "nix", diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index cfcb453732..c2c0e7efba 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -12,6 +12,7 @@ bytes = { version = "1.0.1", features = ['serde'] } byteorder = "1.4.3" futures = "0.3.13" hyper = "0.14" +itertools = "0.10.3" lazy_static = "1.4.0" log = "0.4.14" clap = "3.0" diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 396e93acc1..8303804213 100644 --- a/pageserver/src/basebackup.rs +++
b/pageserver/src/basebackup.rs @@ -22,6 +22,7 @@ use tar::{Builder, EntryType, Header}; use crate::relish::*; use crate::repository::Timeline; +use crate::DatadirTimelineImpl; use postgres_ffi::xlog_utils::*; use postgres_ffi::*; use zenith_utils::lsn::Lsn; @@ -29,9 +30,9 @@ use zenith_utils::lsn::Lsn; /// This is short-living object only for the time of tarball creation, /// created mostly to avoid passing a lot of parameters between various functions /// used for constructing tarball. -pub struct Basebackup<'a, T> { +pub struct Basebackup<'a> { ar: Builder<&'a mut dyn Write>, - timeline: &'a Arc, + timeline: &'a Arc, pub lsn: Lsn, prev_record_lsn: Lsn, } @@ -43,14 +44,12 @@ pub struct Basebackup<'a, T> { // * When working without safekeepers. In this situation it is important to match the lsn // we are taking basebackup on with the lsn that is used in pageserver's walreceiver // to start the replication. -impl<'a, T> Basebackup<'a, T> -where T: Timeline, -{ +impl<'a> Basebackup<'a> { pub fn new( write: &'a mut dyn Write, - timeline: &'a Arc, + timeline: &'a Arc, req_lsn: Option, - ) -> Result> { + ) -> Result> { // Compute postgres doesn't have any previous WAL files, but the first // record that it's going to write needs to include the LSN of the // previous record (xl_prev). We include prev_record_lsn in the @@ -66,7 +65,7 @@ where T: Timeline, // prev_lsn to Lsn(0) if we cannot provide the correct value. let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn { // Backup was requested at a particular LSN. Wait for it to arrive. - timeline.wait_lsn(req_lsn)?; + timeline.tline.wait_lsn(req_lsn)?; // If the requested point is the end of the timeline, we can // provide prev_lsn. (get_last_record_rlsn() might return it as @@ -117,20 +116,21 @@ where T: Timeline, } // Gather non-relational files from object storage pages. - for obj in self.timeline.list_nonrels(self.lsn)? { - match obj { - RelishTag::Slru { slru, segno } => { - self.add_slru_segment(slru, segno)?; - } - RelishTag::FileNodeMap { spcnode, dbnode } => { - self.add_relmap_file(spcnode, dbnode)?; - } - RelishTag::TwoPhase { xid } => { - self.add_twophase_file(xid)?; - } - _ => {} + for kind in [ + SlruKind::Clog, + SlruKind::MultiXactOffsets, + SlruKind::MultiXactMembers, + ] { + for segno in self.timeline.list_slru_segments(kind, self.lsn)? { + self.add_slru_segment(kind, segno)?; } } + for (spcnode, dbnode) in self.timeline.list_relmap_files(self.lsn)? { + self.add_relmap_file(spcnode, dbnode)?; + } + for xid in self.timeline.list_twophase_files(self.lsn)? { + self.add_twophase_file(xid)?; + } // Generate pg_control and bootstrap WAL segment. self.add_pgcontrol_file()?; @@ -143,27 +143,14 @@ where T: Timeline, // Generate SLRU segment files from repository. 
// fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { - let seg_size = self - .timeline - .get_relish_size(RelishTag::Slru { slru, segno }, self.lsn)?; - - if seg_size == None { - trace!( - "SLRU segment {}/{:>04X} was truncated", - slru.to_str(), - segno - ); - return Ok(()); - } - - let nblocks = seg_size.unwrap(); + let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?; let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * pg_constants::BLCKSZ as usize); for blknum in 0..nblocks { - let img = - self.timeline - .get_page_at_lsn(RelishTag::Slru { slru, segno }, blknum, self.lsn)?; + let img = self + .timeline + .get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?; assert!(img.len() == pg_constants::BLCKSZ as usize); slru_buf.extend_from_slice(&img); @@ -182,11 +169,7 @@ where T: Timeline, // Along with them also send PG_VERSION for each database. // fn add_relmap_file(&mut self, spcnode: u32, dbnode: u32) -> anyhow::Result<()> { - let img = self.timeline.get_page_at_lsn( - RelishTag::FileNodeMap { spcnode, dbnode }, - 0, - self.lsn, - )?; + let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?; let path = if spcnode == pg_constants::GLOBALTABLESPACE_OID { let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes(); let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?; @@ -222,9 +205,7 @@ where T: Timeline, // Extract twophase state files // fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { - let img = self - .timeline - .get_page_at_lsn(RelishTag::TwoPhase { xid }, 0, self.lsn)?; + let img = self.timeline.get_twophase_file(xid, self.lsn)?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -244,11 +225,11 @@ where T: Timeline, fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> { let checkpoint_bytes = self .timeline - .get_page_at_lsn(RelishTag::Checkpoint, 0, self.lsn) + .get_checkpoint(self.lsn) .context("failed to get checkpoint bytes")?; let pg_control_bytes = self .timeline - .get_page_at_lsn(RelishTag::ControlFile, 0, self.lsn) + .get_control_file(self.lsn) .context("failed get control bytes")?; let mut pg_control = ControlFileData::decode(&pg_control_bytes)?; let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?; @@ -269,7 +250,7 @@ where T: Timeline, // add zenith.signal file let mut zenith_signal = String::new(); if self.prev_record_lsn == Lsn(0) { - if self.lsn == self.timeline.get_ancestor_lsn() { + if self.lsn == self.timeline.tline.get_ancestor_lsn() { write!(zenith_signal, "PREV LSN: none")?; } else { write!(zenith_signal, "PREV LSN: invalid")?; diff --git a/pageserver/src/branches.rs b/pageserver/src/branches.rs index 8b27762ed5..e9280f3de7 100644 --- a/pageserver/src/branches.rs +++ b/pageserver/src/branches.rs @@ -20,12 +20,14 @@ use zenith_utils::lsn::Lsn; use zenith_utils::zid::{ZTenantId, ZTimelineId}; use zenith_utils::{crashsafe_dir, logging}; +use crate::config::PageServerConf; +use crate::pgdatadir_mapping::DatadirTimeline; +use crate::repository::{Repository, Timeline}; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; -use crate::{config::PageServerConf, repository::Repository}; +use crate::RepositoryImpl; use crate::{import_datadir, LOG_FILE_NAME}; use crate::{repository::RepositoryTimeline, tenant_mgr}; -use crate::repository::Timeline; #[derive(Serialize, Deserialize, Clone)] pub struct BranchInfo { @@ -40,10 +42,10 @@ pub struct BranchInfo { } impl BranchInfo { - pub fn from_path>( - path: P, + pub fn 
from_path>( + path: T, repo: &R, - include_non_incremental_logical_size: bool, + _include_non_incremental_logical_size: bool, ) -> Result { let path = path.as_ref(); let name = path.file_name().unwrap().to_string_lossy().to_string(); @@ -74,11 +76,17 @@ impl BranchInfo { // non incremental size calculation can be heavy, so let it be optional // needed for tests to check size calculation + // + // FIXME + /* let current_logical_size_non_incremental = include_non_incremental_logical_size .then(|| { timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn()) }) .transpose()?; + */ + let current_logical_size_non_incremental = Some(0); + let current_logical_size = 0; Ok(BranchInfo { name, @@ -86,7 +94,7 @@ impl BranchInfo { latest_valid_lsn: timeline.get_last_record_lsn(), ancestor_id, ancestor_lsn, - current_logical_size: timeline.get_current_logical_size(), + current_logical_size, // : timeline.get_current_logical_size(), current_logical_size_non_incremental, }) } @@ -130,7 +138,7 @@ pub fn create_repo( conf: &'static PageServerConf, tenantid: ZTenantId, wal_redo_manager: Arc, -) -> Result> { +) -> Result> { let repo_dir = conf.tenant_path(&tenantid); if repo_dir.exists() { bail!("repo for {} already exists", tenantid) @@ -152,19 +160,19 @@ pub fn create_repo( crashsafe_dir::create_dir(&timelinedir)?; - let repo = Arc::new(crate::layered_repository::LayeredRepository::new( + let repo = crate::layered_repository::LayeredRepository::new( conf, wal_redo_manager, tenantid, conf.remote_storage_config.is_some(), - )); + ); // Load data into pageserver // TODO To implement zenith import we need to // move data loading out of create_repo() - bootstrap_timeline(conf, tenantid, timeline_id, repo.as_ref())?; + bootstrap_timeline(conf, tenantid, timeline_id, &repo)?; - Ok(repo) + Ok(Arc::new(repo)) } // Returns checkpoint LSN from controlfile @@ -233,17 +241,16 @@ fn bootstrap_timeline( // Initdb lsn will be equal to last_record_lsn which will be set after import. // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. 
let timeline = repo.create_empty_timeline(tli, lsn)?; - import_datadir::import_timeline_from_postgres_datadir( - &pgdata_path, - &*timeline, - lsn, - )?; - timeline.checkpoint(CheckpointConfig::Forced)?; + + let page_tline: DatadirTimeline = DatadirTimeline::new(timeline); + + import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &page_tline, lsn)?; + page_tline.tline.checkpoint(CheckpointConfig::Forced)?; println!( "created initial timeline {} timeline.lsn {}", tli, - timeline.get_last_record_lsn() + page_tline.tline.get_last_record_lsn() ); let data = tli.to_string(); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index f98978c7c8..4794bf72b9 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -26,8 +26,9 @@ use super::models::BranchCreateRequest; use super::models::StatusResponse; use super::models::TenantCreateRequest; use crate::branches::BranchInfo; -use crate::repository::{Repository, RepositoryTimeline, Timeline}; +use crate::repository::RepositoryTimeline; use crate::repository::TimelineSyncState; +use crate::repository::{Repository, Timeline}; use crate::{branches, config::PageServerConf, tenant_mgr, ZTenantId}; #[derive(Debug)] diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index dd3d6e7029..5653c9a7ad 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -11,14 +11,15 @@ use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use tracing::*; +use crate::pgdatadir_mapping::*; use crate::relish::*; -use crate::repository::*; +use crate::repository::Repository; use crate::walingest::WalIngest; use postgres_ffi::relfile_utils::*; use postgres_ffi::waldecoder::*; use postgres_ffi::xlog_utils::*; -use postgres_ffi::Oid; use postgres_ffi::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED}; +use postgres_ffi::{Oid, TransactionId}; use zenith_utils::lsn::Lsn; /// @@ -27,45 +28,43 @@ use zenith_utils::lsn::Lsn; /// This is currently only used to import a cluster freshly created by initdb. /// The code that deals with the checkpoint would not work right if the /// cluster was not shut down cleanly. -pub fn import_timeline_from_postgres_datadir( +pub fn import_timeline_from_postgres_datadir( path: &Path, - timeline: &T, + tline: &DatadirTimeline, lsn: Lsn, ) -> Result<()> { let mut pg_control: Option = None; - let writer_box = timeline.writer(); - let writer = writer_box.as_ref(); + let mut writer = tline.begin_record(lsn); + writer.init_empty()?; // Scan 'global' + let mut relfiles: Vec = Vec::new(); + writer.put_dbdir_creation(pg_constants::GLOBALTABLESPACE_OID, 0)?; for direntry in fs::read_dir(path.join("global"))? 
{ let direntry = direntry?; match direntry.file_name().to_str() { None => continue, Some("pg_control") => { - pg_control = Some(import_control_file(writer, lsn, &direntry.path())?); + pg_control = Some(import_control_file(&mut writer, &direntry.path())?); + } + Some("pg_filenode.map") => { + import_relmap_file( + &mut writer, + pg_constants::GLOBALTABLESPACE_OID, + 0, + &direntry.path(), + )?; } - Some("pg_filenode.map") => import_nonrel_file( - writer, - lsn, - RelishTag::FileNodeMap { - spcnode: pg_constants::GLOBALTABLESPACE_OID, - dbnode: 0, - }, - &direntry.path(), - )?, - // Load any relation files into the page server - _ => import_relfile( - &direntry.path(), - writer, - lsn, - pg_constants::GLOBALTABLESPACE_OID, - 0, - )?, + // Load any relation files into the page server (but only after the other files) + _ => relfiles.push(direntry.path()), } } + for relfile in relfiles { + import_relfile(&mut writer, &relfile, pg_constants::GLOBALTABLESPACE_OID, 0)?; + } // Scan 'base'. It contains database dirs, the database OID is the filename. // E.g. 'base/12345', where 12345 is the database OID. @@ -79,54 +78,56 @@ pub fn import_timeline_from_postgres_datadir( let dboid = direntry.file_name().to_str().unwrap().parse::()?; + let mut relfiles: Vec = Vec::new(); for direntry in fs::read_dir(direntry.path())? { let direntry = direntry?; match direntry.file_name().to_str() { None => continue, - Some("PG_VERSION") => continue, - Some("pg_filenode.map") => import_nonrel_file( - writer, - lsn, - RelishTag::FileNodeMap { - spcnode: pg_constants::DEFAULTTABLESPACE_OID, - dbnode: dboid, - }, + Some("PG_VERSION") => { + writer.put_dbdir_creation(pg_constants::DEFAULTTABLESPACE_OID, dboid)?; + } + Some("pg_filenode.map") => import_relmap_file( + &mut writer, + pg_constants::DEFAULTTABLESPACE_OID, + dboid, &direntry.path(), )?, // Load any relation files into the page server - _ => import_relfile( - &direntry.path(), - writer, - lsn, - pg_constants::DEFAULTTABLESPACE_OID, - dboid, - )?, + _ => relfiles.push(direntry.path()), } } + for relfile in relfiles { + import_relfile( + &mut writer, + &relfile, + pg_constants::DEFAULTTABLESPACE_OID, + dboid, + )?; + } } for entry in fs::read_dir(path.join("pg_xact"))? { let entry = entry?; - import_slru_file(writer, lsn, SlruKind::Clog, &entry.path())?; + import_slru_file(&mut writer, SlruKind::Clog, &entry.path())?; } for entry in fs::read_dir(path.join("pg_multixact").join("members"))? { let entry = entry?; - import_slru_file(writer, lsn, SlruKind::MultiXactMembers, &entry.path())?; + import_slru_file(&mut writer, SlruKind::MultiXactMembers, &entry.path())?; } for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? { let entry = entry?; - import_slru_file(writer, lsn, SlruKind::MultiXactOffsets, &entry.path())?; + import_slru_file(&mut writer, SlruKind::MultiXactOffsets, &entry.path())?; } for entry in fs::read_dir(path.join("pg_twophase"))? { let entry = entry?; let xid = u32::from_str_radix(entry.path().to_str().unwrap(), 16)?; - import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?; + import_twophase_file(&mut writer, xid, &entry.path())?; } // TODO: Scan pg_tblspc // We're done importing all the data files. - writer.advance_last_record_lsn(lsn); + writer.finish()?; // We expect the Postgres server to be shut down cleanly. let pg_control = pg_control.context("pg_control file not found")?; @@ -144,8 +145,7 @@ pub fn import_timeline_from_postgres_datadir( // *after* the checkpoint record. 
And crucially, it initializes the 'prev_lsn'. import_wal( &path.join("pg_wal"), - timeline, - writer, + tline, Lsn(pg_control.checkPointCopy.redo), lsn, )?; @@ -154,10 +154,9 @@ pub fn import_timeline_from_postgres_datadir( } // subroutine of import_timeline_from_postgres_datadir(), to load one relation file. -fn import_relfile( +fn import_relfile( + timeline: &mut DatadirTimelineWriter, path: &Path, - timeline: &dyn TimelineWriter, - lsn: Lsn, spcoid: Oid, dboid: Oid, ) -> Result<()> { @@ -174,19 +173,28 @@ fn import_relfile( let mut file = File::open(path)?; let mut buf: [u8; 8192] = [0u8; 8192]; + let len = file.metadata().unwrap().len(); + ensure!(len % pg_constants::BLCKSZ as u64 == 0); + let nblocks = len / pg_constants::BLCKSZ as u64; + + if segno != 0 { + todo!(); + } + + let rel = RelTag { + spcnode: spcoid, + dbnode: dboid, + relnode, + forknum, + }; + timeline.put_rel_creation(rel, nblocks as u32)?; + let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32); loop { let r = file.read_exact(&mut buf); match r { Ok(_) => { - let rel = RelTag { - spcnode: spcoid, - dbnode: dboid, - relnode, - forknum, - }; - let tag = RelishTag::Relation(rel); - timeline.put_page_image(tag, blknum, lsn, Bytes::copy_from_slice(&buf))?; + timeline.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?; } // TODO: UnexpectedEof is expected @@ -203,20 +211,37 @@ fn import_relfile( }; blknum += 1; } + ensure!(blknum == nblocks as u32); Ok(()) } -/// +/// FIXME /// Import a "non-blocky" file into the repository /// /// This is used for small files like the control file, twophase files etc. that /// are just slurped into the repository as one blob. /// -fn import_nonrel_file( - timeline: &dyn TimelineWriter, - lsn: Lsn, - tag: RelishTag, +fn import_relmap_file( + timeline: &mut DatadirTimelineWriter, + spcnode: Oid, + dbnode: Oid, + path: &Path, +) -> Result<()> { + let mut file = File::open(path)?; + let mut buffer = Vec::new(); + // read the whole file + file.read_to_end(&mut buffer)?; + + trace!("importing relmap file {}", path.display()); + + timeline.put_relmap_file(spcnode, dbnode, Bytes::copy_from_slice(&buffer[..]))?; + Ok(()) +} + +fn import_twophase_file( + timeline: &mut DatadirTimelineWriter, + xid: TransactionId, path: &Path, ) -> Result<()> { let mut file = File::open(path)?; @@ -226,7 +251,7 @@ fn import_nonrel_file( trace!("importing non-rel file {}", path.display()); - timeline.put_page_image(tag, 0, lsn, Bytes::copy_from_slice(&buffer[..]))?; + timeline.put_twophase_file(xid, Bytes::copy_from_slice(&buffer[..]))?; Ok(()) } @@ -235,9 +260,8 @@ fn import_nonrel_file( /// /// The control file is imported as is, but we also extract the checkpoint record /// from it and store it separated. -fn import_control_file( - timeline: &dyn TimelineWriter, - lsn: Lsn, +fn import_control_file( + timeline: &mut DatadirTimelineWriter, path: &Path, ) -> Result { let mut file = File::open(path)?; @@ -248,17 +272,12 @@ fn import_control_file( trace!("importing control file {}", path.display()); // Import it as ControlFile - timeline.put_page_image( - RelishTag::ControlFile, - 0, - lsn, - Bytes::copy_from_slice(&buffer[..]), - )?; + timeline.put_control_file(Bytes::copy_from_slice(&buffer[..]))?; // Extract the checkpoint record and import it separately. 
let pg_control = ControlFileData::decode(&buffer)?; let checkpoint_bytes = pg_control.checkPointCopy.encode(); - timeline.put_page_image(RelishTag::Checkpoint, 0, lsn, checkpoint_bytes)?; + timeline.put_checkpoint(checkpoint_bytes)?; Ok(pg_control) } @@ -266,30 +285,31 @@ fn import_control_file( /// /// Import an SLRU segment file /// -fn import_slru_file( - timeline: &dyn TimelineWriter, - lsn: Lsn, +fn import_slru_file( + timeline: &mut DatadirTimelineWriter, slru: SlruKind, path: &Path, ) -> Result<()> { - // Does it look like an SLRU file? + trace!("importing slru file {}", path.display()); + let mut file = File::open(path)?; let mut buf: [u8; 8192] = [0u8; 8192]; let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?; - trace!("importing slru file {}", path.display()); + let len = file.metadata().unwrap().len(); + ensure!(len % pg_constants::BLCKSZ as u64 == 0); // we assume SLRU block size is the same as BLCKSZ + let nblocks = len / pg_constants::BLCKSZ as u64; + + ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as u64); + + timeline.put_slru_segment_creation(slru, segno, nblocks as u32)?; let mut rpageno = 0; loop { let r = file.read_exact(&mut buf); match r { Ok(_) => { - timeline.put_page_image( - RelishTag::Slru { slru, segno }, - rpageno, - lsn, - Bytes::copy_from_slice(&buf), - )?; + timeline.put_slru_page_image(slru, segno, rpageno, Bytes::copy_from_slice(&buf))?; } // TODO: UnexpectedEof is expected @@ -305,19 +325,17 @@ fn import_slru_file( }, }; rpageno += 1; - - // TODO: Check that the file isn't unexpectedly large, not larger than SLRU_PAGES_PER_SEGMENT pages } + ensure!(rpageno == nblocks as u32); Ok(()) } /// Scan PostgreSQL WAL files in given directory and load all records between /// 'startpoint' and 'endpoint' into the repository. -fn import_wal( +fn import_wal( walpath: &Path, - timeline: &T, - writer: &dyn TimelineWriter, + tline: &DatadirTimeline, startpoint: Lsn, endpoint: Lsn, ) -> Result<()> { @@ -327,7 +345,7 @@ fn import_wal( let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE); let mut last_lsn = startpoint; - let mut walingest = WalIngest::new(timeline, startpoint)?; + let mut walingest = WalIngest::new(tline, startpoint)?; while last_lsn <= endpoint { // FIXME: assume postgresql tli 1 for now @@ -360,7 +378,7 @@ fn import_wal( let mut nrecords = 0; while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ - walingest.ingest_record(timeline, writer, recdata, lsn)?; + walingest.ingest_record(tline, recdata, lsn)?; last_lsn = lsn; nrecords += 1; diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 909477b722..e344a79373 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -14,32 +14,31 @@ use anyhow::{bail, ensure, Context, Result}; use bookfile::Book; use bytes::Bytes; +use fail::fail_point; +use itertools::Itertools; use lazy_static::lazy_static; -use postgres_ffi::pg_constants::BLCKSZ; use tracing::*; -use std::cmp; +use std::cmp::{min, max, Ordering}; use std::collections::hash_map::Entry; -use std::collections::HashMap; -use std::collections::{BTreeSet, HashSet}; +use std::collections::BTreeSet; +use std::collections::{HashMap, HashSet}; use std::fs; use std::fs::{File, OpenOptions}; use std::io::Write; -use std::ops::{Bound::Included, Deref}; +use std::ops::{Bound::Included, Deref, Range}; use std::path::{Path, PathBuf}; -use std::sync::atomic::{self, AtomicBool, AtomicUsize}; +use std::sync::atomic::{self, AtomicBool}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard}; use std::time::Instant; use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}; use crate::config::PageServerConf; -use crate::page_cache; -use crate::relish::*; use crate::remote_storage::{schedule_timeline_checkpoint_upload, schedule_timeline_download}; use crate::repository::{ - BlockNumber, GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncState, - TimelineWriter, ZenithWalRecord, + GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncState, TimelineWriter, }; +use crate::repository::{Key, Value}; use crate::thread_mgr; use crate::virtual_file::VirtualFile; use crate::walreceiver::IS_WAL_RECEIVER; @@ -47,9 +46,6 @@ use crate::walredo::WalRedoManager; use crate::CheckpointConfig; use crate::{ZTenantId, ZTimelineId}; -use zenith_metrics::{ - register_histogram, register_int_gauge_vec, Histogram, IntGauge, IntGaugeVec, -}; use zenith_metrics::{register_histogram_vec, HistogramVec}; use zenith_utils::crashsafe_dir; use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn}; @@ -58,30 +54,25 @@ use zenith_utils::seqwait::SeqWait; mod delta_layer; mod ephemeral_file; mod filename; -mod global_layer_map; mod image_layer; mod inmemory_layer; -mod interval_tree; mod layer_map; pub mod metadata; mod par_fsync; mod storage_layer; +mod utils; -use delta_layer::DeltaLayer; +use delta_layer::{DeltaLayer, DeltaLayerWriter}; use ephemeral_file::is_ephemeral_file; use filename::{DeltaFileName, ImageFileName}; -use image_layer::ImageLayer; +use image_layer::{ImageLayer, ImageLayerWriter}; use inmemory_layer::InMemoryLayer; use layer_map::LayerMap; -use storage_layer::{ - Layer, PageReconstructData, PageReconstructResult, SegmentBlk, SegmentTag, RELISH_SEG_SIZE, -}; +use storage_layer::{Layer, ValueReconstructResult, ValueReconstructState, TARGET_FILE_SIZE,TARGET_FILE_SIZE_BYTES}; // re-export this function so that page_cache.rs can use it. pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file; -static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); - // Metrics collected on operations on the storage repository. lazy_static! { static ref STORAGE_TIME: HistogramVec = register_histogram_vec!( @@ -92,26 +83,6 @@ lazy_static! { .expect("failed to define a metric"); } -// Metrics collected on operations on the storage repository. -lazy_static! 
{ - static ref RECONSTRUCT_TIME: Histogram = register_histogram!( - "pageserver_getpage_reconstruct_time", - "FIXME Time spent on storage operations" - ) - .expect("failed to define a metric"); -} - -lazy_static! { - // NOTE: can be zero if pageserver was restarted and there hasn't been any - // activity yet. - static ref LOGICAL_TIMELINE_SIZE: IntGaugeVec = register_int_gauge_vec!( - "pageserver_logical_timeline_size", - "Logical timeline size (bytes)", - &["tenant_id", "timeline_id"] - ) - .expect("failed to define a metric"); -} - /// Parts of the `.zenith/tenants//timelines/` directory prefix. pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; @@ -174,9 +145,9 @@ impl Repository for LayeredRepository { timelineid, self.tenantid, Arc::clone(&self.walredo_mgr), - 0, self.upload_relishes, ); + timeline.layers.lock().unwrap().next_open_layer_at = Some(initdb_lsn); let timeline_rc = Arc::new(timeline); let r = timelines.insert(timelineid, LayeredTimelineEntry::Local(timeline_rc.clone())); @@ -402,9 +373,9 @@ impl LayeredTimelineEntry { } /// Gets local timeline data, if it's present. Otherwise schedules a download fot the remote timeline and returns `None`. - fn local_or_schedule_download(&self, tenant_id: ZTenantId) -> Option<&LayeredTimeline> { + fn local_or_schedule_download(&self, tenant_id: ZTenantId) -> Option> { match self { - Self::Local(local) => Some(local.as_ref()), + Self::Local(local) => Some(Arc::clone(local)), Self::Remote { id: timeline_id, .. } => { @@ -471,20 +442,18 @@ impl LayeredRepository { let _enter = info_span!("loading timeline", timeline = %timelineid, tenant = %self.tenantid) .entered(); - let mut timeline = LayeredTimeline::new( + let timeline = LayeredTimeline::new( self.conf, metadata, ancestor, timelineid, self.tenantid, Arc::clone(&self.walredo_mgr), - 0, // init with 0 and update after layers are loaded, self.upload_relishes, ); timeline .load_layer_map(disk_consistent_lsn) .context("failed to load layermap")?; - timeline.init_current_logical_size()?; Ok(Arc::new(timeline)) } @@ -693,7 +662,8 @@ impl LayeredRepository { timeline.checkpoint(CheckpointConfig::Forced)?; info!("timeline {} checkpoint_before_gc done", timelineid); } - let result = timeline.gc_timeline(branchpoints, cutoff)?; + timeline.update_gc_info(branchpoints, cutoff); + let result = timeline.gc()?; totals += result; timelines = self.timelines.lock().unwrap(); @@ -745,25 +715,6 @@ pub struct LayeredTimeline { ancestor_timeline: Option, ancestor_lsn: Lsn, - // this variable indicates how much space is used from user's point of view, - // e.g. we do not account here for multiple versions of data and so on. - // this is counted incrementally based on physical relishes (excluding FileNodeMap) - // current_logical_size is not stored no disk and initialized on timeline creation using - // get_current_logical_size_non_incremental in init_current_logical_size - // this is needed because when we save it in metadata it can become out of sync - // because current_logical_size is consistent on last_record_lsn, not ondisk_consistent_lsn - // NOTE: current_logical_size also includes size of the ancestor - current_logical_size: AtomicUsize, // bytes - - // To avoid calling .with_label_values and formatting the tenant and timeline IDs to strings - // every time the logical size is updated, keep a direct reference to the Gauge here. 
- // unfortunately it doesnt forward atomic methods like .fetch_add - // so use two fields: actual size and metric - // see https://github.com/zenithdb/zenith/issues/622 for discussion - // TODO: it is possible to combine these two fields into single one using custom metric which uses SeqCst - // ordering for its operations, but involves private modules, and macro trickery - current_logical_size_gauge: IntGauge, - /// If `true`, will backup its files that appear after each checkpointing to the remote storage. upload_relishes: AtomicBool, @@ -783,6 +734,10 @@ pub struct LayeredTimeline { // Needed to ensure that we can't create a branch at a point that was already garbage collected latest_gc_cutoff_lsn: RwLock, + // List of child timelines and their branch points. This is needed to avoid + // garbage collecting data that is still needed by the child timelines. + gc_info: RwLock, + // It may change across major versions so for simplicity // keep it after running initdb for a timeline. // It is needed in checks when we want to error on some operations @@ -792,6 +747,11 @@ pub struct LayeredTimeline { initdb_lsn: Lsn, } +struct GcInfo { + retain_lsns: Vec, + cutoff: Lsn, +} + /// Public interface functions impl Timeline for LayeredTimeline { fn get_ancestor_lsn(&self) -> Lsn { @@ -829,163 +789,21 @@ impl Timeline for LayeredTimeline { self.latest_gc_cutoff_lsn.read().unwrap() } - /// Look up given page version. - fn get_page_at_lsn(&self, rel: RelishTag, rel_blknum: BlockNumber, lsn: Lsn) -> Result { - if !rel.is_blocky() && rel_blknum != 0 { - bail!( - "invalid request for block {} for non-blocky relish {}", - rel_blknum, - rel - ); - } - debug_assert!(lsn <= self.get_last_record_lsn()); - let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); - - if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { - RECONSTRUCT_TIME - .observe_closure_duration(|| self.materialize_page(seg, seg_blknum, lsn, &*layer)) - } else { - // FIXME: This can happen if PostgreSQL extends a relation but never writes - // the page. See https://github.com/zenithdb/zenith/issues/841 - // - // Would be nice to detect that situation better. - if seg.segno > 0 && self.get_rel_exists(rel, lsn)? { - warn!("Page {} blk {} at {} not found", rel, rel_blknum, lsn); - return Ok(ZERO_PAGE.clone()); - } - - bail!("segment {} not found at {}", rel, lsn); - } - } - - fn get_relish_size(&self, rel: RelishTag, lsn: Lsn) -> Result> { - if !rel.is_blocky() { - bail!( - "invalid get_relish_size request for non-blocky relish {}", - rel - ); - } + /// Look up the value with the given a key + fn get(&self, key: Key, lsn: Lsn) -> Result { debug_assert!(lsn <= self.get_last_record_lsn()); - let mut segno = 0; - loop { - let seg = SegmentTag { rel, segno }; - - let segsize; - if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { - segsize = layer.get_seg_size(lsn)?; - trace!("get_seg_size: {} at {} -> {}", seg, lsn, segsize); - } else { - if segno == 0 { - return Ok(None); - } - segsize = 0; - } - - if segsize != RELISH_SEG_SIZE { - let result = segno * RELISH_SEG_SIZE + segsize; - return Ok(Some(result)); - } - segno += 1; - } - } - - fn get_rel_exists(&self, rel: RelishTag, lsn: Lsn) -> Result { - debug_assert!(lsn <= self.get_last_record_lsn()); - - let seg = SegmentTag { rel, segno: 0 }; - - let result = if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { - layer.get_seg_exists(lsn)? 
- } else { - false + let mut reconstruct_state = ValueReconstructState { + key, + lsn, + records: Vec::new(), + img: None, // FIXME: check page cache and put the img here + request_lsn: lsn, }; - trace!("get_rel_exists: {} at {} -> {}", rel, lsn, result); - Ok(result) - } + self.get_reconstruct_data(&mut reconstruct_state)?; - fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result> { - let request_tag = RelTag { - spcnode, - dbnode, - relnode: 0, - forknum: 0, - }; - - self.list_relishes(Some(request_tag), lsn) - } - - fn list_nonrels(&self, lsn: Lsn) -> Result> { - info!("list_nonrels called at {}", lsn); - - self.list_relishes(None, lsn) - } - - fn list_relishes(&self, tag: Option, lsn: Lsn) -> Result> { - trace!("list_relishes called at {}", lsn); - debug_assert!(lsn <= self.get_last_record_lsn()); - - // List of all relishes along with a flag that marks if they exist at the given lsn. - let mut all_relishes_map: HashMap = HashMap::new(); - let mut result = HashSet::new(); - let mut timeline = self; - - // Iterate through layers back in time and find the most - // recent state of the relish. Don't add relish to the list - // if newer version is already there. - // - // This most recent version can represent dropped or existing relish. - // We will filter dropped relishes below. - // - loop { - let rels = timeline.layers.lock().unwrap().list_relishes(tag, lsn)?; - - for (&new_relish, &new_relish_exists) in rels.iter() { - match all_relishes_map.entry(new_relish) { - Entry::Occupied(o) => { - trace!( - "Newer version of the object {} is already found: exists {}", - new_relish, - o.get(), - ); - } - Entry::Vacant(v) => { - v.insert(new_relish_exists); - trace!( - "Newer version of the object {} NOT found. Insert NEW: exists {}", - new_relish, - new_relish_exists - ); - } - } - } - - match &timeline.ancestor_timeline { - None => break, - Some(ancestor_entry) => { - match ancestor_entry.local_or_schedule_download(self.tenantid) { - Some(ancestor) => { - timeline = ancestor; - continue; - } - None => bail!("Cannot list relishes for timeline {} tenant {} due to its ancestor being remote only", self.timelineid, self.tenantid), - } - } - } - } - - // Filter out dropped relishes - for (&new_relish, &new_relish_exists) in all_relishes_map.iter() { - if new_relish_exists { - result.insert(new_relish); - trace!("List object {}", new_relish); - } else { - trace!("Filtered out dropped object {}", new_relish); - } - } - - Ok(result) + self.reconstruct_value(key, lsn, reconstruct_state) } /// Public entry point for checkpoint(). All the logic is in the private @@ -1005,6 +823,11 @@ impl Timeline for LayeredTimeline { } } + // Entry point for forced image creation. Only used by tests at the moment. + fn create_images(&self, threshold: usize) -> Result<()> { + self.create_image_layers(threshold) + } + /// /// Validate lsn against initdb_lsn and latest_gc_cutoff_lsn. /// @@ -1034,37 +857,6 @@ impl Timeline for LayeredTimeline { self.last_record_lsn.load() } - fn get_current_logical_size(&self) -> usize { - self.current_logical_size.load(atomic::Ordering::Acquire) as usize - } - - fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { - let mut total_blocks: usize = 0; - - let _enter = info_span!("calc logical size", %lsn).entered(); - - // list of all relations in this timeline, including ancestor timelines - let all_rels = self.list_rels(0, 0, lsn)?; - - for rel in all_rels { - if let Some(size) = self.get_relish_size(rel, lsn)? 
{ - total_blocks += size as usize; - } - } - - let non_rels = self.list_nonrels(lsn)?; - for non_rel in non_rels { - // TODO support TwoPhase - if matches!(non_rel, RelishTag::Slru { slru: _, segno: _ }) { - if let Some(size) = self.get_relish_size(non_rel, lsn)? { - total_blocks += size as usize; - } - } - } - - Ok(total_blocks * BLCKSZ as usize) - } - fn get_disk_consistent_lsn(&self) -> Lsn { self.disk_consistent_lsn.load() } @@ -1089,12 +881,8 @@ impl LayeredTimeline { timelineid: ZTimelineId, tenantid: ZTenantId, walredo_mgr: Arc, - current_logical_size: usize, upload_relishes: bool, ) -> LayeredTimeline { - let current_logical_size_gauge = LOGICAL_TIMELINE_SIZE - .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) - .unwrap(); LayeredTimeline { conf, timelineid, @@ -1112,13 +900,16 @@ impl LayeredTimeline { ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), - current_logical_size: AtomicUsize::new(current_logical_size), - current_logical_size_gauge, upload_relishes: AtomicBool::new(upload_relishes), write_lock: Mutex::new(()), checkpoint_cs: Mutex::new(()), + gc_info: RwLock::new(GcInfo { + retain_lsns: Vec::new(), + cutoff: Lsn(0), + }), + latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), } @@ -1161,13 +952,12 @@ impl LayeredTimeline { num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(fname) { // Create a DeltaLayer struct for each delta file. - ensure!(deltafilename.start_lsn < deltafilename.end_lsn); // The end-LSN is exclusive, while disk_consistent_lsn is // inclusive. For example, if disk_consistent_lsn is 100, it is // OK for a delta layer to have end LSN 101, but if the end LSN // is 102, then it might not have been fully flushed to disk // before crash. - if deltafilename.end_lsn > disk_consistent_lsn + 1 { + if deltafilename.lsn_range.end > disk_consistent_lsn + 1 { warn!( "found future delta layer {} on timeline {} disk_consistent_lsn is {}", deltafilename, self.timelineid, disk_consistent_lsn @@ -1194,41 +984,14 @@ impl LayeredTimeline { } } - info!("loaded layer map with {} layers", num_layers); + layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1); - Ok(()) - } - - /// - /// Used to init current logical size on startup - /// - fn init_current_logical_size(&mut self) -> Result<()> { - if self.current_logical_size.load(atomic::Ordering::Relaxed) != 0 { - bail!("cannot init already initialized current logical size") - }; - let lsn = self.get_last_record_lsn(); - self.current_logical_size = - AtomicUsize::new(self.get_current_logical_size_non_incremental(lsn)?); - trace!( - "current_logical_size initialized to {}", - self.current_logical_size.load(atomic::Ordering::Relaxed) + info!( + "loaded layer map with {} layers at {}", + num_layers, disk_consistent_lsn ); - Ok(()) - } - /// - /// Get a handle to a Layer for reading. - /// - /// The returned Layer might be from an ancestor timeline, if the - /// segment hasn't been updated on this timeline yet. - /// - fn get_layer_for_read( - &self, - seg: SegmentTag, - lsn: Lsn, - ) -> Result, Lsn)>> { - let self_layers = self.layers.lock().unwrap(); - self.get_layer_for_read_locked(seg, lsn, &self_layers) + Ok(()) } /// @@ -1239,97 +1002,115 @@ impl LayeredTimeline { /// /// This function takes the current timeline's locked LayerMap as an argument, /// so callers can avoid potential race conditions. 
-    fn get_layer_for_read_locked(
-        &self,
-        seg: SegmentTag,
-        lsn: Lsn,
-        self_layers: &MutexGuard<LayerMap>,
-    ) -> Result<Option<(Arc<dyn Layer>, Lsn)>> {
-        trace!("get_layer_for_read called for {} at {}", seg, lsn);
-
-        // If you requested a page at an older LSN, before the branch point, dig into
-        // the right ancestor timeline. This can only happen if you launch a read-only
-        // node with an old LSN, a primary always uses a recent LSN in its requests.
+    fn get_reconstruct_data(&self, reconstruct_state: &mut ValueReconstructState) -> Result<()> {
+        // Start from the current timeline.
+        let mut timeline_owned;
         let mut timeline = self;
-        let mut lsn = lsn;
-        while lsn < timeline.ancestor_lsn {
-            trace!("going into ancestor {} ", timeline.ancestor_lsn);
-            timeline = match timeline
-                .ancestor_timeline
-                .as_ref()
-                .and_then(|ancestor_entry| ancestor_entry.local_or_schedule_download(self.tenantid))
-            {
-                Some(timeline) => timeline,
-                None => {
+        // 'prev_lsn' tracks the last LSN that we were at in our search. It's used
+        // to check that each iteration makes some progress, to avoid infinite
+        // looping if something goes wrong.
+        let mut prev_lsn = Lsn(u64::MAX);
+
+        let mut result = ValueReconstructResult::Continue;
+
+        loop {
+            // The calls below should have updated 'reconstruct_state'
+            //info!("CALLED for {} at {}: {:?} with {} records", reconstruct_state.key, reconstruct_state.lsn, result, reconstruct_state.records.len());
+            match result {
+                ValueReconstructResult::Complete => return Ok(()),
+                ValueReconstructResult::Continue => {
+                    if prev_lsn <= reconstruct_state.lsn {
+                        // Didn't make any progress in last iteration. Error out to avoid
+                        // getting stuck in the loop.
+                        bail!("could not find layer with more data for key {} at LSN {}, request LSN {}",
+                              reconstruct_state.key,
+                              reconstruct_state.lsn,
+                              reconstruct_state.request_lsn)
+                    }
+                    prev_lsn = reconstruct_state.lsn;
+                }
+                ValueReconstructResult::Missing => {
                     bail!(
-                        "Cannot get the whole layer for read locked: timeline {} is not present locally",
-                        self.timelineid
+                        "could not find data for key {} at LSN {}, for request at LSN {}",
+                        reconstruct_state.key,
+                        reconstruct_state.lsn,
+                        reconstruct_state.request_lsn
                     )
                 }
-            };
-        }
+            }
 
-        // Now we have the right starting timeline for our search.
- loop { - let layers_owned: MutexGuard; - let layers = if self as *const LayeredTimeline != timeline as *const LayeredTimeline { - layers_owned = timeline.layers.lock().unwrap(); - &layers_owned + // Recurse into ancestor if needed + if reconstruct_state.lsn <= timeline.ancestor_lsn { + //info!("going into ancestor {}", timeline.ancestor_lsn); + let ancestor = timeline.get_ancestor_timeline()?; + timeline_owned = ancestor; + timeline = &*timeline_owned; + prev_lsn = Lsn(u64::MAX); + continue; + } + + let layers = timeline.layers.lock().unwrap(); + + // Check the open and frozen in-memory layers first + if let Some(open_layer) = &layers.open_layer { + let start_lsn = open_layer.get_lsn_range().start; + if reconstruct_state.lsn >= start_lsn { + //info!("CHECKING for {} at {} on open layer {}", reconstruct_state.key, reconstruct_state.lsn, open_layer.filename().display()); + result = open_layer.get_value_reconstruct_data(open_layer.get_lsn_range().start, reconstruct_state)?; + continue; + } + } + if let Some(frozen_layer) = &layers.frozen_layer { + let start_lsn = frozen_layer.get_lsn_range().start; + if reconstruct_state.lsn >= start_lsn { + //info!("CHECKING for {} at {} on frozen layer {}", reconstruct_state.key, reconstruct_state.lsn, frozen_layer.filename().display()); + result = frozen_layer.get_value_reconstruct_data(frozen_layer.get_lsn_range().start, reconstruct_state)?; + continue; + } + } + + if let Some(search_result) = layers + .search(reconstruct_state.key, reconstruct_state.lsn)? + { + //info!("CHECKING for {} at {} on historic layer {}", reconstruct_state.key, reconstruct_state.lsn, layer.filename().display()); + + result = search_result + .layer + .get_value_reconstruct_data(search_result.lsn_floor, reconstruct_state)?; + } else if self.ancestor_timeline.is_some() { + // Nothing on this timeline. Traverse to parent + result = ValueReconstructResult::Continue; + reconstruct_state.lsn = self.ancestor_lsn; } else { - self_layers - }; - - // - // FIXME: If the relation has been dropped, does this return the right - // thing? The compute node should not normally request dropped relations, - // but if OID wraparound happens the same relfilenode might get reused - // for an unrelated relation. - // - - // Do we have a layer on this timeline? 
- if let Some(layer) = layers.get(&seg, lsn) { - trace!( - "found layer in cache: {} {}-{}", - timeline.timelineid, - layer.get_start_lsn(), - layer.get_end_lsn() - ); - - assert!(layer.get_start_lsn() <= lsn); - - if layer.is_dropped() && layer.get_end_lsn() <= lsn { - return Ok(None); - } - - return Ok(Some((layer.clone(), lsn))); - } - - // If not, check if there's a layer on the ancestor timeline - match &timeline.ancestor_timeline { - Some(ancestor_entry) => { - match ancestor_entry.local_or_schedule_download(self.tenantid) { - Some(ancestor) => { - lsn = timeline.ancestor_lsn; - timeline = ancestor; - trace!("recursing into ancestor at {}/{}", timeline.timelineid, lsn); - continue; - } - None => bail!( - "Cannot get a layer for read from remote ancestor timeline {}", - self.timelineid - ), - } - } - None => return Ok(None), + // Nothing found + result = ValueReconstructResult::Missing; } } } + fn get_ancestor_timeline(&self) -> Result> { + let ancestor_entry = self + .ancestor_timeline + .as_ref() + .expect("get_ancestor_timeline() called on timeline with no parent"); + + let timeline = match ancestor_entry.local_or_schedule_download(self.tenantid) { + Some(timeline) => timeline, + None => { + bail!( + "Cannot get the whole layer for read locked: ancestor of timeline {} is not present locally", + self.timelineid + ) + } + }; + Ok(timeline) + } + /// /// Get a handle to the latest layer for appending. /// - fn get_layer_for_write(&self, seg: SegmentTag, lsn: Lsn) -> Result> { + fn get_layer_for_write(&self, lsn: Lsn) -> Result> { let mut layers = self.layers.lock().unwrap(); assert!(lsn.is_aligned()); @@ -1344,96 +1125,47 @@ impl LayeredTimeline { // Do we have a layer open for writing already? let layer; - if let Some(open_layer) = layers.get_open(&seg) { - if open_layer.get_start_lsn() > lsn { + if let Some(open_layer) = &layers.open_layer { + if open_layer.get_lsn_range().start > lsn { bail!("unexpected open layer in the future"); } - // Open layer exists, but it is dropped, so create a new one. - if open_layer.is_dropped() { - assert!(!open_layer.is_writeable()); - // Layer that is created after dropped one represents a new relish segment. - trace!( - "creating layer for write for new relish segment after dropped layer {} at {}/{}", - seg, - self.timelineid, - lsn - ); - - layer = InMemoryLayer::create( - self.conf, - self.timelineid, - self.tenantid, - seg, - lsn, - last_record_lsn, - )?; - } else { - return Ok(open_layer); - } - } - // No writeable layer for this relation. Create one. - // - // Is this a completely new relation? Or the first modification after branching? - // - else if let Some((prev_layer, _prev_lsn)) = - self.get_layer_for_read_locked(seg, lsn, &layers)? - { - // Create new entry after the previous one. - let start_lsn; - if prev_layer.get_timeline_id() != self.timelineid { - // First modification on this timeline - start_lsn = self.ancestor_lsn + 1; - trace!( - "creating layer for write for {} at branch point {}", - seg, - start_lsn - ); - } else { - start_lsn = prev_layer.get_end_lsn(); - trace!( - "creating layer for write for {} after previous layer {}", - seg, - start_lsn - ); - } - trace!( - "prev layer is at {}/{} - {}", - prev_layer.get_timeline_id(), - prev_layer.get_start_lsn(), - prev_layer.get_end_lsn() - ); - layer = InMemoryLayer::create_successor_layer( - self.conf, - prev_layer, - self.timelineid, - self.tenantid, - start_lsn, - last_record_lsn, - )?; + layer = Arc::clone(open_layer); } else { - // New relation. + // No writeable layer yet. 
Create one. + let start_lsn = layers.next_open_layer_at.unwrap(); + trace!( - "creating layer for write for new rel {} at {}/{}", - seg, + "creating layer for write at {}/{} for record at {}", self.timelineid, + start_lsn, lsn ); + let new_layer = + InMemoryLayer::create(self.conf, self.timelineid, self.tenantid, start_lsn, lsn)?; + let layer_rc = Arc::new(new_layer); - layer = InMemoryLayer::create( - self.conf, - self.timelineid, - self.tenantid, - seg, - lsn, - last_record_lsn, - )?; + layers.open_layer = Some(Arc::clone(&layer_rc)); + layers.next_open_layer_at = None; + + layer = layer_rc; } + Ok(layer) + } - let layer_rc: Arc = Arc::new(layer); - layers.insert_open(Arc::clone(&layer_rc)); + fn put_value(&self, key: Key, lsn: Lsn, val: Value) -> Result<()> { + //info!("PUT: key {} at {}", key, lsn); + let layer = self.get_layer_for_write(lsn)?; + layer.put_value(key, lsn, val)?; - Ok(layer_rc) + Ok(()) + } + + fn put_tombstone(&self, key_range: Range, lsn: Lsn) -> Result<()> { + let layer = self.get_layer_for_write(lsn)?; + layer.put_tombstone(key_range, lsn)?; + + Ok(()) } /// @@ -1441,126 +1173,108 @@ impl LayeredTimeline { /// /// NOTE: This has nothing to do with checkpoint in PostgreSQL. fn checkpoint_internal(&self, checkpoint_distance: u64, reconstruct_pages: bool) -> Result<()> { + info!("checkpoint starting"); // Prevent concurrent checkpoints let _checkpoint_cs = self.checkpoint_cs.lock().unwrap(); - let write_guard = self.write_lock.lock().unwrap(); - let mut layers = self.layers.lock().unwrap(); - - // Bump the generation number in the layer map, so that we can distinguish - // entries inserted after the checkpoint started - let current_generation = layers.increment_generation(); - - let RecordLsn { - last: last_record_lsn, - prev: prev_record_lsn, - } = self.last_record_lsn.load(); - - trace!("checkpoint starting at {}", last_record_lsn); - - // Take the in-memory layer with the oldest WAL record. If it's older - // than the threshold, write it out to disk as a new image and delta file. - // Repeat until all remaining in-memory layers are within the threshold. + // If the in-memory layer is larger than 'checkpoint_distance', write it + // to a delta file. That's necessary to limit the amount of WAL that + // needs to be kept in the safekeepers, and that needs to be reprocessed + // on page server crash. // - // That's necessary to limit the amount of WAL that needs to be kept - // in the safekeepers, and that needs to be reprocessed on page server - // crash. TODO: It's not a great policy for keeping memory usage in - // check, though. We should also aim at flushing layers that consume - // a lot of memory and/or aren't receiving much updates anymore. - let mut disk_consistent_lsn = last_record_lsn; + // TODO: It's not a great policy for keeping memory usage in check, + // though. We should also aim at flushing layers that consume a lot of + // memory and/or aren't receiving much updates anymore. + loop { + // Do we have a frozen in-memory layer that we need to write out? + // If we do, write it out now. Otherwise, check if the current + // in-memory layer is old enough that we should freeze and write it out. 
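            // For intuition, the freeze decision below with sample numbers
            // (an illustration only; it assumes Lsn::widening_sub returns a
            // signed i128, as in zenith_utils::lsn, so the distance can be
            // negative if last_record_lsn briefly lags the oldest record):
            let example_last = Lsn(0x5000);
            let example_oldest = Lsn(0x1000); // oldest record in the open layer
            let example_distance = example_last.widening_sub(example_oldest);
            // 0x4000 exceeds a checkpoint_distance of 0x2000, so the open
            // layer would be frozen at last_record_lsn + 1 and flushed on
            // the next iteration of this loop.
            assert_eq!(example_distance, 0x4000);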
+ let write_guard = self.write_lock.lock().unwrap(); + let mut layers = self.layers.lock().unwrap(); + if let Some(frozen_layer) = &layers.frozen_layer { + // Write out the frozen in-memory layer to disk, as a delta file + let frozen_layer = Arc::clone(frozen_layer); + drop(write_guard); + drop(layers); + self.flush_frozen_layer(frozen_layer)?; + } else { + // Freeze the current open in-memory layer, if it's larger than + // 'checkpoint_distance'. It will be written to disk on next + // iteration. + if let Some(open_layer) = &layers.open_layer { + // Does this layer need freezing? + let RecordLsn { + last: last_record_lsn, + prev: _prev_record_lsn, + } = self.last_record_lsn.load(); + let oldest_lsn = open_layer.get_oldest_lsn(); + let distance = last_record_lsn.widening_sub(oldest_lsn); + if distance < 0 || distance < checkpoint_distance.into() { + info!( + "the oldest layer is now {} which is {} bytes behind last_record_lsn", + open_layer.filename().display(), + distance + ); + break; + } + let end_lsn = Lsn(self.get_last_record_lsn().0 + 1); + open_layer.freeze(end_lsn); - let mut layer_paths = Vec::new(); - let mut freeze_end_lsn = Lsn(0); - let mut evicted_layers = Vec::new(); - - // - // Determine which layers we need to evict and calculate max(latest_lsn) - // among those layers. - // - while let Some((oldest_layer_id, oldest_layer, oldest_generation)) = - layers.peek_oldest_open() - { - let oldest_lsn = oldest_layer.get_oldest_lsn(); - // Does this layer need freezing? - // - // Write out all in-memory layers that contain WAL older than CHECKPOINT_DISTANCE. - // If we reach a layer with the same - // generation number, we know that we have cycled through all layers that were open - // when we started. We don't want to process layers inserted after we started, to - // avoid getting into an infinite loop trying to process again entries that we - // inserted ourselves. - // - // Once we have decided to write out at least one layer, we must also write out - // any other layers that contain WAL older than the end LSN of the layers we have - // already decided to write out. In other words, we must write out all layers - // whose [oldest_lsn, latest_lsn) range overlaps with any of the other layers - // that we are writing out. Otherwise, when we advance 'disk_consistent_lsn', it's - // ambiguous whether those layers are already durable on disk or not. For example, - // imagine that there are two layers in memory that contain page versions in the - // following LSN ranges: - // - // A: 100-150 - // B: 110-200 - // - // If we flush layer A, we must also flush layer B, because they overlap. If we - // flushed only A, and advanced 'disk_consistent_lsn' to 150, we would break the - // rule that all WAL older than 'disk_consistent_lsn' are durable on disk, because - // B contains some WAL older than 150. On the other hand, if we flushed out A and - // advanced 'disk_consistent_lsn' only up to 110, after crash and restart we would - // delete the first layer because its end LSN is larger than 110. If we changed - // the deletion logic to not delete it, then we would start streaming at 110, and - // process again the WAL records in the range 110-150 that are already in layer A, - // and the WAL processing code does not cope with that. We solve that dilemma by - // insisting that if we write out the first layer, we also write out the second - // layer, and advance disk_consistent_lsn all the way up to 200. 
-            //
-            let distance = last_record_lsn.widening_sub(oldest_lsn);
-            if (distance < 0
-                || distance < checkpoint_distance.into()
-                || oldest_generation == current_generation)
-                && oldest_lsn >= freeze_end_lsn
-            // this layer intersects with evicted layer and so also need to be evicted
-            {
-                info!(
-                    "the oldest layer is now {} which is {} bytes behind last_record_lsn",
-                    oldest_layer.filename().display(),
-                    distance
-                );
-                disk_consistent_lsn = oldest_lsn;
-                break;
+                    // The layer is no longer open, update the layer map to reflect this.
+                    // We will replace it with on-disk historics below.
+                    layers.frozen_layer = Some(Arc::clone(open_layer));
+                    layers.open_layer = None;
+                    layers.next_open_layer_at = Some(end_lsn);
+                } else {
+                    break;
+                }
+                // We will write the now-frozen layer to disk on next iteration.
+                // That could take a while, so release the lock while we do it.
+                drop(layers);
+                drop(write_guard);
             }
-            let latest_lsn = oldest_layer.get_latest_lsn();
-            if latest_lsn > freeze_end_lsn {
-                freeze_end_lsn = latest_lsn; // calculate max of latest_lsn of the layers we're about to evict
-            }
-            layers.remove_open(oldest_layer_id);
-            evicted_layers.push((oldest_layer_id, oldest_layer));
         }
 
-        // Freeze evicted layers
-        for (_evicted_layer_id, evicted_layer) in evicted_layers.iter() {
-            // Mark the layer as no longer accepting writes and record the end_lsn.
-            // This happens in-place, no new layers are created now.
-            evicted_layer.freeze(freeze_end_lsn);
-            layers.insert_historic(evicted_layer.clone());
+        // Create new image layers to allow GC and to reduce read latency
+        if reconstruct_pages {
+            // TODO: the threshold for how often we create image layers is
+            // currently hard-coded at 3. It means: write out a new image layer
+            // if there are at least three delta layers on top of it.
+            if false {
+                self.create_image_layers(3)?;
+            }
+            self.compact_level0()?;
         }
 
+        // TODO: We should also compact existing delta layers here.
+
         // Call unload() on all frozen layers, to release memory.
         // This shouldn't be much memory, as only metadata is slurped
         // into memory.
+        let layers = self.layers.lock().unwrap();
         for layer in layers.iter_historic_layers() {
             layer.unload()?;
         }
-        drop(layers);
-        drop(write_guard);
 
-        // Create delta/image layers for evicted layers
-        for (_evicted_layer_id, evicted_layer) in evicted_layers.iter() {
-            let mut this_layer_paths =
-                self.evict_layer(evicted_layer.clone(), reconstruct_pages)?;
-            layer_paths.append(&mut this_layer_paths);
-        }
+        Ok(())
+    }
+
+    fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> Result<()> {
+        // Do we have a frozen in-memory layer that we need to write out?
+        let new_delta = frozen_layer.write_to_disk()?;
+
+        // Finally, replace the frozen in-memory layer with the new on-disk layers
+        let write_guard = self.write_lock.lock().unwrap();
+        let mut layers = self.layers.lock().unwrap();
+        layers.frozen_layer = None;
+
+        // Add the new delta layer to the LayerMap
+        let mut layer_paths = vec![new_delta.path()];
+        layers.insert_historic(Arc::new(new_delta));
+
+        drop(write_guard);
+        drop(layers);
 
         // Sync layers
         if !layer_paths.is_empty() {
@@ -1575,6 +1289,10 @@
             layer_paths.pop().unwrap();
         }
 
+        // Compute new 'disk_consistent_lsn'. The layer's end LSN is exclusive,
+        // so the last record actually flushed is one less than it.
+        let disk_consistent_lsn = Lsn(frozen_layer.get_lsn_range().end.0 - 1);
+
         // If we were able to advance 'disk_consistent_lsn', save it in the metadata file.
         // After crash, we will restart WAL streaming and processing from that point.
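        // A worked example of the exclusive-end convention behind the "- 1"
        // above, with hypothetical numbers: a frozen layer covering WAL in
        // [0x100, 0x200) is fully durable once flushed, so everything up to
        // and including 0x1ff is on disk:
        let example_range = Lsn(0x100)..Lsn(0x200);
        assert_eq!(Lsn(example_range.end.0 - 1), Lsn(0x1ff));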
        let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
@@ -1587,6 +1305,10 @@
             // don't remember what the correct value that corresponds to some old
             // LSN is. But if we flush everything, then the value corresponding
             // current 'last_record_lsn' is correct and we can store it on disk.
+            let RecordLsn {
+                last: last_record_lsn,
+                prev: prev_record_lsn,
+            } = self.last_record_lsn.load();
             let ondisk_prev_record_lsn = if disk_consistent_lsn == last_record_lsn {
                 Some(prev_record_lsn)
             } else {
@@ -1607,6 +1329,11 @@
                 self.initdb_lsn,
             );
 
+            fail_point!("checkpoint-before-saving-metadata", |x| bail!(
+                "{}",
+                x.unwrap()
+            ));
+
             LayeredRepository::save_metadata(
                 self.conf,
                 self.timelineid,
@@ -1630,30 +1357,321 @@
         Ok(())
     }
 
-    fn evict_layer(
-        &self,
-        layer: Arc<InMemoryLayer>,
-        reconstruct_pages: bool,
-    ) -> Result<Vec<PathBuf>> {
-        let new_historics = layer.write_to_disk(self, reconstruct_pages)?;
+    fn compact(&self) -> Result<()> {
+        //
+        // High level strategy for compaction / image creation:
+        //
+        // 1. First, calculate the desired "partitioning" of the
+        // currently in-use key space. The goal is to partition the
+        // key space into TARGET_FILE_SIZE chunks, but also take into
+        // account any existing image layers, and try to align the
+        // chunk boundaries with the existing image layers to avoid
+        // too much churn. Also try to align chunk boundaries with
+        // relation boundaries. In principle, we don't know about
+        // relation boundaries here, we just deal with key-value
+        // pairs, and the code in pgdatadir_mapping.rs knows how to
+        // map relations into key-value pairs. But in practice we know
+        // that 'field6' is the block number, and the fields 1-5
+        // identify a relation. This is just an optimization,
+        // though.
+        //
+        // 2. Once we know the partitioning, for each partition,
+        // decide if it's time to create a new image layer. The
+        // criterion is: has there been too much "churn" since the
+        // last image layer? "Churn" is a fuzzy concept: a combination
+        // of too many delta files, or too much WAL in total in the
+        // delta files. Or perhaps: if creating an image file would
+        // allow us to delete some older files.
+        //
+        // 3. After that, we compact all level0 delta files if there
+        // are too many of them. While compacting, we also garbage
+        // collect any page versions that are no longer needed because
+        // of the new image layers we created in step 2.
+        //
+        // TODO: This high-level strategy hasn't been implemented yet.
+        // The functions compact_level0() and create_image_layers() below
+        // are a bit ad hoc and don't quite work as explained above.
+        // Rewrite it.
+        todo!()
+    }
 
-        let mut layer_paths = Vec::new();
-        let _write_guard = self.write_lock.lock().unwrap();
+    fn compact_level0(&self) -> Result<()> {
         let mut layers = self.layers.lock().unwrap();
 
-        // Finally, replace the frozen in-memory layer with the new on-disk layers
-        layers.remove_historic(layer);
+        // We compact or "shuffle" the level-0 delta layers when 10 have
+        // accumulated.
+ static COMPACT_THRESHOLD: usize = 10; - // Add the historics to the LayerMap - for delta_layer in new_historics.delta_layers { - layer_paths.push(delta_layer.path()); - layers.insert_historic(Arc::new(delta_layer)); + let level0_deltas = layers.get_level0_deltas()?; + + if level0_deltas.len() < COMPACT_THRESHOLD { + return Ok(()); } - for image_layer in new_historics.image_layers { - layer_paths.push(image_layer.path()); + + // FIXME: this function probably won't work correctly if there's overlap + // in the deltas. + let lsn_range = level0_deltas.iter().map(|l| l.get_lsn_range()).reduce(|a, b| { + min(a.start, b.start)..max(a.end, b.end) + }).unwrap(); + + let all_values_iter = level0_deltas.iter().map(|l| l.iter()).kmerge_by(|a, b| { + if let Ok((a_key, a_lsn, _)) = a { + if let Ok((b_key, b_lsn, _)) = b { + match a_key.cmp(b_key) { + Ordering::Less => true, + Ordering::Equal => { + a_lsn <= b_lsn + } + Ordering::Greater => false, + } + } else { + false + } + } else { + true + } + }); + + // Merge the contents of all the input delta layers into a new set + // of delta layers. Each output layer is TARGET_FILE_SIZE_BYTES in + // size, i.e. we don't try to align the layer boundaries with the + // image layers or relation boundaries. TODO: we probably should, + // to allow garbage collection to happen earlier. + // + // TODO: we should also opportunistically garbage collect what we can. + let mut new_layers = Vec::new(); + let mut prev_key: Option = None; + let mut writer: Option = None; + for x in all_values_iter { + let (key, lsn, value) = x?; + + if let Some(prev_key) = prev_key { + if key != prev_key && writer.is_some() { + let size = writer.as_mut().unwrap().size(); + info!("size is now {}", size); + if size > TARGET_FILE_SIZE_BYTES as u64 { + new_layers.push(writer.take().unwrap().finish(prev_key.next())?); + writer = None; + } + } + } + + if writer.is_none() { + writer = Some(DeltaLayerWriter::new( + self.conf, + self.timelineid, + self.tenantid, + key, + lsn_range.clone(), + )?); + } + + writer.as_mut().unwrap().put_value(key, lsn, value)?; + prev_key = Some(key); + } + if let Some(writer) = writer { + new_layers.push(writer.finish(prev_key.unwrap().next())?); + } + + for l in new_layers { + layers.insert_historic(Arc::new(l)); + } + + // Now that we have reshuffled the data to set of new delta layers, we can + // delete the old ones + for l in level0_deltas { + l.delete()?; + layers.remove_historic(l.clone()); + } + + Ok(()) + } + + /// + /// Create new image layers, to allow garbage collection to remove old files. + /// + fn create_image_layers(&self, threshold: usize) -> Result<()> { + let layers = self.layers.lock().unwrap(); + let lsn = self.last_record_lsn.load().last; + let image_coverage = layers.image_coverage(&(Key::MIN..Key::MAX), lsn)?; + drop(layers); + + debug!( + "create_image_layers called with threshold {} at {}", + threshold, lsn + ); + + // For any range where there has been more than 'threshold' + // deltas on top of the last image, create new image. + // + // TODO: Invent a better heuristic. + // + // + // TODO: add heuristics to greedily include more segments in the + // image layer, if it's otherwise very small. 
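        // A side note on the kmerge_by call in compact_level0 above: the
        // itertools k-way merge takes a "goes before" predicate, so the
        // closure must return true when its first argument should be emitted
        // first. A minimal sketch on plain (key, lsn) pairs, illustration
        // only, using the itertools::Itertools import from this file:
        let run_a = vec![(1, 10), (2, 10)];
        let run_b = vec![(1, 20), (3, 10)];
        let merged: Vec<(i32, i32)> = vec![run_a, run_b]
            .into_iter()
            .kmerge_by(|a, b| (a.0, a.1) <= (b.0, b.1))
            .collect();
        // One stream, sorted by key first and then by LSN:
        assert_eq!(merged, vec![(1, 10), (1, 20), (2, 10), (3, 10)]);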
+ for (key_range, last_img) in image_coverage { + let img_lsn = if let Some(ref last_img) = last_img { + last_img.get_lsn_range().end + } else { + Lsn(0) + }; + + let layers = self.layers.lock().unwrap(); + let num_deltas = layers.get_deltas(&key_range, &(img_lsn..lsn))?.len(); + drop(layers); + + info!( + "range {}-{} has {} deltas on this timeline", + key_range.start, key_range.end, num_deltas + ); + if num_deltas >= threshold { + self.create_image_layers_for_range(&key_range, last_img, lsn)?; + } + } + + Ok(()) + } + + // Get all distinct Keys present in the given key range. + // + // This is used to figure out which parts of the overall keyspace are in use, to + // divide the keyspace into image layers. + // + // TODO: For a large database, this set could be very large. Use ranges or prefixes + // instead of individual keys. + fn collect_keys( + &self, + key_range: &Range, + img: Option>, + lsn: Lsn, + base_keys: &mut HashSet, + delta_keys: &mut HashSet, + ) -> Result<()> { + info!( + "creating image layer for key range {}-{} at {}", + key_range.start, key_range.end, lsn + ); + + let baseline_lsn = if let Some(img) = img { + // This range is covered by an image layer on this timeline. Iterate over all the keys + img.collect_keys(key_range, base_keys)?; + img.get_lsn_range().end + } else if self.ancestor_timeline.is_some() { + // Need to look at the ancestor for this range. + let ancestor = self.get_ancestor_timeline()?; + + ancestor.collect_keys_recurse(key_range, lsn, base_keys)?; + self.ancestor_lsn + } else { + self.initdb_lsn + }; + + // Ok, we have baseline list of keys from the images now + // Add all keys from all the deltas + let deltas = { + let layers = self.layers.lock().unwrap(); + layers.get_deltas(key_range, &(baseline_lsn..lsn))? + }; + + for delta in deltas { + delta.collect_keys(key_range, delta_keys)?; + } + + Ok(()) + } + + fn collect_keys_recurse( + &self, + key_range: &Range, + lsn: Lsn, + keys: &mut HashSet, + ) -> Result<()> { + let layers = self.layers.lock().unwrap(); + let image_coverage = layers.image_coverage(key_range, lsn)?; + drop(layers); + + for (range, last_img) in image_coverage { + let mut tmp_keys = HashSet::new(); + self.collect_keys(&range, last_img, lsn, keys, &mut tmp_keys)?; + keys.extend(tmp_keys); + } + + Ok(()) + } + + /// Create a new set of image layers for the given key range. + fn create_image_layers_for_range( + &self, + key_range: &Range, + img: Option>, + lsn: Lsn, + ) -> Result<()> { + info!( + "creating image layer for {}-{} at {}", + key_range.start, key_range.end, lsn + ); + + // If this gets called multiple times in a row, it's possible that the + // image layer already exists. + let layers = self.layers.lock().unwrap(); + if layers.image_exists(key_range, lsn) { + info!( + "skipping creation of image layer for {}-{} at {} because it already exists", + key_range.start, key_range.end, lsn + ); + return Ok(()); + } + drop(layers); + + let mut base_keys: HashSet = HashSet::new(); + let mut delta_keys: HashSet = HashSet::new(); + self.collect_keys(key_range, img, lsn, &mut base_keys, &mut delta_keys)?; + + if delta_keys.is_empty() { + // Important special case: even though there was delta layers on top of this + // key range, the delta layers didn't contain any updates within the range. + // In that case, if we wrote a new image, it would have identical contents, + // just stamped at a later LSN. Not much point in that. 
+ return Ok(()); + } + + // Divide the key range into roughly TARGET_FILE_SIZE chunks + let mut all_keys_vec: Vec = + base_keys.iter().chain(delta_keys.iter()).cloned().collect(); + all_keys_vec.sort(); + all_keys_vec.dedup(); + + let mut start_idx = 0; + let mut start_key = key_range.start; + while start_idx < all_keys_vec.len() { + let end_idx = std::cmp::min(start_idx + TARGET_FILE_SIZE as usize, all_keys_vec.len()); + let end_key = if end_idx >= all_keys_vec.len() { + key_range.end + } else { + all_keys_vec[end_idx] + }; + + let img_range = start_key..end_key; + + let mut image_layer_writer = + ImageLayerWriter::new(self.conf, self.timelineid, self.tenantid, &img_range, lsn)?; + + for key in all_keys_vec[start_idx..end_idx].iter() { + let img = self.get(*key, lsn)?; + image_layer_writer.put_image(*key, &img)?; + } + let image_layer = image_layer_writer.finish()?; + + let mut layers = self.layers.lock().unwrap(); layers.insert_historic(Arc::new(image_layer)); + drop(layers); + // FIXME: need to fsync? + + start_idx = end_idx; + start_key = end_key; } - Ok(layer_paths) + + Ok(()) } /// @@ -1678,12 +1696,22 @@ impl LayeredTimeline { /// within a layer file. We can only remove the whole file if it's fully /// obsolete. /// - pub fn gc_timeline(&self, retain_lsns: Vec, cutoff: Lsn) -> Result { + fn update_gc_info(&self, retain_lsns: Vec, cutoff: Lsn) { + let mut gc_info = self.gc_info.write().unwrap(); + gc_info.retain_lsns = retain_lsns; + gc_info.cutoff = cutoff; + } + + fn gc(&self) -> Result { let now = Instant::now(); let mut result: GcResult = Default::default(); let disk_consistent_lsn = self.get_disk_consistent_lsn(); let _checkpoint_cs = self.checkpoint_cs.lock().unwrap(); + let gc_info = self.gc_info.read().unwrap(); + let retain_lsns = &gc_info.retain_lsns; + let cutoff = gc_info.cutoff; + let _enter = info_span!("garbage collection", timeline = %self.timelineid, tenant = %self.tenantid, cutoff = %cutoff).entered(); // We need to ensure that no one branches at a point before latest_gc_cutoff_lsn. @@ -1701,8 +1729,7 @@ impl LayeredTimeline { // Garbage collect the layer if all conditions are satisfied: // 1. it is older than cutoff LSN; // 2. it doesn't need to be retained for 'retain_lsns'; - // 3. newer on-disk layer exists (only for non-dropped segments); - // 4. this layer doesn't serve as a tombstone for some older layer; + // 3. newer on-disk image layers cover the layer's whole key range // let mut layers = self.layers.lock().unwrap(); 'outer: for l in layers.iter_historic_layers() { @@ -1716,28 +1743,16 @@ impl LayeredTimeline { continue; } - let seg = l.get_seg_tag(); - - if seg.rel.is_relation() { - result.ondisk_relfiles_total += 1; - } else { - result.ondisk_nonrelfiles_total += 1; - } + result.layers_total += 1; // 1. Is it newer than cutoff point? - if l.get_end_lsn() > cutoff { + if l.get_lsn_range().end > cutoff { info!( - "keeping {} {}-{} because it's newer than cutoff {}", - seg, - l.get_start_lsn(), - l.get_end_lsn(), + "keeping {} because it's newer than cutoff {}", + l.filename().display(), cutoff ); - if seg.rel.is_relation() { - result.ondisk_relfiles_needed_by_cutoff += 1; - } else { - result.ondisk_nonrelfiles_needed_by_cutoff += 1; - } + result.layers_needed_by_cutoff += 1; continue 'outer; } @@ -1746,132 +1761,39 @@ impl LayeredTimeline { // might be referenced by child branches forever. // We can track this in child timeline GC and delete parent layers when // they are no longer needed. This might be complicated with long inheritance chains. 
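            // A worked example of checks 1 and 2 with hypothetical numbers:
            // with cutoff = 0x200 and a child branch forked at 0x120, a layer
            // covering 0x100..0x150 is old enough to be a GC candidate, but
            // its start LSN precedes the branch point, so it must be kept.
            let example_layer_lsns = Lsn(0x100)..Lsn(0x150);
            let example_cutoff = Lsn(0x200);
            let example_branch_points = vec![Lsn(0x120)];
            assert!(example_layer_lsns.end <= example_cutoff); // check 1: candidate
            assert!(example_branch_points
                .iter()
                .any(|retain| example_layer_lsns.start <= *retain)); // check 2: keep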
-            for retain_lsn in &retain_lsns {
+            for retain_lsn in retain_lsns {
                 // start_lsn is inclusive
-                if &l.get_start_lsn() <= retain_lsn {
+                if &l.get_lsn_range().start <= retain_lsn {
                     info!(
-                        "keeping {} {}-{} because it's still might be referenced by child branch forked at {} is_dropped: {} is_incremental: {}",
-                        seg,
-                        l.get_start_lsn(),
-                        l.get_end_lsn(),
+                        "keeping {} because it still might be referenced by a child branch forked at {} is_dropped: xx is_incremental: {}",
+                        l.filename().display(),
                         retain_lsn,
-                        l.is_dropped(),
+                        //is_dropped, // FIXME
                         l.is_incremental(),
                     );
-                    if seg.rel.is_relation() {
-                        result.ondisk_relfiles_needed_by_branches += 1;
-                    } else {
-                        result.ondisk_nonrelfiles_needed_by_branches += 1;
-                    }
+                    result.layers_needed_by_branches += 1;
                     continue 'outer;
                 }
             }
 
             // 3. Is there a later on-disk layer for this relation?
-            if !l.is_dropped()
-                && !layers.newer_image_layer_exists(
-                    l.get_seg_tag(),
-                    l.get_end_lsn(),
-                    disk_consistent_lsn,
-                )
+            //
+            // The end-LSN is exclusive, while disk_consistent_lsn is
+            // inclusive. For example, if disk_consistent_lsn is 100, it is
+            // OK for a delta layer to have end LSN 101, but if the end LSN
+            // is 102, then it might not have been fully flushed to disk
+            // before crash.
+            if !layers.newer_image_layer_exists(&l.get_key_range(), l.get_lsn_range().end, disk_consistent_lsn + 1)?
             {
-                info!(
-                    "keeping {} {}-{} because it is the latest layer",
-                    seg,
-                    l.get_start_lsn(),
-                    l.get_end_lsn()
-                );
-                if seg.rel.is_relation() {
-                    result.ondisk_relfiles_not_updated += 1;
-                } else {
-                    result.ondisk_nonrelfiles_not_updated += 1;
-                }
+                info!("keeping {} because it is the latest layer", l.filename().display());
+                result.layers_not_updated += 1;
                 continue 'outer;
             }
 
-            // 4. Does this layer serve as a tombstone for some older layer?
-            if l.is_dropped() {
-                let prior_lsn = l.get_start_lsn().checked_sub(1u64).unwrap();
-
-                // Check if this layer serves as a tombstone for this timeline
-                // We have to do this separately from timeline check below,
-                // because LayerMap of this timeline is already locked.
- let mut is_tombstone = layers.layer_exists_at_lsn(l.get_seg_tag(), prior_lsn)?; - if is_tombstone { - info!( - "earlier layer exists at {} in {}", - prior_lsn, self.timelineid - ); - } - // Now check ancestor timelines, if any are present locally - else if let Some(ancestor) = - self.ancestor_timeline.as_ref().and_then(|timeline_entry| { - timeline_entry.local_or_schedule_download(self.tenantid) - }) - { - let prior_lsn = ancestor.get_last_record_lsn(); - if seg.rel.is_blocky() { - info!( - "check blocky relish size {} at {} in {} for layer {}-{}", - seg, - prior_lsn, - ancestor.timelineid, - l.get_start_lsn(), - l.get_end_lsn() - ); - match ancestor.get_relish_size(seg.rel, prior_lsn).unwrap() { - Some(size) => { - let (last_live_seg, _rel_blknum) = - SegmentTag::from_blknum(seg.rel, size - 1); - info!( - "blocky rel size is {} last_live_seg.segno {} seg.segno {}", - size, last_live_seg.segno, seg.segno - ); - if last_live_seg.segno >= seg.segno { - is_tombstone = true; - } - } - _ => { - info!("blocky rel doesn't exist"); - } - } - } else { - info!( - "check non-blocky relish existence {} at {} in {} for layer {}-{}", - seg, - prior_lsn, - ancestor.timelineid, - l.get_start_lsn(), - l.get_end_lsn() - ); - is_tombstone = ancestor.get_rel_exists(seg.rel, prior_lsn).unwrap_or(false); - } - } - - if is_tombstone { - info!( - "keeping {} {}-{} because this layer serves as a tombstone for older layer", - seg, - l.get_start_lsn(), - l.get_end_lsn() - ); - - if seg.rel.is_relation() { - result.ondisk_relfiles_needed_as_tombstone += 1; - } else { - result.ondisk_nonrelfiles_needed_as_tombstone += 1; - } - continue 'outer; - } - } - // We didn't find any reason to keep this file, so remove it. info!( - "garbage collecting {} {}-{} is_dropped: {} is_incremental: {}", - l.get_seg_tag(), - l.get_start_lsn(), - l.get_end_lsn(), - l.is_dropped(), + "garbage collecting {} is_dropped: xx is_incremental: {}", + l.filename().display(), l.is_incremental(), ); layers_to_remove.push(Arc::clone(&l)); @@ -1884,270 +1806,97 @@ impl LayeredTimeline { doomed_layer.delete()?; layers.remove_historic(doomed_layer.clone()); - match ( - doomed_layer.is_dropped(), - doomed_layer.get_seg_tag().rel.is_relation(), - ) { - (true, true) => result.ondisk_relfiles_dropped += 1, - (true, false) => result.ondisk_nonrelfiles_dropped += 1, - (false, true) => result.ondisk_relfiles_removed += 1, - (false, false) => result.ondisk_nonrelfiles_removed += 1, - } + result.layers_removed += 1; } result.elapsed = now.elapsed(); Ok(result) } - fn lookup_cached_page( + /// + /// Reconstruct a value, using the given base image and WAL records in 'data'. + /// + fn reconstruct_value( &self, - rel: &RelishTag, - rel_blknum: BlockNumber, - lsn: Lsn, - ) -> Option<(Lsn, Bytes)> { - let cache = page_cache::get(); - if let RelishTag::Relation(rel_tag) = &rel { - let (lsn, read_guard) = cache.lookup_materialized_page( - self.tenantid, - self.timelineid, - *rel_tag, - rel_blknum, - lsn, - )?; - let img = Bytes::from(read_guard.to_vec()); - Some((lsn, img)) - } else { - None - } - } - - /// - /// Reconstruct a page version from given Layer - /// - fn materialize_page( - &self, - seg: SegmentTag, - seg_blknum: SegmentBlk, - lsn: Lsn, - layer: &dyn Layer, - ) -> Result { - // Check the page cache. We will get back the most recent page with lsn <= `lsn`. - // The cached image can be returned directly if there is no WAL between the cached image - // and requested LSN. The cached image can also be used to reduce the amount of WAL needed - // for redo. 
- let rel = seg.rel; - let rel_blknum = seg.segno * RELISH_SEG_SIZE + seg_blknum; - let cached_page_img = match self.lookup_cached_page(&rel, rel_blknum, lsn) { - Some((cached_lsn, cached_img)) => { - match cached_lsn.cmp(&lsn) { - cmp::Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check - cmp::Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image - cmp::Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn - } - Some((cached_lsn, cached_img)) - } - None => None, - }; - - let mut data = PageReconstructData { - records: Vec::new(), - page_img: cached_page_img, - }; - - // Holds an Arc reference to 'layer_ref' when iterating in the loop below. - let mut layer_arc: Arc; - - // Call the layer's get_page_reconstruct_data function to get the base image - // and WAL records needed to materialize the page. If it returns 'Continue', - // call it again on the predecessor layer until we have all the required data. - let mut layer_ref = layer; - let mut curr_lsn = lsn; - loop { - let result = layer_ref - .get_page_reconstruct_data(seg_blknum, curr_lsn, &mut data) - .with_context(|| { - format!( - "Failed to get reconstruct data {} {:?} {} {}", - layer_ref.get_seg_tag(), - layer_ref.filename(), - seg_blknum, - curr_lsn, - ) - })?; - match result { - PageReconstructResult::Complete => break, - PageReconstructResult::Continue(cont_lsn) => { - // Fetch base image / more WAL from the returned predecessor layer - if let Some((cont_layer, cont_lsn)) = self.get_layer_for_read(seg, cont_lsn)? { - if cont_lsn == curr_lsn { - // We landed on the same layer again. Shouldn't happen, but if it does, - // don't get stuck in an infinite loop. - bail!( - "could not find predecessor of layer {} at {}, layer returned its own LSN", - layer_ref.filename().display(), - cont_lsn - ); - } - layer_arc = cont_layer; - layer_ref = &*layer_arc; - curr_lsn = cont_lsn; - continue; - } else { - bail!( - "could not find predecessor of layer {} at {}", - layer_ref.filename().display(), - cont_lsn - ); - } - } - PageReconstructResult::Missing(lsn) => { - // Oops, we could not reconstruct the page. - if data.records.is_empty() { - // no records, and no base image. This can happen if PostgreSQL extends a relation - // but never writes the page. - // - // Would be nice to detect that situation better. - warn!("Page {} blk {} at {} not found", rel, rel_blknum, lsn); - return Ok(ZERO_PAGE.clone()); - } - bail!( - "No base image found for page {} blk {} at {}/{}", - rel, - rel_blknum, - self.timelineid, - lsn, - ); - } - } - } - - self.reconstruct_page(rel, rel_blknum, lsn, data) - } - - /// - /// Reconstruct a page version, using the given base image and WAL records in 'data'. - /// - fn reconstruct_page( - &self, - rel: RelishTag, - rel_blknum: BlockNumber, + key: Key, request_lsn: Lsn, - mut data: PageReconstructData, + mut data: ValueReconstructState, ) -> Result { // Perform WAL redo if needed data.records.reverse(); // If we have a page image, and no WAL, we're all set if data.records.is_empty() { - if let Some((img_lsn, img)) = &data.page_img { + if let Some((img_lsn, img)) = &data.img { trace!( - "found page image for blk {} in {} at {}, no WAL redo required", - rel_blknum, - rel, + "found page image for key {} at {}, no WAL redo required", + key, img_lsn ); Ok(img.clone()) } else { - // FIXME: this ought to be an error? 
- warn!( - "Page {} blk {} at {} not found", - rel, rel_blknum, request_lsn - ); - Ok(ZERO_PAGE.clone()) + bail!("base image for {} at {} not found", key, request_lsn); } } else { // We need to do WAL redo. // // If we don't have a base image, then the oldest WAL record better initialize // the page - if data.page_img.is_none() && !data.records.first().unwrap().1.will_init() { - // FIXME: this ought to be an error? - warn!( - "Base image for page {}/{} at {} not found, but got {} WAL records", - rel, - rel_blknum, + if data.img.is_none() && !data.records.first().unwrap().1.will_init() { + bail!( + "Base image for {} at {} not found, but got {} WAL records", + key, request_lsn, data.records.len() ); - Ok(ZERO_PAGE.clone()) } else { - let base_img = if let Some((_lsn, img)) = data.page_img { - trace!("found {} WAL records and a base image for blk {} in {} at {}, performing WAL redo", data.records.len(), rel_blknum, rel, request_lsn); + let base_img = if let Some((_lsn, img)) = data.img { + trace!( + "found {} WAL records and a base image for {} at {}, performing WAL redo", + data.records.len(), + key, + request_lsn + ); Some(img) } else { - trace!("found {} WAL records that will init the page for blk {} in {} at {}, performing WAL redo", data.records.len(), rel_blknum, rel, request_lsn); + trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn); None }; - let last_rec_lsn = data.records.last().unwrap().0; + //let last_rec_lsn = data.records.last().unwrap().0; - let img = self.walredo_mgr.request_redo( - rel, - rel_blknum, - request_lsn, - base_img, - data.records, - )?; + let img = + self.walredo_mgr + .request_redo(key, request_lsn, base_img, data.records)?; - if let RelishTag::Relation(rel_tag) = &rel { - let cache = page_cache::get(); - cache.memorize_materialized_page( - self.tenantid, - self.timelineid, - *rel_tag, - rel_blknum, - last_rec_lsn, - &img, - ); - } + // FIXME + /* + if let RelishTag::Relation(rel_tag) = &rel { + let cache = page_cache::get(); + cache.memorize_materialized_page( + self.tenantid, + self.timelineid, + *rel_tag, + rel_blknum, + last_rec_lsn, + &img, + ); + } + */ Ok(img) } } } - - /// - /// This is a helper function to increase current_total_relation_size - /// - fn increase_current_logical_size(&self, diff: u32) { - let val = self - .current_logical_size - .fetch_add(diff as usize, atomic::Ordering::SeqCst); - trace!( - "increase_current_logical_size: {} + {} = {}", - val, - diff, - val + diff as usize, - ); - self.current_logical_size_gauge - .set(val as i64 + diff as i64); - } - - /// - /// This is a helper function to decrease current_total_relation_size - /// - fn decrease_current_logical_size(&self, diff: u32) { - let val = self - .current_logical_size - .fetch_sub(diff as usize, atomic::Ordering::SeqCst); - trace!( - "decrease_current_logical_size: {} - {} = {}", - val, - diff, - val - diff as usize, - ); - self.current_logical_size_gauge - .set(val as i64 - diff as i64); - } } -pub struct LayeredTimelineWriter<'a> { +struct LayeredTimelineWriter<'a> { tl: &'a LayeredTimeline, _write_guard: MutexGuard<'a, ()>, } impl Deref for LayeredTimelineWriter<'_> { - type Target = LayeredTimeline; + type Target = dyn Timeline; fn deref(&self) -> &Self::Target { self.tl @@ -2155,149 +1904,12 @@ impl Deref for LayeredTimelineWriter<'_> { } impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> { - fn put_wal_record( - &self, - lsn: Lsn, - rel: RelishTag, - rel_blknum: u32, - rec: 
ZenithWalRecord, - ) -> Result<()> { - if !rel.is_blocky() && rel_blknum != 0 { - bail!( - "invalid request for block {} for non-blocky relish {}", - rel_blknum, - rel - ); - } - ensure!(lsn.is_aligned(), "unaligned record LSN"); - - let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); - let layer = self.tl.get_layer_for_write(seg, lsn)?; - let delta_size = layer.put_wal_record(lsn, seg_blknum, rec)?; - self.tl - .increase_current_logical_size(delta_size * BLCKSZ as u32); - Ok(()) + fn put(&self, key: Key, lsn: Lsn, value: Value) -> Result<()> { + self.tl.put_value(key, lsn, value) } - fn put_page_image( - &self, - rel: RelishTag, - rel_blknum: BlockNumber, - lsn: Lsn, - img: Bytes, - ) -> Result<()> { - if !rel.is_blocky() && rel_blknum != 0 { - bail!( - "invalid request for block {} for non-blocky relish {}", - rel_blknum, - rel - ); - } - ensure!(lsn.is_aligned(), "unaligned record LSN"); - - let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); - - let layer = self.tl.get_layer_for_write(seg, lsn)?; - let delta_size = layer.put_page_image(seg_blknum, lsn, img)?; - - self.tl - .increase_current_logical_size(delta_size * BLCKSZ as u32); - Ok(()) - } - - fn put_truncation(&self, rel: RelishTag, lsn: Lsn, relsize: BlockNumber) -> Result<()> { - if !rel.is_blocky() { - bail!("invalid truncation for non-blocky relish {}", rel); - } - ensure!(lsn.is_aligned(), "unaligned record LSN"); - - debug!("put_truncation: {} to {} blocks at {}", rel, relsize, lsn); - - let oldsize = self - .tl - .get_relish_size(rel, self.tl.get_last_record_lsn())? - .with_context(|| { - format!( - "attempted to truncate non-existent relish {} at {}", - rel, lsn - ) - })?; - - if oldsize <= relsize { - return Ok(()); - } - let old_last_seg = (oldsize - 1) / RELISH_SEG_SIZE; - - let last_remain_seg = if relsize == 0 { - 0 - } else { - (relsize - 1) / RELISH_SEG_SIZE - }; - - // Drop segments beyond the last remaining segment. - for remove_segno in (last_remain_seg + 1)..=old_last_seg { - let seg = SegmentTag { - rel, - segno: remove_segno, - }; - - let layer = self.tl.get_layer_for_write(seg, lsn)?; - layer.drop_segment(lsn); - } - - // Truncate the last remaining segment to the specified size - if relsize == 0 || relsize % RELISH_SEG_SIZE != 0 { - let seg = SegmentTag { - rel, - segno: last_remain_seg, - }; - let layer = self.tl.get_layer_for_write(seg, lsn)?; - layer.put_truncation(lsn, relsize % RELISH_SEG_SIZE) - } - self.tl - .decrease_current_logical_size((oldsize - relsize) * BLCKSZ as u32); - Ok(()) - } - - fn drop_relish(&self, rel: RelishTag, lsn: Lsn) -> Result<()> { - trace!("drop_segment: {} at {}", rel, lsn); - - if rel.is_blocky() { - if let Some(oldsize) = self - .tl - .get_relish_size(rel, self.tl.get_last_record_lsn())? 
- { - let old_last_seg = if oldsize == 0 { - 0 - } else { - (oldsize - 1) / RELISH_SEG_SIZE - }; - - // Drop all segments of the relish - for remove_segno in 0..=old_last_seg { - let seg = SegmentTag { - rel, - segno: remove_segno, - }; - let layer = self.tl.get_layer_for_write(seg, lsn)?; - layer.drop_segment(lsn); - } - self.tl - .decrease_current_logical_size(oldsize * BLCKSZ as u32); - } else { - warn!( - "drop_segment called on non-existent relish {} at {}", - rel, lsn - ); - } - } else { - // TODO handle TwoPhase relishes - let (seg, _seg_blknum) = SegmentTag::from_blknum(rel, 0); - let layer = self.tl.get_layer_for_write(seg, lsn)?; - layer.drop_segment(lsn); - } - - Ok(()) + fn delete(&self, key_range: Range, lsn: Lsn) -> Result<()> { + self.tl.put_tombstone(key_range, lsn) } /// @@ -2356,6 +1968,8 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { mod tests { use super::*; use crate::repository::repo_harness::*; + use rand::thread_rng; + use rand::Rng; #[test] fn corrupt_metadata() -> Result<()> { @@ -2386,113 +2000,133 @@ mod tests { Ok(()) } - /// - /// Test the logic in 'load_layer_map' that removes layer files that are - /// newer than 'disk_consistent_lsn'. - /// #[test] - fn future_layerfiles() -> Result<()> { - const TEST_NAME: &str = "future_layerfiles"; - let harness = RepoHarness::create(TEST_NAME)?; - let repo = harness.load(); + fn test_images() -> Result<()> { + let repo = RepoHarness::create("test_images")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + + #[allow(non_snake_case)] + let TEST_KEY: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); - // Create a timeline with disk_consistent_lsn = 8000 - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?; let writer = tline.writer(); - writer.advance_last_record_lsn(Lsn(0x8000)); + writer.put(TEST_KEY, Lsn(0x10), Value::Image(TEST_IMG("foo at 0x10")))?; + writer.advance_last_record_lsn(Lsn(0x10)); drop(writer); - repo.checkpoint_iteration(CheckpointConfig::Forced)?; - drop(repo); - let timeline_path = harness.timeline_path(&TIMELINE_ID); + tline.checkpoint(CheckpointConfig::Forced)?; + tline.create_images(1)?; - let make_empty_file = |filename: &str| -> std::io::Result<()> { - let path = timeline_path.join(filename); + let writer = tline.writer(); + writer.put(TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?; + writer.advance_last_record_lsn(Lsn(0x20)); + drop(writer); - assert!(!path.exists()); - std::fs::write(&path, &[])?; + tline.checkpoint(CheckpointConfig::Forced)?; + tline.create_images(1)?; - Ok(()) - }; + let writer = tline.writer(); + writer.put(TEST_KEY, Lsn(0x30), Value::Image(TEST_IMG("foo at 0x30")))?; + writer.advance_last_record_lsn(Lsn(0x30)); + drop(writer); - // Helper function to check that a relation file exists, and a corresponding - // .0.old file does not. 
- let assert_exists = |filename: &str| { - let path = timeline_path.join(filename); - assert!(path.exists(), "file {} was removed", filename); + tline.checkpoint(CheckpointConfig::Forced)?; + tline.create_images(1)?; - // Check that there is no .old file - let backup_path = timeline_path.join(format!("{}.0.old", filename)); - assert!( - !backup_path.exists(), - "unexpected backup file {}", - backup_path.display() - ); - }; + let writer = tline.writer(); + writer.put(TEST_KEY, Lsn(0x40), Value::Image(TEST_IMG("foo at 0x40")))?; + writer.advance_last_record_lsn(Lsn(0x40)); + drop(writer); - // Helper function to check that a relation file does *not* exists, and a corresponding - // ..old file does. - let assert_is_renamed = |filename: &str, num: u32| { - let path = timeline_path.join(filename); - assert!( - !path.exists(), - "file {} was not removed as expected", - filename - ); + tline.checkpoint(CheckpointConfig::Forced)?; + tline.create_images(1)?; - let backup_path = timeline_path.join(format!("{}.{}.old", filename, num)); - assert!( - backup_path.exists(), - "backup file {} was not created", - backup_path.display() - ); - }; + assert_eq!(tline.get(TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); + assert_eq!(tline.get(TEST_KEY, Lsn(0x30))?, TEST_IMG("foo at 0x30")); + assert_eq!(tline.get(TEST_KEY, Lsn(0x40))?, TEST_IMG("foo at 0x40")); - // These files are considered to be in the future and will be renamed out - // of the way - let future_filenames = vec![ - format!("pg_control_0_{:016X}", 0x8001), - format!("pg_control_0_{:016X}_{:016X}", 0x8001, 0x8008), - ]; - // But these are not: - let past_filenames = vec![ - format!("pg_control_0_{:016X}", 0x8000), - format!("pg_control_0_{:016X}_{:016X}", 0x7000, 0x8001), - ]; + Ok(()) + } - for filename in future_filenames.iter().chain(past_filenames.iter()) { - make_empty_file(filename)?; + #[test] + fn test_bulk_insert() -> Result<()> { + let repo = RepoHarness::create("test_bulk_insert")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + + let mut lsn = Lsn(0x10); + + let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap(); + let mut blknum = 0; + for _ in 1..100 { + for _ in 1..10000 { + test_key.field6 = blknum; + let writer = tline.writer(); + writer.put( + test_key, + lsn, + Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + )?; + writer.advance_last_record_lsn(lsn); + drop(writer); + + lsn = Lsn(lsn.0 + 0x10); + blknum += 1; + } + tline.checkpoint(CheckpointConfig::Forced)?; + //tline.create_images(1)?; } - // Load the timeline. This will cause the files in the "future" to be renamed - // away. 
- let new_repo = harness.load(); - new_repo.get_timeline(TIMELINE_ID).unwrap(); - drop(new_repo); + Ok(()) + } - for filename in future_filenames.iter() { - assert_is_renamed(filename, 0); - } - for filename in past_filenames.iter() { - assert_exists(filename); + #[test] + fn test_random_updates() -> Result<()> { + let repo = RepoHarness::create("test_random_updates")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + + const NUM_KEYS: usize = 20000; + + let mut lsn = Lsn(0x10); + + let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap(); + let mut blknum = 0; + for _ in 0..NUM_KEYS { + test_key.field6 = blknum; + let writer = tline.writer(); + writer.put( + test_key, + lsn, + Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + )?; + writer.advance_last_record_lsn(lsn); + drop(writer); + + lsn = Lsn(lsn.0 + 0x10); + blknum += 1; } - // Create the future files again, and load again. They should be renamed to - // *.1.old this time. - for filename in future_filenames.iter() { - make_empty_file(filename)?; - } + for _ in 0..100 { + for _ in 0..NUM_KEYS { + blknum = thread_rng().gen_range(0..10000); + test_key.field6 = blknum; + let writer = tline.writer(); + writer.put( + test_key, + lsn, + Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + )?; + writer.advance_last_record_lsn(lsn); + drop(writer); - let new_repo = harness.load(); - new_repo.get_timeline(TIMELINE_ID).unwrap(); - drop(new_repo); + lsn = Lsn(lsn.0 + 0x10); + } - for filename in future_filenames.iter() { - assert_is_renamed(filename, 0); - assert_is_renamed(filename, 1); - } - for filename in past_filenames.iter() { - assert_exists(filename); + let cutoff = tline.get_last_record_lsn(); + tline.update_gc_info(Vec::new(), cutoff); + tline.checkpoint(CheckpointConfig::Forced)?; + tline.create_images(3)?; + tline.gc()?; } Ok(()) diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 7434b8de11..e7d916d1e4 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -1,6 +1,5 @@ -//! //! A DeltaLayer represents a collection of WAL records or page images in a range of -//! LSNs, for one segment. It is stored on a file on disk. +//! LSNs, and in a range of Keys. It is stored on a file on disk. //! //! Usually a delta layer only contains differences - in the form of WAL records against //! a base LSN. However, if a segment is newly created, by creating a new relation or @@ -11,56 +10,53 @@ //! can happen when you create a new branch in the middle of a delta layer, and the WAL //! records on the new branch are put in a new delta layer. //! -//! When a delta file needs to be accessed, we slurp the metadata and segsize chapters +//! When a delta file needs to be accessed, we slurp the 'index' metadata //! into memory, into the DeltaLayerInner struct. See load() and unload() functions. -//! To access a page/WAL record, we search `page_version_metas` for the block # and LSN. -//! The byte ranges in the metadata can be used to find the page/WAL record in -//! PAGE_VERSIONS_CHAPTER. +//! To access a particular value, we search `index` for the given key. +//! The byte offset in the index can be used to find the value in +//! VALUES_CHAPTER. //! //! On disk, the delta files are stored in timelines/ directory. //! Currently, there are no subdirectories, and each delta file is named like this: //! -//! ______ +//! 
<key start>-<key end>__<LSN start>-<LSN end>

-/// Mapping from (blknum, lsn) -> page/WAL record
-/// byte ranges in PAGE_VERSIONS_CHAPTER
-static PAGE_VERSION_METAS_CHAPTER: u64 = 1;
+/// Mapping from (key, lsn) -> page/WAL record
+/// byte ranges in VALUES_CHAPTER
+static INDEX_CHAPTER: u64 = 1;
+
 /// Page/WAL bytes - cannot be interpreted
-/// without PAGE_VERSION_METAS_CHAPTER
-static PAGE_VERSIONS_CHAPTER: u64 = 2;
-static SEG_SIZES_CHAPTER: u64 = 3;
+/// without the page versions from the INDEX_CHAPTER
+static VALUES_CHAPTER: u64 = 2;

 /// Contains the [`Summary`] struct
-static SUMMARY_CHAPTER: u64 = 4;
+static SUMMARY_CHAPTER: u64 = 3;

 #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
 struct Summary {
     tenantid: ZTenantId,
     timelineid: ZTimelineId,
-    seg: SegmentTag,
-
-    start_lsn: Lsn,
-    end_lsn: Lsn,
-
-    dropped: bool,
+    key_range: Range<Key>,
+    lsn_range: Range<Lsn>,
 }

 impl From<&DeltaLayer> for Summary {
@@ -96,33 +88,17 @@ impl From<&DeltaLayer> for Summary {
         Self {
             tenantid: layer.tenantid,
             timelineid: layer.timelineid,
-            seg: layer.seg,
-
-            start_lsn: layer.start_lsn,
-            end_lsn: layer.end_lsn,
-
-            dropped: layer.dropped,
+            key_range: layer.key_range.clone(),
+            lsn_range: layer.lsn_range.clone(),
         }
     }
 }

-#[derive(Serialize, Deserialize)]
-struct BlobRange {
-    offset: u64,
-    size: usize,
-}
-
-fn read_blob<F: FileExt>(reader: &BoundedReader<&'_ F>, range: &BlobRange) -> Result<Vec<u8>> {
-    let mut buf = vec![0u8; range.size];
-    reader.read_exact_at(&mut buf, range.offset)?;
-    Ok(buf)
-}
-
 ///
 /// DeltaLayer is the in-memory data structure associated with an
 /// on-disk delta file. We keep a DeltaLayer in memory for each
 /// file, in the LayerMap. If a layer is in "loaded" state, we have a
-/// copy of the file in memory, in 'inner'. Otherwise the struct is
+/// copy of the index in memory, in 'inner'. Otherwise the struct is
 /// just a placeholder for a file that exists on disk, and it needs to
 /// be loaded before using it in queries.
 ///
@@ -131,47 +107,24 @@ pub struct DeltaLayer {
     path_or_conf: PathOrConf,
     pub tenantid: ZTenantId,
     pub timelineid: ZTimelineId,

-    pub seg: SegmentTag,
+    pub key_range: Range<Key>,
+    pub lsn_range: Range<Lsn>,

-    //
-    // This entry contains all the changes from 'start_lsn' to 'end_lsn'. The
-    // start is inclusive, and end is exclusive.
-    //
-    pub start_lsn: Lsn,
-    pub end_lsn: Lsn,
-
-    dropped: bool,
-
-    inner: Mutex<DeltaLayerInner>,
+    inner: RwLock<DeltaLayerInner>,
 }

 pub struct DeltaLayerInner {
-    /// If false, the 'page_version_metas' and 'seg_sizes' have not been
-    /// loaded into memory yet.
+    /// If false, the 'index' has not been loaded into memory yet.
     loaded: bool,

+    ///
+    /// All versions of all pages in the layer are kept here.
+    /// Indexed by block number and LSN. The value is an offset into the
+    /// chapter where the page version is stored.
+    ///
+    index: HashMap<Key, VecMap<Lsn, u64>>,
+
     book: Option<Book<VirtualFile>>,
-
-    /// All versions of all pages in the file are are kept here.
-    /// Indexed by block number and LSN.
-    page_version_metas: VecMap<(SegmentBlk, Lsn), BlobRange>,
-
-    /// `seg_sizes` tracks the size of the segment at different points in time.
-    seg_sizes: VecMap<Lsn, SegmentBlk>,
-}
-
-impl DeltaLayerInner {
-    fn get_seg_size(&self, lsn: Lsn) -> Result<SegmentBlk> {
-        // Scan the VecMap backwards, starting from the given entry.
- let slice = self - .seg_sizes - .slice_range((Included(&Lsn(0)), Included(&lsn))); - if let Some((_entry_lsn, entry)) = slice.last() { - Ok(*entry) - } else { - bail!("could not find seg size in delta layer") - } - } } impl Layer for DeltaLayer { @@ -183,40 +136,31 @@ impl Layer for DeltaLayer { self.timelineid } - fn get_seg_tag(&self) -> SegmentTag { - self.seg + fn get_key_range(&self) -> Range { + self.key_range.clone() } - fn is_dropped(&self) -> bool { - self.dropped - } - - fn get_start_lsn(&self) -> Lsn { - self.start_lsn - } - - fn get_end_lsn(&self) -> Lsn { - self.end_lsn + fn get_lsn_range(&self) -> Range { + self.lsn_range.clone() } fn filename(&self) -> PathBuf { PathBuf::from(self.layer_name().to_string()) } - /// Look up given page in the cache. - fn get_page_reconstruct_data( + fn get_value_reconstruct_data( &self, - blknum: SegmentBlk, - lsn: Lsn, - reconstruct_data: &mut PageReconstructData, - ) -> Result { + lsn_floor: Lsn, + reconstruct_state: &mut ValueReconstructState, + ) -> Result { let mut need_image = true; - assert!((0..RELISH_SEG_SIZE).contains(&blknum)); + assert!(self.key_range.contains(&reconstruct_state.key)); - match &reconstruct_data.page_img { - Some((cached_lsn, _)) if &self.end_lsn <= cached_lsn => { - return Ok(PageReconstructResult::Complete) + match &reconstruct_state.img { + Some((cached_lsn, _)) if &self.lsn_range.end <= cached_lsn => { + reconstruct_state.lsn = *cached_lsn; + return Ok(ValueReconstructResult::Complete); } _ => {} } @@ -224,91 +168,74 @@ impl Layer for DeltaLayer { { // Open the file and lock the metadata in memory let inner = self.load()?; - let page_version_reader = inner + let values_reader = inner .book .as_ref() .expect("should be loaded in load call above") - .chapter_reader(PAGE_VERSIONS_CHAPTER)?; + .chapter_reader(VALUES_CHAPTER)?; - // Scan the metadata VecMap backwards, starting from the given entry. - let minkey = (blknum, Lsn(0)); - let maxkey = (blknum, lsn); - let iter = inner - .page_version_metas - .slice_range((Included(&minkey), Included(&maxkey))) - .iter() - .rev(); - for ((_blknum, pv_lsn), blob_range) in iter { - match &reconstruct_data.page_img { - Some((cached_lsn, _)) if pv_lsn <= cached_lsn => { - return Ok(PageReconstructResult::Complete) + // Scan the page versions backwards, starting from `lsn`. 
+ if let Some(vec_map) = inner.index.get(&reconstruct_state.key) { + let slice = vec_map.slice_range(lsn_floor..=reconstruct_state.lsn); + for (entry_lsn, pos) in slice.iter().rev() { + match &reconstruct_state.img { + Some((cached_lsn, _)) if entry_lsn <= cached_lsn => { + reconstruct_state.lsn = *cached_lsn; + return Ok(ValueReconstructResult::Complete); + } + _ => {} } - _ => {} - } - let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?; - - match pv { - PageVersion::Page(img) => { - // Found a page image, return it - reconstruct_data.page_img = Some((*pv_lsn, img)); - need_image = false; - break; - } - PageVersion::Wal(rec) => { - let will_init = rec.will_init(); - reconstruct_data.records.push((*pv_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back + let val = Value::des(&utils::read_blob_from_chapter(&values_reader, *pos)?)?; + match val { + Value::Image(img) => { + reconstruct_state.img = Some((*entry_lsn, img)); need_image = false; break; } + Value::WalRecord(rec) => { + let will_init = rec.will_init(); + reconstruct_state.records.push((*entry_lsn, rec)); + if will_init { + // This WAL record initializes the page, so no need to go further back + need_image = false; + break; + } + } } } } - - // If we didn't find any records for this, check if the request is beyond EOF - if need_image - && reconstruct_data.records.is_empty() - && self.seg.rel.is_blocky() - && blknum >= inner.get_seg_size(lsn)? - { - return Ok(PageReconstructResult::Missing(self.start_lsn)); - } - // release metadata lock and close the file } // If an older page image is needed to reconstruct the page, let the // caller know. if need_image { - Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1))) + reconstruct_state.lsn = Lsn(self.lsn_range.start.0 - 1); + Ok(ValueReconstructResult::Continue) } else { - Ok(PageReconstructResult::Complete) + Ok(ValueReconstructResult::Complete) } } - /// Get size of the relation at given LSN - fn get_seg_size(&self, lsn: Lsn) -> Result { - assert!(lsn >= self.start_lsn); - ensure!( - self.seg.rel.is_blocky(), - "get_seg_size() called on a non-blocky rel" - ); - + // Return a set of all distinct Keys present in this layer + fn collect_keys(&self, key_range: &Range, keys: &mut HashSet) -> Result<()> { let inner = self.load()?; - inner.get_seg_size(lsn) + + keys.extend(inner.index.keys().filter(|x| key_range.contains(x))); + Ok(()) } - /// Does this segment exist at given LSN? - fn get_seg_exists(&self, lsn: Lsn) -> Result { - // Is the requested LSN after the rel was dropped? - if self.dropped && lsn >= self.end_lsn { - return Ok(false); - } + fn iter(&self) -> Box> + '_> { + let inner = self.load().unwrap(); - // Otherwise, it exists. - Ok(true) + let mut pairs: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); + pairs.sort_by_key(|x| x.0); + + match DeltaValueIter::new(inner) { + Ok(iter) => Box::new(iter), + Err(err) => Box::new(std::iter::once(Err(err))) + } } /// @@ -316,14 +243,14 @@ impl Layer for DeltaLayer { /// it will need to be loaded back. /// fn unload(&self) -> Result<()> { - let mut inner = self.inner.lock().unwrap(); - inner.page_version_metas = VecMap::default(); - inner.seg_sizes = VecMap::default(); - inner.loaded = false; + if let Ok(mut inner) = self.inner.try_write() { + inner.index = HashMap::default(); + inner.loaded = false; - // Note: we keep the Book open. Is that a good idea? 
The virtual file - // machinery has its own rules for closing the file descriptor if it's not - // needed, but the Book struct uses up some memory, too. + // Note: we keep the Book open. Is that a good idea? The virtual file + // machinery has its own rules for closing the file descriptor if it's not + // needed, but the Book struct uses up some memory, too. + } Ok(()) } @@ -345,45 +272,52 @@ impl Layer for DeltaLayer { /// debugging function to print out the contents of the layer fn dump(&self) -> Result<()> { println!( - "----- delta layer for ten {} tli {} seg {} {}-{} ----", - self.tenantid, self.timelineid, self.seg, self.start_lsn, self.end_lsn + "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----", + self.tenantid, + self.timelineid, + self.key_range.start, + self.key_range.end, + self.lsn_range.start, + self.lsn_range.end ); - println!("--- seg sizes ---"); let inner = self.load()?; - for (k, v) in inner.seg_sizes.as_slice() { - println!(" {}: {}", k, v); - } - println!("--- page versions ---"); let path = self.path(); let file = std::fs::File::open(&path)?; let book = Book::new(file)?; + let chapter = book.chapter_reader(VALUES_CHAPTER)?; - let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?; - for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() { - let mut desc = String::new(); + let mut values: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); + values.sort_by_key(|k| k.0); - let buf = read_blob(&chapter, blob_range)?; - let pv = PageVersion::des(&buf)?; + for (key, versions) in values { + for (lsn, off) in versions.as_slice() { + let mut desc = String::new(); - match pv { - PageVersion::Page(img) => { - write!(&mut desc, " img {} bytes", img.len())?; - } - PageVersion::Wal(rec) => { - let wal_desc = walrecord::describe_wal_record(&rec); - write!( - &mut desc, - " rec {} bytes will_init: {} {}", - blob_range.size, - rec.will_init(), - wal_desc - )?; + let buf = utils::read_blob_from_chapter(&chapter, *off)?; + let val = Value::des(&buf); + + match val { + Ok(Value::Image(img)) => { + write!(&mut desc, " img {} bytes", img.len())?; + } + Ok(Value::WalRecord(rec)) => { + let wal_desc = walrecord::describe_wal_record(&rec); + write!( + &mut desc, + " rec {} bytes will_init: {} {}", + buf.len(), + rec.will_init(), + wal_desc + )?; + } + Err(err) => { + write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?; + } } + println!(" key {} at {}: {}", key, lsn, desc); } - - println!(" blk {} at {}: {}", blk, lsn, desc); } Ok(()) @@ -408,61 +342,61 @@ impl DeltaLayer { /// /// Load the contents of the file into memory /// - fn load(&self) -> Result> { - // quick exit if already loaded - let mut inner = self.inner.lock().unwrap(); + fn load(&self) -> Result> { + loop { + // quick exit if already loaded + { + let inner = self.inner.read().unwrap(); - if inner.loaded { - return Ok(inner); - } - - let path = self.path(); - - // Open the file if it's not open already. - if inner.book.is_none() { - let file = VirtualFile::open(&path)?; - inner.book = Some(Book::new(file)?); - } - let book = inner.book.as_ref().unwrap(); - - match &self.path_or_conf { - PathOrConf::Conf(_) => { - let chapter = book.read_chapter(SUMMARY_CHAPTER)?; - let actual_summary = Summary::des(&chapter)?; - - let expected_summary = Summary::from(self); - - if actual_summary != expected_summary { - bail!("in-file summary does not match expected summary. 
actual = {:?} expected = {:?}", actual_summary, expected_summary); + if inner.loaded { + return Ok(inner); } } - PathOrConf::Path(path) => { - let actual_filename = Path::new(path.file_name().unwrap()); - let expected_filename = self.filename(); + // need to upgrade to write lock + let mut inner = self.inner.write().unwrap(); - if actual_filename != expected_filename { - println!( - "warning: filename does not match what is expected from in-file summary" - ); - println!("actual: {:?}", actual_filename); - println!("expected: {:?}", expected_filename); + let path = self.path(); + + // Open the file if it's not open already. + if inner.book.is_none() { + let file = VirtualFile::open(&path)?; + inner.book = Some(Book::new(file)?); + } + let book = inner.book.as_ref().unwrap(); + + match &self.path_or_conf { + PathOrConf::Conf(_) => { + let chapter = book.read_chapter(SUMMARY_CHAPTER)?; + let actual_summary = Summary::des(&chapter)?; + + let expected_summary = Summary::from(self); + + if actual_summary != expected_summary { + bail!("in-file summary does not match expected summary. actual = {:?} expected = {:?}", actual_summary, expected_summary); + } + } + PathOrConf::Path(path) => { + let actual_filename = Path::new(path.file_name().unwrap()); + let expected_filename = self.filename(); + + if actual_filename != expected_filename { + println!( + "warning: filename does not match what is expected from in-file summary" + ); + println!("actual: {:?}", actual_filename); + println!("expected: {:?}", expected_filename); + } } } + + let chapter = book.read_chapter(INDEX_CHAPTER)?; + let index = HashMap::des(&chapter)?; + + debug!("loaded from {}", &path.display()); + + inner.index = index; + inner.loaded = true; } - - let chapter = book.read_chapter(PAGE_VERSION_METAS_CHAPTER)?; - let page_version_metas = VecMap::des(&chapter)?; - - let chapter = book.read_chapter(SEG_SIZES_CHAPTER)?; - let seg_sizes = VecMap::des(&chapter)?; - - debug!("loaded from {}", &path.display()); - - inner.page_version_metas = page_version_metas; - inner.seg_sizes = seg_sizes; - inner.loaded = true; - - Ok(inner) } /// Create a DeltaLayer struct representing an existing file on disk. @@ -476,15 +410,12 @@ impl DeltaLayer { path_or_conf: PathOrConf::Conf(conf), timelineid, tenantid, - seg: filename.seg, - start_lsn: filename.start_lsn, - end_lsn: filename.end_lsn, - dropped: filename.dropped, - inner: Mutex::new(DeltaLayerInner { + key_range: filename.key_range.clone(), + lsn_range: filename.lsn_range.clone(), + inner: RwLock::new(DeltaLayerInner { loaded: false, book: None, - page_version_metas: VecMap::default(), - seg_sizes: VecMap::default(), + index: HashMap::default(), }), } } @@ -494,7 +425,7 @@ impl DeltaLayer { /// This variant is only used for debugging purposes, by the 'dump_layerfile' binary. 
pub fn new_for_path(path: &Path, book: &Book) -> Result where - F: std::os::unix::prelude::FileExt, + F: FileExt, { let chapter = book.read_chapter(SUMMARY_CHAPTER)?; let summary = Summary::des(&chapter)?; @@ -503,25 +434,20 @@ impl DeltaLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), timelineid: summary.timelineid, tenantid: summary.tenantid, - seg: summary.seg, - start_lsn: summary.start_lsn, - end_lsn: summary.end_lsn, - dropped: summary.dropped, - inner: Mutex::new(DeltaLayerInner { + key_range: summary.key_range, + lsn_range: summary.lsn_range, + inner: RwLock::new(DeltaLayerInner { loaded: false, book: None, - page_version_metas: VecMap::default(), - seg_sizes: VecMap::default(), + index: HashMap::default(), }), }) } fn layer_name(&self) -> DeltaFileName { DeltaFileName { - seg: self.seg, - start_lsn: self.start_lsn, - end_lsn: self.end_lsn, - dropped: self.dropped, + key_range: self.key_range.clone(), + lsn_range: self.lsn_range.clone(), } } @@ -542,24 +468,24 @@ impl DeltaLayer { /// /// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...) /// -/// 2. Write the contents by calling `put_page_version` for every page +/// 2. Write the contents by calling `put_value` for every page /// version to store in the layer. /// /// 3. Call `finish`. /// pub struct DeltaLayerWriter { conf: &'static PageServerConf, + path: PathBuf, timelineid: ZTimelineId, tenantid: ZTenantId, - seg: SegmentTag, - start_lsn: Lsn, - end_lsn: Lsn, - dropped: bool, - page_version_writer: ChapterWriter>, - pv_offset: u64, + key_start: Key, + lsn_range: Range, - page_version_metas: VecMap<(SegmentBlk, Lsn), BlobRange>, + index: HashMap>, + + values_writer: ChapterWriter>, + end_offset: u64, } impl DeltaLayerWriter { @@ -570,94 +496,85 @@ impl DeltaLayerWriter { conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid: ZTenantId, - seg: SegmentTag, - start_lsn: Lsn, - end_lsn: Lsn, - dropped: bool, + key_start: Key, + lsn_range: Range, ) -> Result { // Create the file // // Note: This overwrites any existing file. There shouldn't be any. // FIXME: throw an error instead? - let path = DeltaLayer::path_for( - &PathOrConf::Conf(conf), - timelineid, - tenantid, - &DeltaFileName { - seg, - start_lsn, - end_lsn, - dropped, - }, - ); + + let path = conf + .timeline_path(&timelineid, &tenantid) + .join(format!("{}-XXX__{:016X}-{:016X}.temp", + key_start, + u64::from(lsn_range.start), + u64::from(lsn_range.end))); + info!("temp deltalayer path {}", path.display()); let file = VirtualFile::create(&path)?; let buf_writer = BufWriter::new(file); let book = BookWriter::new(buf_writer, DELTA_FILE_MAGIC)?; // Open the page-versions chapter for writing. The calls to - // `put_page_version` will use this to write the contents. - let page_version_writer = book.new_chapter(PAGE_VERSIONS_CHAPTER); + // `put_value` will use this to write the contents. + let values_writer = book.new_chapter(VALUES_CHAPTER); Ok(DeltaLayerWriter { conf, + path, timelineid, tenantid, - seg, - start_lsn, - end_lsn, - dropped, - page_version_writer, - page_version_metas: VecMap::default(), - pv_offset: 0, + key_start, + lsn_range, + index: HashMap::new(), + values_writer, + end_offset: 0, }) } /// - /// Append a page version to the file. + /// Append a key-value pair to the file. /// - /// 'buf' is a serialized PageVersion. - /// The page versions must be appended in blknum, lsn order. + /// The values must be appended in key, lsn order. 
/// - pub fn put_page_version(&mut self, blknum: SegmentBlk, lsn: Lsn, buf: &[u8]) -> Result<()> { + pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> { + //info!("DELTA: key {} at {} on {}", key, lsn, self.path.display()); + assert!(self.lsn_range.start <= lsn); // Remember the offset and size metadata. The metadata is written // to a separate chapter, in `finish`. - let blob_range = BlobRange { - offset: self.pv_offset, - size: buf.len(), - }; - self.page_version_metas - .append((blknum, lsn), blob_range) - .unwrap(); - - // write the page version - self.page_version_writer.write_all(buf)?; - self.pv_offset += buf.len() as u64; + let off = self.end_offset; + let len = utils::write_blob(&mut self.values_writer, &Value::ser(&val)?)?; + self.end_offset += len; + let vec_map = self.index.entry(key).or_default(); + let old = vec_map.append_or_update_last(lsn, off).unwrap().0; + if old.is_some() { + // We already had an entry for this LSN. That's odd.. + bail!( + "Value for {} at {} already exists in delta layer being built", + key, + lsn + ); + } Ok(()) } + pub fn size(&self) -> u64 { + self.end_offset + } + /// /// Finish writing the delta layer. /// /// 'seg_sizes' is a list of size changes to store with the actual data. /// - pub fn finish(self, seg_sizes: VecMap) -> Result { - // Close the page-versions chapter - let book = self.page_version_writer.close()?; + pub fn finish(self, key_end: Key) -> Result { + // Close the values chapter + let book = self.values_writer.close()?; - // Write out page versions metadata - let mut chapter = book.new_chapter(PAGE_VERSION_METAS_CHAPTER); - let buf = VecMap::ser(&self.page_version_metas)?; - chapter.write_all(&buf)?; - let book = chapter.close()?; - - if self.seg.rel.is_blocky() { - assert!(!seg_sizes.is_empty()); - } - - // and seg_sizes to separate chapter - let mut chapter = book.new_chapter(SEG_SIZES_CHAPTER); - let buf = VecMap::ser(&seg_sizes)?; + // Write out the index + let mut chapter = book.new_chapter(INDEX_CHAPTER); + let buf = HashMap::ser(&self.index)?; chapter.write_all(&buf)?; let book = chapter.close()?; @@ -665,12 +582,8 @@ impl DeltaLayerWriter { let summary = Summary { tenantid: self.tenantid, timelineid: self.timelineid, - seg: self.seg, - - start_lsn: self.start_lsn, - end_lsn: self.end_lsn, - - dropped: self.dropped, + key_range: self.key_start..key_end, + lsn_range: self.lsn_range.clone(), }; Summary::ser_into(&summary, &mut chapter)?; let book = chapter.close()?; @@ -685,20 +598,111 @@ impl DeltaLayerWriter { path_or_conf: PathOrConf::Conf(self.conf), tenantid: self.tenantid, timelineid: self.timelineid, - seg: self.seg, - start_lsn: self.start_lsn, - end_lsn: self.end_lsn, - dropped: self.dropped, - inner: Mutex::new(DeltaLayerInner { + key_range: self.key_start..key_end, + lsn_range: self.lsn_range.clone(), + inner: RwLock::new(DeltaLayerInner { loaded: false, + index: HashMap::new(), book: None, - page_version_metas: VecMap::default(), - seg_sizes: VecMap::default(), }), }; trace!("created delta layer {}", &layer.path().display()); + // Rename the file to its final name + // + // Note: This overwrites any existing file. There shouldn't be any. + // FIXME: throw an error instead? 
+        let final_path = DeltaLayer::path_for(
+            &PathOrConf::Conf(self.conf),
+            self.timelineid,
+            self.tenantid,
+            &DeltaFileName {
+                key_range: self.key_start..key_end,
+                lsn_range: self.lsn_range,
+            },
+        );
+        std::fs::rename(self.path, final_path)?;
+
         Ok(layer)
     }
+
+    pub fn abort(self) {
+        match self.values_writer.close() {
+            Ok(book) => {
+                if let Err(err) = book.close() {
+                    error!("error while closing delta layer file: {}", err);
+                }
+            }
+            Err(err) => {
+                error!("error while closing chapter writer: {}", err);
+            }
+        }
+        if let Err(err) = std::fs::remove_file(self.path) {
+            error!("error removing unfinished delta layer file: {}", err);
+        }
+    }
 }
+
+struct DeltaValueIter<'a> {
+    all_offsets: Vec<(Key, Lsn, u64)>,
+    next_idx: usize,
+
+    inner: RwLockReadGuard<'a, DeltaLayerInner>,
+}
+
+impl<'a> Iterator for DeltaValueIter<'a> {
+    type Item = Result<(Key, Lsn, Value)>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.next_res().transpose()
+    }
+}
+
+///
+/// Iterator over all key-value pairs stored in a delta layer
+///
+/// FIXME: This creates a Vector to hold the offsets of all key value pairs.
+/// That takes up quite a lot of memory. Should do this in a more streaming
+/// fashion.
+///
+impl<'a> DeltaValueIter<'a> {
+    fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result<Self> {
+        let mut index: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
+        index.sort_by_key(|x| x.0);
+
+        let mut all_offsets: Vec<(Key, Lsn, u64)> = Vec::new();
+        for (key, vec_map) in index.iter() {
+            for (lsn, off) in vec_map.as_slice().iter() {
+                all_offsets.push((**key, *lsn, *off));
+            }
+        }
+
+        Ok(DeltaValueIter {
+            all_offsets,
+            inner,
+            next_idx: 0,
+        })
+    }
+
+    fn next_res(&mut self) -> Result<Option<(Key, Lsn, Value)>> {
+        if self.next_idx < self.all_offsets.len() {
+            let (key, lsn, off) = self.all_offsets[self.next_idx];
+
+            let values_reader = self.inner
+                .book
+                .as_ref()
+                .expect("should be loaded in load call above")
+                .chapter_reader(VALUES_CHAPTER)?;
+
+            let val = Value::des(&utils::read_blob_from_chapter(&values_reader, off)?)?;
+
+            self.next_idx += 1;
+            Ok(Some((key, lsn, val)))
+        } else {
+            Ok(None)
+        }
+    }
+}
+
diff --git a/pageserver/src/layered_repository/filename.rs b/pageserver/src/layered_repository/filename.rs
index df23700dfd..e9d9c9dbbd 100644
--- a/pageserver/src/layered_repository/filename.rs
+++ b/pageserver/src/layered_repository/filename.rs
@@ -2,29 +2,52 @@
 //! Helper functions for dealing with filenames of the image and delta layer files.
 //!
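
The layer file names introduced by this patch encode both dimensions of a layer directly: delta layers are named {key start}-{key end}__{LSN start}-{LSN end} and image layers {key start}-{key end}__{LSN}, with LSNs as zero-padded hex (see the Display and parse_str implementations below). A minimal round-trip sketch of the delta variant, assuming nothing beyond what those implementations show; the DeltaName struct, String keys, and u64 LSNs here are stand-ins for the pageserver's own types:

// Sketch: round-trip of the delta layer name format
//   <key start>-<key end>__<LSN start>-<LSN end>
use std::ops::Range;

#[derive(Debug, PartialEq)]
struct DeltaName {
    key_range: Range<String>, // hex strings, fixed width in practice
    lsn_range: Range<u64>,
}

fn parse_delta_name(fname: &str) -> Option<DeltaName> {
    let mut parts = fname.split("__");
    let mut key_parts = parts.next()?.split('-');
    let mut lsn_parts = parts.next()?.split('-');
    if parts.next().is_some() {
        return None; // more than one "__" separator
    }
    let key_start = key_parts.next()?.to_owned();
    let key_end = key_parts.next()?.to_owned();
    let lsn_start = u64::from_str_radix(lsn_parts.next()?, 16).ok()?;
    let lsn_end = u64::from_str_radix(lsn_parts.next()?, 16).ok()?;
    // Reject trailing junk and empty or inverted ranges, as parse_str below does.
    if key_parts.next().is_some() || lsn_parts.next().is_some() {
        return None;
    }
    if key_start >= key_end || lsn_start >= lsn_end {
        return None;
    }
    Some(DeltaName {
        key_range: key_start..key_end,
        lsn_range: lsn_start..lsn_end,
    })
}

fn format_delta_name(n: &DeltaName) -> String {
    format!(
        "{}-{}__{:016X}-{:016X}",
        n.key_range.start, n.key_range.end, n.lsn_range.start, n.lsn_range.end
    )
}

fn main() {
    let name = "000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__0000000001000000-0000000001000080";
    let parsed = parse_delta_name(name).expect("well-formed layer file name");
    assert_eq!(format_delta_name(&parsed), name);
}

Because keys and LSNs are written in a fixed hex alphabet, neither '-' nor '__' can occur inside a field, which is what keeps the format unambiguous to split.
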
use crate::config::PageServerConf; -use crate::layered_repository::storage_layer::SegmentTag; -use crate::relish::*; +use crate::repository::Key; +use std::cmp::Ordering; use std::fmt; +use std::ops::Range; use std::path::PathBuf; use zenith_utils::lsn::Lsn; // Note: LayeredTimeline::load_layer_map() relies on this sort order -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] +#[derive(Debug, PartialEq, Eq, Clone)] pub struct DeltaFileName { - pub seg: SegmentTag, - pub start_lsn: Lsn, - pub end_lsn: Lsn, - pub dropped: bool, + pub key_range: Range, + pub lsn_range: Range, +} + +impl PartialOrd for DeltaFileName { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for DeltaFileName { + fn cmp(&self, other: &Self) -> Ordering { + let mut cmp; + + cmp = self.key_range.start.cmp(&other.key_range.start); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.key_range.end.cmp(&other.key_range.end); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.lsn_range.start.cmp(&other.lsn_range.start); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.lsn_range.end.cmp(&other.lsn_range.end); + + cmp + } } /// Represents the filename of a DeltaLayer /// -/// ______ -/// -/// or if it was dropped: -/// -/// _______DROPPED +/// --- /// impl DeltaFileName { /// @@ -32,234 +55,124 @@ impl DeltaFileName { /// match the expected pattern. /// pub fn parse_str(fname: &str) -> Option { - let rel; - let mut parts; - if let Some(rest) = fname.strip_prefix("rel_") { - parts = rest.split('_'); - rel = RelishTag::Relation(RelTag { - spcnode: parts.next()?.parse::().ok()?, - dbnode: parts.next()?.parse::().ok()?, - relnode: parts.next()?.parse::().ok()?, - forknum: parts.next()?.parse::().ok()?, - }); - } else if let Some(rest) = fname.strip_prefix("pg_xact_") { - parts = rest.split('_'); - rel = RelishTag::Slru { - slru: SlruKind::Clog, - segno: u32::from_str_radix(parts.next()?, 16).ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") { - parts = rest.split('_'); - rel = RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno: u32::from_str_radix(parts.next()?, 16).ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") { - parts = rest.split('_'); - rel = RelishTag::Slru { - slru: SlruKind::MultiXactOffsets, - segno: u32::from_str_radix(parts.next()?, 16).ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") { - parts = rest.split('_'); - rel = RelishTag::FileNodeMap { - spcnode: parts.next()?.parse::().ok()?, - dbnode: parts.next()?.parse::().ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_twophase_") { - parts = rest.split('_'); - rel = RelishTag::TwoPhase { - xid: parts.next()?.parse::().ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") { - parts = rest.split('_'); - rel = RelishTag::Checkpoint; - } else if let Some(rest) = fname.strip_prefix("pg_control_") { - parts = rest.split('_'); - rel = RelishTag::ControlFile; - } else { + let mut parts = fname.split("__"); + let mut key_parts = parts.next()?.split('-'); + let mut lsn_parts = parts.next()?.split('-'); + + let key_start_str = key_parts.next()?; + let key_end_str = key_parts.next()?; + let lsn_start_str = lsn_parts.next()?; + let lsn_end_str = lsn_parts.next()?; + if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() { return None; } - let segno = parts.next()?.parse::().ok()?; + let key_start = 
Key::from_hex(key_start_str).ok()?; + let key_end = Key::from_hex(key_end_str).ok()?; - let seg = SegmentTag { rel, segno }; + let start_lsn = Lsn::from_hex(lsn_start_str).ok()?; + let end_lsn = Lsn::from_hex(lsn_end_str).ok()?; - let start_lsn = Lsn::from_hex(parts.next()?).ok()?; - let end_lsn = Lsn::from_hex(parts.next()?).ok()?; - - let mut dropped = false; - if let Some(suffix) = parts.next() { - if suffix == "DROPPED" { - dropped = true; - } else { - return None; - } - } - if parts.next().is_some() { + if start_lsn >= end_lsn { return None; + // or panic? + } + + if key_start >= key_end { + return None; + // or panic? } Some(DeltaFileName { - seg, - start_lsn, - end_lsn, - dropped, + key_range: key_start..key_end, + lsn_range: start_lsn..end_lsn, }) } } impl fmt::Display for DeltaFileName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let basename = match self.seg.rel { - RelishTag::Relation(reltag) => format!( - "rel_{}_{}_{}_{}", - reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum - ), - RelishTag::Slru { - slru: SlruKind::Clog, - segno, - } => format!("pg_xact_{:04X}", segno), - RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno, - } => format!("pg_multixact_members_{:04X}", segno), - RelishTag::Slru { - slru: SlruKind::MultiXactOffsets, - segno, - } => format!("pg_multixact_offsets_{:04X}", segno), - RelishTag::FileNodeMap { spcnode, dbnode } => { - format!("pg_filenodemap_{}_{}", spcnode, dbnode) - } - RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid), - RelishTag::Checkpoint => "pg_control_checkpoint".to_string(), - RelishTag::ControlFile => "pg_control".to_string(), - }; - write!( f, - "{}_{}_{:016X}_{:016X}{}", - basename, - self.seg.segno, - u64::from(self.start_lsn), - u64::from(self.end_lsn), - if self.dropped { "_DROPPED" } else { "" } + "{}-{}__{:016X}-{:016X}", + self.key_range.start, + self.key_range.end, + u64::from(self.lsn_range.start), + u64::from(self.lsn_range.end), ) } } -#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)] +#[derive(Debug, PartialEq, Eq, Clone)] pub struct ImageFileName { - pub seg: SegmentTag, + pub key_range: Range, pub lsn: Lsn, } +impl PartialOrd for ImageFileName { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for ImageFileName { + fn cmp(&self, other: &Self) -> Ordering { + let mut cmp; + + cmp = self.key_range.start.cmp(&other.key_range.start); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.key_range.end.cmp(&other.key_range.end); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.lsn.cmp(&other.lsn); + + cmp + } +} + /// /// Represents the filename of an ImageLayer /// /// _____ -/// +/// FIXME impl ImageFileName { /// /// Parse a string as an image file name. Returns None if the filename does not /// match the expected pattern. 
/// pub fn parse_str(fname: &str) -> Option { - let rel; - let mut parts; - if let Some(rest) = fname.strip_prefix("rel_") { - parts = rest.split('_'); - rel = RelishTag::Relation(RelTag { - spcnode: parts.next()?.parse::().ok()?, - dbnode: parts.next()?.parse::().ok()?, - relnode: parts.next()?.parse::().ok()?, - forknum: parts.next()?.parse::().ok()?, - }); - } else if let Some(rest) = fname.strip_prefix("pg_xact_") { - parts = rest.split('_'); - rel = RelishTag::Slru { - slru: SlruKind::Clog, - segno: u32::from_str_radix(parts.next()?, 16).ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") { - parts = rest.split('_'); - rel = RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno: u32::from_str_radix(parts.next()?, 16).ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") { - parts = rest.split('_'); - rel = RelishTag::Slru { - slru: SlruKind::MultiXactOffsets, - segno: u32::from_str_radix(parts.next()?, 16).ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") { - parts = rest.split('_'); - rel = RelishTag::FileNodeMap { - spcnode: parts.next()?.parse::().ok()?, - dbnode: parts.next()?.parse::().ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_twophase_") { - parts = rest.split('_'); - rel = RelishTag::TwoPhase { - xid: parts.next()?.parse::().ok()?, - }; - } else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") { - parts = rest.split('_'); - rel = RelishTag::Checkpoint; - } else if let Some(rest) = fname.strip_prefix("pg_control_") { - parts = rest.split('_'); - rel = RelishTag::ControlFile; - } else { + let mut parts = fname.split("__"); + let mut key_parts = parts.next()?.split('-'); + + let key_start_str = key_parts.next()?; + let key_end_str = key_parts.next()?; + let lsn_str = parts.next()?; + if parts.next().is_some() || key_parts.next().is_some() { return None; } - let segno = parts.next()?.parse::().ok()?; + let key_start = Key::from_hex(key_start_str).ok()?; + let key_end = Key::from_hex(key_end_str).ok()?; - let seg = SegmentTag { rel, segno }; + let lsn = Lsn::from_hex(lsn_str).ok()?; - let lsn = Lsn::from_hex(parts.next()?).ok()?; - - if parts.next().is_some() { - return None; - } - - Some(ImageFileName { seg, lsn }) + Some(ImageFileName { + key_range: key_start..key_end, + lsn, + }) } } impl fmt::Display for ImageFileName { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let basename = match self.seg.rel { - RelishTag::Relation(reltag) => format!( - "rel_{}_{}_{}_{}", - reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum - ), - RelishTag::Slru { - slru: SlruKind::Clog, - segno, - } => format!("pg_xact_{:04X}", segno), - RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno, - } => format!("pg_multixact_members_{:04X}", segno), - RelishTag::Slru { - slru: SlruKind::MultiXactOffsets, - segno, - } => format!("pg_multixact_offsets_{:04X}", segno), - RelishTag::FileNodeMap { spcnode, dbnode } => { - format!("pg_filenodemap_{}_{}", spcnode, dbnode) - } - RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid), - RelishTag::Checkpoint => "pg_control_checkpoint".to_string(), - RelishTag::ControlFile => "pg_control".to_string(), - }; - write!( f, - "{}_{}_{:016X}", - basename, - self.seg.segno, + "{}-{}__{:016X}", + self.key_range.start, + self.key_range.end, u64::from(self.lsn), ) } diff --git a/pageserver/src/layered_repository/global_layer_map.rs b/pageserver/src/layered_repository/global_layer_map.rs deleted file 
mode 100644 index 169a89650a..0000000000 --- a/pageserver/src/layered_repository/global_layer_map.rs +++ /dev/null @@ -1,142 +0,0 @@ -//! -//! Global registry of open layers. -//! -//! Whenever a new in-memory layer is created to hold incoming WAL, it is registered -//! in [`GLOBAL_LAYER_MAP`], so that we can keep track of the total number of -//! in-memory layers in the system, and know when we need to evict some to release -//! memory. -//! -//! Each layer is assigned a unique ID when it's registered in the global registry. -//! The ID can be used to relocate the layer later, without having to hold locks. -//! - -use std::sync::atomic::{AtomicU8, Ordering}; -use std::sync::{Arc, RwLock}; - -use super::inmemory_layer::InMemoryLayer; - -use lazy_static::lazy_static; - -const MAX_USAGE_COUNT: u8 = 5; - -lazy_static! { - pub static ref GLOBAL_LAYER_MAP: RwLock = - RwLock::new(InMemoryLayers::default()); -} - -// TODO these types can probably be smaller -#[derive(PartialEq, Eq, Clone, Copy)] -pub struct LayerId { - index: usize, - tag: u64, // to avoid ABA problem -} - -enum SlotData { - Occupied(Arc), - /// Vacant slots form a linked list, the value is the index - /// of the next vacant slot in the list. - Vacant(Option), -} - -struct Slot { - tag: u64, - data: SlotData, - usage_count: AtomicU8, // for clock algorithm -} - -#[derive(Default)] -pub struct InMemoryLayers { - slots: Vec, - num_occupied: usize, - - // Head of free-slot list. - next_empty_slot_idx: Option, -} - -impl InMemoryLayers { - pub fn insert(&mut self, layer: Arc) -> LayerId { - let slot_idx = match self.next_empty_slot_idx { - Some(slot_idx) => slot_idx, - None => { - let idx = self.slots.len(); - self.slots.push(Slot { - tag: 0, - data: SlotData::Vacant(None), - usage_count: AtomicU8::new(0), - }); - idx - } - }; - let slots_len = self.slots.len(); - - let slot = &mut self.slots[slot_idx]; - - match slot.data { - SlotData::Occupied(_) => { - panic!("an occupied slot was in the free list"); - } - SlotData::Vacant(next_empty_slot_idx) => { - self.next_empty_slot_idx = next_empty_slot_idx; - } - } - - slot.data = SlotData::Occupied(layer); - slot.usage_count.store(1, Ordering::Relaxed); - - self.num_occupied += 1; - assert!(self.num_occupied <= slots_len); - - LayerId { - index: slot_idx, - tag: slot.tag, - } - } - - pub fn get(&self, layer_id: &LayerId) -> Option> { - let slot = self.slots.get(layer_id.index)?; // TODO should out of bounds indexes just panic? 
-        if slot.tag != layer_id.tag {
-            return None;
-        }
-
-        if let SlotData::Occupied(layer) = &slot.data {
-            let _ = slot.usage_count.fetch_update(
-                Ordering::Relaxed,
-                Ordering::Relaxed,
-                |old_usage_count| {
-                    if old_usage_count < MAX_USAGE_COUNT {
-                        Some(old_usage_count + 1)
-                    } else {
-                        None
-                    }
-                },
-            );
-            Some(Arc::clone(layer))
-        } else {
-            None
-        }
-    }
-
-    // TODO this won't be a public API in the future
-    pub fn remove(&mut self, layer_id: &LayerId) {
-        let slot = &mut self.slots[layer_id.index];
-
-        if slot.tag != layer_id.tag {
-            return;
-        }
-
-        match &slot.data {
-            SlotData::Occupied(_layer) => {
-                // TODO evict the layer
-            }
-            SlotData::Vacant(_) => unimplemented!(),
-        }
-
-        slot.data = SlotData::Vacant(self.next_empty_slot_idx);
-        self.next_empty_slot_idx = Some(layer_id.index);
-
-        assert!(self.num_occupied > 0);
-        self.num_occupied -= 1;
-
-        slot.tag = slot.tag.wrapping_add(1);
-    }
-}
diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs
index 24445ff7e9..c13293105a 100644
--- a/pageserver/src/layered_repository/image_layer.rs
+++ b/pageserver/src/layered_repository/image_layer.rs
@@ -4,38 +4,34 @@
 //! On disk, the image files are stored in timelines/<timelineid> directory.
 //! Currently, there are no subdirectories, and each image layer file is named like this:
 //!
-//! Note that segno is
-//! <spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<LSN>
+//! <key start>-<key end>__<LSN>
 //!
 //! For example:
 //!
-//! 1663_13990_2609_0_5_000000000169C348
+//! 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
 //!
 //! An image file is constructed using the 'bookfile' crate.
 //!
 //! Only metadata is loaded into memory by the load function.
 //! When images are needed, they are read directly from disk.
 //!
-//! For blocky relishes, the images are stored in BLOCKY_IMAGES_CHAPTER.
-//! All the images are required to be BLOCK_SIZE, which allows for random access.
-//!
-//! For non-blocky relishes, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
-//!
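
The header rewrite above reflects the storage model change: the old image file held fixed-size 8192-byte images addressed by block number, while the new one stores variable-length blobs in a values chapter plus a per-key offset index. A rough in-memory sketch of that layout; the u128 keys, the Vec<u8> standing in for a bookfile chapter, and the u32 length prefix are all assumptions for illustration, not the actual on-disk encoding:

use std::collections::HashMap;
use std::convert::TryInto;

struct ImageChapter {
    values: Vec<u8>,           // stand-in for VALUES_CHAPTER
    index: HashMap<u128, u64>, // stand-in for INDEX_CHAPTER: key -> byte offset
}

impl ImageChapter {
    fn put_image(&mut self, key: u128, img: &[u8]) {
        // Blobs are variable length, so record a length prefix and the offset.
        let off = self.values.len() as u64;
        self.values.extend_from_slice(&(img.len() as u32).to_be_bytes());
        self.values.extend_from_slice(img);
        self.index.insert(key, off);
    }

    fn get_image(&self, key: u128) -> Option<&[u8]> {
        // One index probe, then one bounded read: no fixed block size assumed.
        let off = *self.index.get(&key)? as usize;
        let len_bytes: [u8; 4] = self.values.get(off..off + 4)?.try_into().ok()?;
        let len = u32::from_be_bytes(len_bytes) as usize;
        self.values.get(off + 4..off + 4 + len)
    }
}

fn main() {
    let mut ch = ImageChapter { values: Vec::new(), index: HashMap::new() };
    ch.put_image(1, b"pages need not be 8 KiB anymore");
    ch.put_image(2, b"short");
    assert_eq!(ch.get_image(2), Some(&b"short"[..]));
    assert_eq!(ch.get_image(3), None);
}

The price of dropping the fixed block size is the index probe on every lookup, which is presumably why the load() path below keeps the whole index in memory while a layer is in use.
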
use crate::config::PageServerConf; use crate::layered_repository::filename::{ImageFileName, PathOrConf}; use crate::layered_repository::storage_layer::{ - Layer, PageReconstructData, PageReconstructResult, SegmentBlk, SegmentTag, + Layer, ValueReconstructResult, ValueReconstructState, }; -use crate::layered_repository::RELISH_SEG_SIZE; +use crate::layered_repository::utils; +use crate::repository::{Key, Value}; use crate::virtual_file::VirtualFile; use crate::{ZTenantId, ZTimelineId}; -use anyhow::{anyhow, bail, ensure, Context, Result}; +use anyhow::{bail, Context, Result}; use bytes::Bytes; use log::*; use serde::{Deserialize, Serialize}; -use std::convert::TryInto; +use std::collections::{HashMap, HashSet}; use std::fs; use std::io::{BufWriter, Write}; +use std::ops::Range; use std::path::{Path, PathBuf}; use std::sync::{Mutex, MutexGuard}; @@ -44,12 +40,16 @@ use bookfile::{Book, BookWriter, ChapterWriter}; use zenith_utils::bin_ser::BeSer; use zenith_utils::lsn::Lsn; -// Magic constant to identify a Zenith segment image file +// Magic constant to identify a Zenith image layer file +// FIXME: bump all magics pub const IMAGE_FILE_MAGIC: u32 = 0x5A616E01 + 1; +/// Mapping from (key, lsn) -> page/WAL record +/// byte ranges in VALUES_CHAPTER +static INDEX_CHAPTER: u64 = 1; + /// Contains each block in block # order -const BLOCKY_IMAGES_CHAPTER: u64 = 1; -const NONBLOCKY_IMAGE_CHAPTER: u64 = 2; +const VALUES_CHAPTER: u64 = 2; /// Contains the [`Summary`] struct const SUMMARY_CHAPTER: u64 = 3; @@ -58,7 +58,7 @@ const SUMMARY_CHAPTER: u64 = 3; struct Summary { tenantid: ZTenantId, timelineid: ZTimelineId, - seg: SegmentTag, + key_range: Range, lsn: Lsn, } @@ -68,19 +68,17 @@ impl From<&ImageLayer> for Summary { Self { tenantid: layer.tenantid, timelineid: layer.timelineid, - seg: layer.seg, + key_range: layer.key_range.clone(), lsn: layer.lsn, } } } -const BLOCK_SIZE: usize = 8192; - /// /// ImageLayer is the in-memory data structure associated with an on-disk image /// file. We keep an ImageLayer in memory for each file, in the LayerMap. If a -/// layer is in "loaded" state, we have a copy of the file in memory, in 'inner'. +/// layer is in "loaded" state, we have a copy of the index in memory, in 'inner'. /// Otherwise the struct is just a placeholder for a file that exists on disk, /// and it needs to be loaded before using it in queries. /// @@ -88,7 +86,7 @@ pub struct ImageLayer { path_or_conf: PathOrConf, pub tenantid: ZTenantId, pub timelineid: ZTimelineId, - pub seg: SegmentTag, + pub key_range: Range, // This entry contains an image of all pages as of this LSN pub lsn: Lsn, @@ -96,18 +94,15 @@ pub struct ImageLayer { inner: Mutex, } -#[derive(Clone)] -enum ImageType { - Blocky { num_blocks: SegmentBlk }, - NonBlocky, -} - pub struct ImageLayerInner { - /// If None, the 'image_type' has not been loaded into memory yet. + /// If false, the 'index' has not been loaded into memory yet. + loaded: bool, + + /// If None, the 'image_type' has not been loaded into memory yet. 
FIXME book: Option>, - /// Derived from filename and bookfile chapter metadata - image_type: ImageType, + /// offset of each value + index: HashMap, } impl Layer for ImageLayer { @@ -123,98 +118,80 @@ impl Layer for ImageLayer { self.timelineid } - fn get_seg_tag(&self) -> SegmentTag { - self.seg + fn get_key_range(&self) -> Range { + self.key_range.clone() } - fn is_dropped(&self) -> bool { - false - } - - fn get_start_lsn(&self) -> Lsn { - self.lsn - } - - fn get_end_lsn(&self) -> Lsn { + fn get_lsn_range(&self) -> Range { // End-bound is exclusive - self.lsn + 1 + self.lsn..(self.lsn + 1) } /// Look up given page in the file - fn get_page_reconstruct_data( + fn get_value_reconstruct_data( &self, - blknum: SegmentBlk, - lsn: Lsn, - reconstruct_data: &mut PageReconstructData, - ) -> Result { - assert!((0..RELISH_SEG_SIZE).contains(&blknum)); - assert!(lsn >= self.lsn); + lsn_floor: Lsn, + reconstruct_state: &mut ValueReconstructState, + ) -> Result { + assert!(lsn_floor <= self.lsn); + assert!(self.key_range.contains(&reconstruct_state.key)); + assert!(reconstruct_state.lsn >= self.lsn); - match reconstruct_data.page_img { + match reconstruct_state.img { Some((cached_lsn, _)) if self.lsn <= cached_lsn => { - return Ok(PageReconstructResult::Complete) + reconstruct_state.lsn = cached_lsn; + return Ok(ValueReconstructResult::Complete); } _ => {} } let inner = self.load()?; - let buf = match &inner.image_type { - ImageType::Blocky { num_blocks } => { - // Check if the request is beyond EOF - if blknum >= *num_blocks { - return Ok(PageReconstructResult::Missing(lsn)); - } + if let Some(offset) = inner.index.get(&reconstruct_state.key) { + let chapter = inner + .book + .as_ref() + .unwrap() + .chapter_reader(VALUES_CHAPTER)?; - let mut buf = vec![0u8; BLOCK_SIZE]; - let offset = BLOCK_SIZE as u64 * blknum as u64; + let blob = utils::read_blob_from_chapter(&chapter, *offset).with_context(|| { + format!( + "failed to read value from data file {} at offset {}", + self.filename().display(), + offset + ) + })?; + let value = Bytes::from(blob); - let chapter = inner - .book - .as_ref() - .unwrap() - .chapter_reader(BLOCKY_IMAGES_CHAPTER)?; - - chapter.read_exact_at(&mut buf, offset).with_context(|| { - format!( - "failed to read page from data file {} at offset {}", - self.filename().display(), - offset - ) - })?; - - buf - } - ImageType::NonBlocky => { - ensure!(blknum == 0); - inner - .book - .as_ref() - .unwrap() - .read_chapter(NONBLOCKY_IMAGE_CHAPTER)? - .into_vec() - } - }; - - reconstruct_data.page_img = Some((self.lsn, Bytes::from(buf))); - Ok(PageReconstructResult::Complete) - } - - /// Get size of the segment - fn get_seg_size(&self, _lsn: Lsn) -> Result { - let inner = self.load()?; - match inner.image_type { - ImageType::Blocky { num_blocks } => Ok(num_blocks), - ImageType::NonBlocky => Err(anyhow!("get_seg_size called for non-blocky segment")), + reconstruct_state.img = Some((self.lsn, value)); + reconstruct_state.lsn = self.lsn; + Ok(ValueReconstructResult::Complete) + } else { + reconstruct_state.lsn = self.lsn; + Ok(ValueReconstructResult::Missing) } } - /// Does this segment exist at given LSN? - fn get_seg_exists(&self, _lsn: Lsn) -> Result { - Ok(true) + fn collect_keys(&self, key_range: &Range, keys: &mut HashSet) -> Result<()> { + let inner = self.load()?; + + let index = &inner.index; + + keys.extend(index.keys().filter(|x| key_range.contains(x))); + Ok(()) } + fn iter(&self) -> Box>> { + todo!(); + } + fn unload(&self) -> Result<()> { + // TODO: unload 'segs'. 
Or even better, don't hold it in memory but + // access it directly from the file (using the buffer cache) + let mut inner = self.inner.lock().unwrap(); + inner.index = HashMap::default(); + inner.loaded = false; + Ok(()) } @@ -235,22 +212,17 @@ impl Layer for ImageLayer { /// debugging function to print out the contents of the layer fn dump(&self) -> Result<()> { println!( - "----- image layer for ten {} tli {} seg {} at {} ----", - self.tenantid, self.timelineid, self.seg, self.lsn + "----- image layer for ten {} tli {} key {}-{} at {} ----", + self.tenantid, self.timelineid, self.key_range.start, self.key_range.end, self.lsn ); let inner = self.load()?; - match inner.image_type { - ImageType::Blocky { num_blocks } => println!("({}) blocks ", num_blocks), - ImageType::NonBlocky => { - let chapter = inner - .book - .as_ref() - .unwrap() - .read_chapter(NONBLOCKY_IMAGE_CHAPTER)?; - println!("non-blocky ({} bytes)", chapter.len()); - } + let mut index_vec: Vec<(&Key, &u64)> = inner.index.iter().collect(); + index_vec.sort_by_key(|x| x.1); + + for (key, offset) in index_vec { + println!("key: {} offset {}", key, offset); } Ok(()) @@ -279,19 +251,21 @@ impl ImageLayer { // quick exit if already loaded let mut inner = self.inner.lock().unwrap(); - if inner.book.is_some() { + if inner.loaded { return Ok(inner); } let path = self.path(); - let file = VirtualFile::open(&path) - .with_context(|| format!("Failed to open virtual file '{}'", path.display()))?; - let book = Book::new(file).with_context(|| { - format!( - "Failed to open virtual file '{}' as a bookfile", - path.display() - ) - })?; + + // Open the file if it's not open already. + if inner.book.is_none() { + let file = VirtualFile::open(&path) + .with_context(|| format!("Failed to open file '{}'", path.display()))?; + inner.book = Some(Book::new(file).with_context(|| { + format!("Failed to open file '{}' as a bookfile", path.display()) + })?); + } + let book = inner.book.as_ref().unwrap(); match &self.path_or_conf { PathOrConf::Conf(_) => { @@ -318,23 +292,13 @@ impl ImageLayer { } } - let image_type = if self.seg.rel.is_blocky() { - let chapter = book.chapter_reader(BLOCKY_IMAGES_CHAPTER)?; - let images_len = chapter.len(); - ensure!(images_len % BLOCK_SIZE as u64 == 0); - let num_blocks: SegmentBlk = (images_len / BLOCK_SIZE as u64).try_into()?; - ImageType::Blocky { num_blocks } - } else { - let _chapter = book.chapter_reader(NONBLOCKY_IMAGE_CHAPTER)?; - ImageType::NonBlocky - }; + let chapter = book.read_chapter(INDEX_CHAPTER)?; + let index = HashMap::des(&chapter)?; - debug!("loaded from {}", &path.display()); + info!("loaded from {}", &path.display()); - *inner = ImageLayerInner { - book: Some(book), - image_type, - }; + inner.index = index; + inner.loaded = true; Ok(inner) } @@ -350,11 +314,12 @@ impl ImageLayer { path_or_conf: PathOrConf::Conf(conf), timelineid, tenantid, - seg: filename.seg, + key_range: filename.key_range.clone(), lsn: filename.lsn, inner: Mutex::new(ImageLayerInner { book: None, - image_type: ImageType::Blocky { num_blocks: 0 }, + index: HashMap::new(), + loaded: false, }), } } @@ -373,18 +338,19 @@ impl ImageLayer { path_or_conf: PathOrConf::Path(path.to_path_buf()), timelineid: summary.timelineid, tenantid: summary.tenantid, - seg: summary.seg, + key_range: summary.key_range, lsn: summary.lsn, inner: Mutex::new(ImageLayerInner { book: None, - image_type: ImageType::Blocky { num_blocks: 0 }, + index: HashMap::new(), + loaded: false, }), }) } fn layer_name(&self) -> ImageFileName { ImageFileName { - seg: 
self.seg, + key_range: self.key_range.clone(), lsn: self.lsn, } } @@ -413,15 +379,18 @@ impl ImageLayer { /// pub struct ImageLayerWriter { conf: &'static PageServerConf, + path: PathBuf, timelineid: ZTimelineId, tenantid: ZTenantId, - seg: SegmentTag, + key_range: Range, lsn: Lsn, - num_blocks: SegmentBlk, + values_writer: Option>>, + end_offset: u64, - page_image_writer: ChapterWriter>, - num_blocks_written: SegmentBlk, + index: HashMap, + + finished: bool, } impl ImageLayerWriter { @@ -429,9 +398,8 @@ impl ImageLayerWriter { conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid: ZTenantId, - seg: SegmentTag, + key_range: &Range, lsn: Lsn, - num_blocks: SegmentBlk, ) -> Result { // Create the file // @@ -441,70 +409,74 @@ impl ImageLayerWriter { &PathOrConf::Conf(conf), timelineid, tenantid, - &ImageFileName { seg, lsn }, + &ImageFileName { + key_range: key_range.clone(), + lsn, + }, ); + info!("new image layer {}", path.display()); let file = VirtualFile::create(&path)?; let buf_writer = BufWriter::new(file); let book = BookWriter::new(buf_writer, IMAGE_FILE_MAGIC)?; // Open the page-images chapter for writing. The calls to - // `put_page_image` will use this to write the contents. - let chapter = if seg.rel.is_blocky() { - book.new_chapter(BLOCKY_IMAGES_CHAPTER) - } else { - assert_eq!(num_blocks, 1); - book.new_chapter(NONBLOCKY_IMAGE_CHAPTER) - }; + // `put_image` will use this to write the contents. + let chapter = book.new_chapter(VALUES_CHAPTER); let writer = ImageLayerWriter { conf, + path, timelineid, tenantid, - seg, + key_range: key_range.clone(), lsn, - num_blocks, - page_image_writer: chapter, - num_blocks_written: 0, + values_writer: Some(chapter), + index: HashMap::new(), + end_offset: 0, + finished: false, }; Ok(writer) } /// - /// Write next page image to the file. + /// Write next value to the file. /// /// The page versions must be appended in blknum order. /// - pub fn put_page_image(&mut self, block_bytes: &[u8]) -> Result<()> { - assert!(self.num_blocks_written < self.num_blocks); - if self.seg.rel.is_blocky() { - assert_eq!(block_bytes.len(), BLOCK_SIZE); + pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> { + assert!(self.key_range.contains(&key)); + let off = self.end_offset; + + if let Some(writer) = &mut self.values_writer { + let len = utils::write_blob(writer, img)?; + self.end_offset += len; + + let old = self.index.insert(key, off); + assert!(old.is_none()); + } else { + panic!() } - self.page_image_writer.write_all(block_bytes)?; - self.num_blocks_written += 1; + Ok(()) } - pub fn finish(self) -> Result { - // Check that the `put_page_image' was called for every block. 
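For reference, a minimal sketch of the call sequence this writer expects; `conf`, the ids, `key_range`, `lsn` and a `Vec<(Key, Bytes)>` of images are assumed to be in scope, and the variable names are illustrative only:

let mut writer = ImageLayerWriter::new(conf, timelineid, tenantid, &key_range, lsn)?;
for (key, img) in images {
    // Every key must fall inside `key_range`, and may be written only once.
    writer.put_image(key, &img)?;
}
// Seals the values chapter, then writes the index and summary chapters.
// If `finish` is never reached, the Drop impl below removes the file.
let layer = writer.finish()?;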
- assert!(self.num_blocks_written == self.num_blocks); + pub fn finish(&mut self) -> Result { + // Close the values chapter + let book = self.values_writer.take().unwrap().close()?; - // Close the page-images chapter - let book = self.page_image_writer.close()?; + // Write out the index + let mut chapter = book.new_chapter(INDEX_CHAPTER); + let buf = HashMap::ser(&self.index)?; + chapter.write_all(&buf)?; + let book = chapter.close()?; // Write out the summary chapter - let image_type = if self.seg.rel.is_blocky() { - ImageType::Blocky { - num_blocks: self.num_blocks, - } - } else { - ImageType::NonBlocky - }; let mut chapter = book.new_chapter(SUMMARY_CHAPTER); let summary = Summary { tenantid: self.tenantid, timelineid: self.timelineid, - seg: self.seg, + key_range: self.key_range.clone(), lsn: self.lsn, }; Summary::ser_into(&summary, &mut chapter)?; @@ -520,15 +492,31 @@ impl ImageLayerWriter { path_or_conf: PathOrConf::Conf(self.conf), timelineid: self.timelineid, tenantid: self.tenantid, - seg: self.seg, + key_range: self.key_range.clone(), lsn: self.lsn, inner: Mutex::new(ImageLayerInner { book: None, - image_type, + loaded: false, + index: HashMap::new(), }), }; trace!("created image layer {}", layer.path().display()); + self.finished = true; + Ok(layer) } } + +impl Drop for ImageLayerWriter { + fn drop(&mut self) { + if let Some(page_image_writer) = self.values_writer.take() { + if let Ok(book) = page_image_writer.close() { + let _ = book.close(); + } + } + if !self.finished { + let _ = fs::remove_file(&self.path); + } + } +} diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index 6e24bf6022..c3ca2fd091 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -1,30 +1,24 @@ -//! An in-memory layer stores recently received PageVersions. -//! The page versions are held in a BTreeMap. To avoid OOM errors, the map size is limited -//! and layers can be spilled to disk into ephemeral files. +//! An in-memory layer stores recently received key-value pairs. //! -//! And there's another BTreeMap to track the size of the relation. +//! The "in-memory" part of the name is a bit misleading: the actual page versions are +//! held in an ephemeral file, not in memory. The metadata for each page version, i.e. +//! its position in the file, is kept in memory, though. //! 
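The module comment above pins down the core layout shared by the ephemeral file and the on-disk layers: values go into an append-only file as length-prefixed blobs, and only a key-to-offset index stays in memory. A self-contained sketch of that format, with std I/O standing in for the crate's `utils::write_blob`/`read_blob` helpers (the u32 length prefix matches the comment; everything else is illustrative):

use std::collections::HashMap;
use std::io::{Read, Seek, SeekFrom, Write};

/// Append one length-prefixed blob; return the offset it starts at.
fn write_blob<F: Write + Seek>(file: &mut F, buf: &[u8]) -> std::io::Result<u64> {
    let off = file.seek(SeekFrom::End(0))?;
    file.write_all(&(buf.len() as u32).to_ne_bytes())?; // u32 length prefix
    file.write_all(buf)?;
    Ok(off)
}

/// Read back the blob that starts at `off`.
fn read_blob<F: Read + Seek>(file: &mut F, off: u64) -> std::io::Result<Vec<u8>> {
    file.seek(SeekFrom::Start(off))?;
    let mut lenbuf = [0u8; 4];
    file.read_exact(&mut lenbuf)?;
    let mut buf = vec![0u8; u32::from_ne_bytes(lenbuf) as usize];
    file.read_exact(&mut buf)?;
    Ok(buf)
}

fn main() -> std::io::Result<()> {
    let mut file = std::io::Cursor::new(Vec::new()); // stands in for the ephemeral file
    let mut index: HashMap<&str, u64> = HashMap::new(); // the in-memory part
    index.insert("key-a", write_blob(&mut file, b"value-a")?);
    assert_eq!(read_blob(&mut file, index["key-a"])?, b"value-a");
    Ok(())
}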
use crate::config::PageServerConf; use crate::layered_repository::delta_layer::{DeltaLayer, DeltaLayerWriter}; use crate::layered_repository::ephemeral_file::EphemeralFile; -use crate::layered_repository::filename::DeltaFileName; -use crate::layered_repository::image_layer::{ImageLayer, ImageLayerWriter}; use crate::layered_repository::storage_layer::{ - Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentBlk, SegmentTag, - RELISH_SEG_SIZE, + Layer, ValueReconstructResult, ValueReconstructState, }; -use crate::layered_repository::LayeredTimeline; -use crate::layered_repository::ZERO_PAGE; -use crate::repository::ZenithWalRecord; +use crate::layered_repository::utils; +use crate::repository::{Key, Value}; use crate::{ZTenantId, ZTimelineId}; -use anyhow::{ensure, Result}; -use bytes::Bytes; +use anyhow::Result; use log::*; -use std::collections::HashMap; -use std::io::Seek; -use std::os::unix::fs::FileExt; +use std::collections::{HashMap, HashSet}; +use std::ops::Range; use std::path::PathBuf; -use std::sync::{Arc, RwLock}; +use std::sync::RwLock; use zenith_utils::bin_ser::BeSer; use zenith_utils::lsn::Lsn; use zenith_utils::vec_map::VecMap; @@ -33,7 +27,6 @@ pub struct InMemoryLayer { conf: &'static PageServerConf, tenantid: ZTenantId, timelineid: ZTimelineId, - seg: SegmentTag, /// /// This layer contains all the changes from 'start_lsn'. The @@ -42,7 +35,7 @@ pub struct InMemoryLayer { start_lsn: Lsn, /// - /// LSN of the oldest page version stored in this layer. + /// LSN of the oldest value stored in this layer. /// /// This is different from 'start_lsn' in that we enforce that the 'start_lsn' /// of a layer always matches the 'end_lsn' of its predecessor, even if there @@ -59,9 +52,6 @@ pub struct InMemoryLayer { /// The above fields never change. The parts that do change are in 'inner', /// and protected by mutex. inner: RwLock, - - /// Predecessor layer might be needed? - incremental: bool, } pub struct InMemoryLayerInner { @@ -69,98 +59,25 @@ pub struct InMemoryLayerInner { /// Writes are only allowed when this is None end_lsn: Option, - /// If this relation was dropped, remember when that happened. - /// The drop LSN is recorded in [`end_lsn`]. - dropped: bool, + /// + /// All versions of all pages in the layer are kept here. Indexed + /// by block number and LSN. The value is an offset into the + /// ephemeral file where the page version is stored. + /// + index: HashMap>, - /// The PageVersion structs are stored in a serialized format in this file. - /// Each serialized PageVersion is preceded by a 'u32' length field. - /// 'page_versions' map stores offsets into this file. + /// The values are stored in a serialized format in this file. + /// Each serialized Value is preceded by a 'u32' length field. + /// PerSeg::page_versions map stores offsets into this file. file: EphemeralFile, - /// Metadata about all versions of all pages in the layer is kept - /// here. Indexed by block number and LSN. The value is an offset - /// into the ephemeral file where the page version is stored. - page_versions: HashMap>, - - /// - /// `seg_sizes` tracks the size of the segment at different points in time. - /// - /// For a blocky rel, there is always one entry, at the layer's start_lsn, - /// so that determining the size never depends on the predecessor layer. For - /// a non-blocky rel, 'seg_sizes' is not used and is always empty. - /// - seg_sizes: VecMap, - - /// - /// LSN of the newest page version stored in this layer. 
- /// - /// The difference between 'end_lsn' and 'latest_lsn' is the same as between - /// 'start_lsn' and 'oldest_lsn'. See comments in 'oldest_lsn'. - /// - latest_lsn: Lsn, + end_offset: u64, } impl InMemoryLayerInner { fn assert_writeable(&self) { assert!(self.end_lsn.is_none()); } - - fn get_seg_size(&self, lsn: Lsn) -> SegmentBlk { - // Scan the BTreeMap backwards, starting from the given entry. - let slice = self.seg_sizes.slice_range(..=lsn); - - // We make sure there is always at least one entry - if let Some((_entry_lsn, entry)) = slice.last() { - *entry - } else { - panic!("could not find seg size in in-memory layer"); - } - } - - /// - /// Read a page version from the ephemeral file. - /// - fn read_pv(&self, off: u64) -> Result { - let mut buf = Vec::new(); - self.read_pv_bytes(off, &mut buf)?; - Ok(PageVersion::des(&buf)?) - } - - /// - /// Read a page version from the ephemeral file, as raw bytes, at - /// the given offset. The bytes are read into 'buf', which is - /// expanded if necessary. Returns the size of the page version. - /// - fn read_pv_bytes(&self, off: u64, buf: &mut Vec) -> Result { - // read length - let mut lenbuf = [0u8; 4]; - self.file.read_exact_at(&mut lenbuf, off)?; - let len = u32::from_ne_bytes(lenbuf) as usize; - - if buf.len() < len { - buf.resize(len, 0); - } - self.file.read_exact_at(&mut buf[0..len], off + 4)?; - Ok(len) - } - - fn write_pv(&mut self, pv: &PageVersion) -> Result { - // remember starting position - let pos = self.file.stream_position()?; - - // make room for the 'length' field by writing zeros as a placeholder. - self.file.seek(std::io::SeekFrom::Start(pos + 4)).unwrap(); - - pv.ser_into(&mut self.file).unwrap(); - - // write the 'length' field. - let len = self.file.stream_position()? - pos - 4; - let lenbuf = u32::to_ne_bytes(len as u32); - self.file.write_all_at(&lenbuf, pos)?; - - Ok(pos) - } } impl Layer for InMemoryLayer { @@ -170,21 +87,12 @@ impl Layer for InMemoryLayer { fn filename(&self) -> PathBuf { let inner = self.inner.read().unwrap(); - let end_lsn = if let Some(drop_lsn) = inner.end_lsn { - drop_lsn - } else { - Lsn(u64::MAX) - }; + let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX)); - let delta_filename = DeltaFileName { - seg: self.seg, - start_lsn: self.start_lsn, - end_lsn, - dropped: inner.dropped, - } - .to_string(); - - PathBuf::from(format!("inmem-{}", delta_filename)) + PathBuf::from(format!( + "inmem-{:016X}-{:016X}", + self.start_lsn.0, end_lsn.0 + )) } fn get_tenant_id(&self) -> ZTenantId { @@ -195,132 +103,85 @@ impl Layer for InMemoryLayer { self.timelineid } - fn get_seg_tag(&self) -> SegmentTag { - self.seg + fn get_key_range(&self) -> Range { + Key::MIN..Key::MAX } - fn get_start_lsn(&self) -> Lsn { - self.start_lsn - } - - fn get_end_lsn(&self) -> Lsn { + fn get_lsn_range(&self) -> Range { let inner = self.inner.read().unwrap(); - if let Some(end_lsn) = inner.end_lsn { + let end_lsn = if let Some(end_lsn) = inner.end_lsn { end_lsn } else { Lsn(u64::MAX) - } + }; + self.start_lsn..end_lsn } - fn is_dropped(&self) -> bool { - let inner = self.inner.read().unwrap(); - inner.dropped - } - - /// Look up given page in the cache. - fn get_page_reconstruct_data( + /// Look up given value in the layer. 
+ fn get_value_reconstruct_data( &self, - blknum: SegmentBlk, - lsn: Lsn, - reconstruct_data: &mut PageReconstructData, - ) -> Result { + lsn_floor: Lsn, + reconstruct_state: &mut ValueReconstructState, + ) -> Result { + assert!(lsn_floor <= self.start_lsn); let mut need_image = true; - assert!((0..RELISH_SEG_SIZE).contains(&blknum)); + let inner = self.inner.read().unwrap(); - { - let inner = self.inner.read().unwrap(); - - // Scan the page versions backwards, starting from `lsn`. - if let Some(vec_map) = inner.page_versions.get(&blknum) { - let slice = vec_map.slice_range(..=lsn); - for (entry_lsn, pos) in slice.iter().rev() { - match &reconstruct_data.page_img { - Some((cached_lsn, _)) if entry_lsn <= cached_lsn => { - return Ok(PageReconstructResult::Complete) - } - _ => {} + // Scan the page versions backwards, starting from `lsn`. + if let Some(vec_map) = inner.index.get(&reconstruct_state.key) { + let slice = vec_map.slice_range(lsn_floor..=reconstruct_state.lsn); + for (entry_lsn, pos) in slice.iter().rev() { + match &reconstruct_state.img { + Some((cached_lsn, _)) if entry_lsn <= cached_lsn => { + return Ok(ValueReconstructResult::Complete) } + _ => {} + } - let pv = inner.read_pv(*pos)?; - match pv { - PageVersion::Page(img) => { - reconstruct_data.page_img = Some((*entry_lsn, img)); + let value = Value::des(&utils::read_blob(&inner.file, *pos)?)?; + match value { + Value::Image(img) => { + reconstruct_state.img = Some((*entry_lsn, img)); + + reconstruct_state.lsn = *entry_lsn; + return Ok(ValueReconstructResult::Complete); + } + Value::WalRecord(rec) => { + let will_init = rec.will_init(); + reconstruct_state.records.push((*entry_lsn, rec)); + if will_init { + // This WAL record initializes the page, so no need to go further back need_image = false; break; } - PageVersion::Wal(rec) => { - reconstruct_data.records.push((*entry_lsn, rec.clone())); - if rec.will_init() { - // This WAL record initializes the page, so no need to go further back - need_image = false; - break; - } - } } } } - - // If we didn't find any records for this, check if the request is beyond EOF - if need_image - && reconstruct_data.records.is_empty() - && self.seg.rel.is_blocky() - && blknum >= self.get_seg_size(lsn)? - { - return Ok(PageReconstructResult::Missing(self.start_lsn)); - } - - // release lock on 'inner' } + // release lock on 'inner' + // If an older page image is needed to reconstruct the page, let the - // caller know + // caller know. if need_image { - if self.incremental { - Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1))) - } else { - Ok(PageReconstructResult::Missing(self.start_lsn)) - } + reconstruct_state.lsn = Lsn(self.start_lsn.0 - 1); + Ok(ValueReconstructResult::Continue) } else { - Ok(PageReconstructResult::Complete) + Ok(ValueReconstructResult::Complete) } } - /// Get size of the relation at given LSN - fn get_seg_size(&self, lsn: Lsn) -> Result { - assert!(lsn >= self.start_lsn); - ensure!( - self.seg.rel.is_blocky(), - "get_seg_size() called on a non-blocky rel" - ); - + fn collect_keys(&self, key_range: &Range, keys: &mut HashSet) -> Result<()> { let inner = self.inner.read().unwrap(); - Ok(inner.get_seg_size(lsn)) + + keys.extend(inner.index.keys().filter(|x| key_range.contains(x))); + Ok(()) } - /// Does this segment exist at given LSN? - fn get_seg_exists(&self, lsn: Lsn) -> Result { - let inner = self.inner.read().unwrap(); - - // If the segment created after requested LSN, - // it doesn't exist in the layer. 
But we shouldn't - // have requested it in the first place. - assert!(lsn >= self.start_lsn); - - // Is the requested LSN after the segment was dropped? - if inner.dropped { - if let Some(end_lsn) = inner.end_lsn { - if lsn >= end_lsn { - return Ok(false); - } - } else { - panic!("dropped in-memory layer with no end LSN"); - } - } - - // Otherwise, it exists - Ok(true) + fn iter(&self) -> Box>> { + todo!(); } /// Cannot unload anything in an in-memory layer, since there's no backing @@ -337,7 +198,8 @@ impl Layer for InMemoryLayer { } fn is_incremental(&self) -> bool { - self.incremental + // in-memory layer is always considered incremental. + true } fn is_in_memory(&self) -> bool { @@ -355,53 +217,39 @@ impl Layer for InMemoryLayer { .unwrap_or_default(); println!( - "----- in-memory layer for tli {} seg {} {}-{} {} ----", - self.timelineid, self.seg, self.start_lsn, end_str, inner.dropped, + "----- in-memory layer for tli {} LSNs {}-{} ----", + self.timelineid, + self.start_lsn, + end_str, + //inner.dropped, ); - for (k, v) in inner.seg_sizes.as_slice() { - println!("seg_sizes {}: {}", k, v); - } + // FIXME + /* + for (blknum, versions) in page_versions { + for (lsn, off) in versions.as_slice() { + let pv = inner.read_pv(*off); + let pv_description = match pv { + Ok(PageVersion::Page(_img)) => "page", + Ok(PageVersion::Wal(_rec)) => "wal", + Err(_err) => "INVALID", + }; - // List the blocks in order - let mut page_versions: Vec<(&SegmentBlk, &VecMap)> = - inner.page_versions.iter().collect(); - page_versions.sort_by_key(|k| k.0); - - for (blknum, versions) in page_versions { - for (lsn, off) in versions.as_slice() { - let pv = inner.read_pv(*off); - let pv_description = match pv { - Ok(PageVersion::Page(_img)) => "page", - Ok(PageVersion::Wal(_rec)) => "wal", - Err(_err) => "INVALID", - }; - - println!("blk {} at {}: {}\n", blknum, lsn, pv_description); - } - } + println!("blk {} at {}: {}\n", blknum, lsn, pv_description); + } + } + */ Ok(()) } } -/// A result of an inmemory layer data being written to disk. -pub struct LayersOnDisk { - pub delta_layers: Vec, - pub image_layers: Vec, -} - impl InMemoryLayer { /// Return the oldest page version that's stored in this layer pub fn get_oldest_lsn(&self) -> Lsn { self.oldest_lsn } - pub fn get_latest_lsn(&self) -> Lsn { - let inner = self.inner.read().unwrap(); - inner.latest_lsn - } - /// /// Create a new, empty, in-memory layer /// @@ -409,268 +257,88 @@ impl InMemoryLayer { conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid: ZTenantId, - seg: SegmentTag, start_lsn: Lsn, oldest_lsn: Lsn, ) -> Result { trace!( - "initializing new empty InMemoryLayer for writing {} on timeline {} at {}", - seg, + "initializing new empty InMemoryLayer for writing on timeline {} at {}", timelineid, start_lsn ); - // The segment is initially empty, so initialize 'seg_sizes' with 0. 
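The lookup above encodes the reconstruction rule: walk the versions backwards from the requested LSN, collecting WAL records until a full image or a record with will_init turns up; if neither does, the caller must continue into an older layer. The same rule in a self-contained form (the Value enum mirrors the one used here, with will_init reduced to a bool and records represented by their LSNs):

enum Value {
    Image(Vec<u8>),
    WalRecord { will_init: bool },
}

/// Returns (base image, record LSNs oldest-first), or None if an older
/// layer is still needed to finish the reconstruction.
fn reconstruct(versions: &[(u64, Value)], lsn: u64) -> Option<(Option<Vec<u8>>, Vec<u64>)> {
    let mut records = Vec::new();
    for (entry_lsn, value) in versions.iter().rev().filter(|(l, _)| *l <= lsn) {
        match value {
            Value::Image(img) => {
                records.reverse();
                return Some((Some(img.clone()), records)); // complete
            }
            Value::WalRecord { will_init } => {
                records.push(*entry_lsn);
                if *will_init {
                    // This record initializes the page: no need to go further back.
                    records.reverse();
                    return Some((None, records));
                }
            }
        }
    }
    // Like ValueReconstructResult::Continue; a real implementation would
    // also hand the records collected so far to the next layer's lookup.
    None
}

fn main() {
    let versions = vec![
        (10, Value::Image(vec![0u8; 8192])),
        (15, Value::WalRecord { will_init: false }),
        (20, Value::WalRecord { will_init: false }),
    ];
    // At LSN 18, the image at 10 plus the record at 15 are needed.
    assert_eq!(reconstruct(&versions, 18).unwrap().1, vec![15]);
}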
- let mut seg_sizes = VecMap::default(); - if seg.rel.is_blocky() { - seg_sizes.append(start_lsn, 0).unwrap(); - } - let file = EphemeralFile::create(conf, tenantid, timelineid)?; Ok(InMemoryLayer { conf, timelineid, tenantid, - seg, start_lsn, oldest_lsn, - incremental: false, inner: RwLock::new(InMemoryLayerInner { end_lsn: None, - dropped: false, + index: HashMap::new(), file, - page_versions: HashMap::new(), - seg_sizes, - latest_lsn: oldest_lsn, + end_offset: 0, }), }) } // Write operations - /// Remember new page version, as a WAL record over previous version - pub fn put_wal_record( - &self, - lsn: Lsn, - blknum: SegmentBlk, - rec: ZenithWalRecord, - ) -> Result { - self.put_page_version(blknum, lsn, PageVersion::Wal(rec)) - } - - /// Remember new page version, as a full page image - pub fn put_page_image(&self, blknum: SegmentBlk, lsn: Lsn, img: Bytes) -> Result { - self.put_page_version(blknum, lsn, PageVersion::Page(img)) - } - /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree - pub fn put_page_version(&self, blknum: SegmentBlk, lsn: Lsn, pv: PageVersion) -> Result { - assert!((0..RELISH_SEG_SIZE).contains(&blknum)); - - trace!( - "put_page_version blk {} of {} at {}/{}", - blknum, - self.seg.rel, - self.timelineid, - lsn - ); + pub fn put_value(&self, key: Key, lsn: Lsn, val: Value) -> Result<()> { + trace!("put_value key {} at {}/{}", key, self.timelineid, lsn); let mut inner = self.inner.write().unwrap(); inner.assert_writeable(); - assert!(lsn >= inner.latest_lsn); - inner.latest_lsn = lsn; - // Write the page version to the file, and remember its offset in 'page_versions' - { - let off = inner.write_pv(&pv)?; - let vec_map = inner.page_versions.entry(blknum).or_default(); - let old = vec_map.append_or_update_last(lsn, off).unwrap().0; - if old.is_some() { - // We already had an entry for this LSN. That's odd.. - warn!( - "Page version of rel {} blk {} at {} already exists", - self.seg.rel, blknum, lsn - ); - } - } - - // Also update the relation size, if this extended the relation. - if self.seg.rel.is_blocky() { - let newsize = blknum + 1; - - // use inner get_seg_size, since calling self.get_seg_size will try to acquire the lock, - // which we've just acquired above - let oldsize = inner.get_seg_size(lsn); - if newsize > oldsize { - trace!( - "enlarging segment {} from {} to {} blocks at {}", - self.seg, - oldsize, - newsize, - lsn - ); - - // If we are extending the relation by more than one page, initialize the "gap" - // with zeros - // - // XXX: What if the caller initializes the gap with subsequent call with same LSN? - // I don't think that can happen currently, but that is highly dependent on how - // PostgreSQL writes its WAL records and there's no guarantee of it. If it does - // happen, we would hit the "page version already exists" warning above on the - // subsequent call to initialize the gap page. 
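put_value above appends the serialized value to the ephemeral file and then records (lsn, offset) in the per-key version map, warning when an entry for that LSN already exists. A toy version of that index, with a plain Vec standing in for the crate's VecMap (which enforces the same ascending-LSN invariant):

use std::collections::HashMap;

type Key = u32; // stand-in for the crate's Key type
type Lsn = u64;

#[derive(Default)]
struct VersionIndex {
    /// key -> (lsn, file offset), ascending by LSN within each key
    index: HashMap<Key, Vec<(Lsn, u64)>>,
}

impl VersionIndex {
    /// Record where the value for (key, lsn) was written.
    /// Returns false for a duplicate LSN, mirroring the warning above.
    fn append(&mut self, key: Key, lsn: Lsn, off: u64) -> bool {
        let versions = self.index.entry(key).or_default();
        if let Some((last, _)) = versions.last() {
            if *last == lsn {
                return false; // we already had an entry for this LSN
            }
            assert!(*last < lsn, "values must be appended in LSN order");
        }
        versions.push((lsn, off));
        true
    }
}

fn main() {
    let mut idx = VersionIndex::default();
    assert!(idx.append(7, 100, 0));
    assert!(idx.append(7, 200, 42));
    assert!(!idx.append(7, 200, 84)); // duplicate LSN is flagged, not stored
}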
- for gapblknum in oldsize..blknum { - let zeropv = PageVersion::Page(ZERO_PAGE.clone()); - trace!( - "filling gap blk {} with zeros for write of {}", - gapblknum, - blknum - ); - - // Write the page version to the file, and remember its offset in - // 'page_versions' - { - let off = inner.write_pv(&zeropv)?; - let vec_map = inner.page_versions.entry(gapblknum).or_default(); - let old = vec_map.append_or_update_last(lsn, off).unwrap().0; - if old.is_some() { - warn!( - "Page version of seg {} blk {} at {} already exists", - self.seg, gapblknum, lsn - ); - } - } - } - - inner.seg_sizes.append_or_update_last(lsn, newsize).unwrap(); - return Ok(newsize - oldsize); - } - } - - Ok(0) - } - - /// Remember that the relation was truncated at given LSN - pub fn put_truncation(&self, lsn: Lsn, new_size: SegmentBlk) { - assert!( - self.seg.rel.is_blocky(), - "put_truncation() called on a non-blocky rel" - ); - - let mut inner = self.inner.write().unwrap(); - inner.assert_writeable(); - - // check that this we truncate to a smaller size than segment was before the truncation - let old_size = inner.get_seg_size(lsn); - assert!(new_size < old_size); - - let (old, _delta_size) = inner - .seg_sizes - .append_or_update_last(lsn, new_size) - .unwrap(); + let off = inner.end_offset; + let len = utils::write_blob(&mut inner.file, &Value::ser(&val)?)?; + inner.end_offset += len; + let vec_map = inner.index.entry(key).or_default(); + let old = vec_map.append_or_update_last(lsn, off).unwrap().0; if old.is_some() { // We already had an entry for this LSN. That's odd.. - warn!("Inserting truncation, but had an entry for the LSN already"); - } - } - - /// Remember that the segment was dropped at given LSN - pub fn drop_segment(&self, lsn: Lsn) { - let mut inner = self.inner.write().unwrap(); - - assert!(inner.end_lsn.is_none()); - assert!(!inner.dropped); - inner.dropped = true; - assert!(self.start_lsn < lsn); - inner.end_lsn = Some(lsn); - - trace!("dropped segment {} at {}", self.seg, lsn); - } - - /// - /// Initialize a new InMemoryLayer for, by copying the state at the given - /// point in time from given existing layer. - /// - pub fn create_successor_layer( - conf: &'static PageServerConf, - src: Arc, - timelineid: ZTimelineId, - tenantid: ZTenantId, - start_lsn: Lsn, - oldest_lsn: Lsn, - ) -> Result { - let seg = src.get_seg_tag(); - - assert!(oldest_lsn.is_aligned()); - - trace!( - "initializing new InMemoryLayer for writing {} on timeline {} at {}", - seg, - timelineid, - start_lsn, - ); - - // Copy the segment size at the start LSN from the predecessor layer. - let mut seg_sizes = VecMap::default(); - if seg.rel.is_blocky() { - let size = src.get_seg_size(start_lsn)?; - seg_sizes.append(start_lsn, size).unwrap(); + warn!("Key {} at {} already exists", key, lsn); } - let file = EphemeralFile::create(conf, tenantid, timelineid)?; - - Ok(InMemoryLayer { - conf, - timelineid, - tenantid, - seg, - start_lsn, - oldest_lsn, - incremental: true, - inner: RwLock::new(InMemoryLayerInner { - end_lsn: None, - dropped: false, - file, - page_versions: HashMap::new(), - seg_sizes, - latest_lsn: oldest_lsn, - }), - }) + Ok(()) } - pub fn is_writeable(&self) -> bool { - let inner = self.inner.read().unwrap(); - inner.end_lsn.is_none() + pub fn put_tombstone(&self, _key_range: Range, _lsn: Lsn) -> Result<()> { + // TODO: Currently, we just leak the storage for any deleted keys + + Ok(()) } /// Make the layer non-writeable. Only call once. /// Records the end_lsn for non-dropped layers. 
- /// `end_lsn` is inclusive + /// `end_lsn` is exclusive pub fn freeze(&self, end_lsn: Lsn) { let mut inner = self.inner.write().unwrap(); - if inner.end_lsn.is_some() { - assert!(inner.dropped); - } else { - assert!(!inner.dropped); - assert!(self.start_lsn < end_lsn + 1); - inner.end_lsn = Some(Lsn(end_lsn.0 + 1)); + assert!(self.start_lsn < end_lsn); + inner.end_lsn = Some(end_lsn); - if let Some((lsn, _)) = inner.seg_sizes.as_slice().last() { - assert!(lsn <= &end_lsn, "{:?} {:?}", lsn, end_lsn); - } + // FIXME + /* + for perseg in inner.segs.values() { + if let Some((lsn, _)) = perseg.seg_sizes.as_slice().last() { + assert!(lsn < &end_lsn, "{:?} {:?}", lsn, end_lsn); + } - for (_blk, vec_map) in inner.page_versions.iter() { - for (lsn, _pos) in vec_map.as_slice() { - assert!(*lsn <= end_lsn); + for (_blk, vec_map) in perseg.page_versions.iter() { + for (lsn, _pos) in vec_map.as_slice() { + assert!(*lsn < end_lsn); + } + } } - } - } + */ } - /// Write the this frozen in-memory layer to disk. + /// Write this frozen in-memory layer to disk. /// /// Returns new layers that replace this one. /// If not dropped and reconstruct_pages is true, returns a new image layer containing the page versions @@ -678,17 +346,7 @@ impl InMemoryLayer { /// WAL records between start and end LSN. (The delta layer is not needed /// when a new relish is created with a single LSN, so that the start and /// end LSN are the same.) - pub fn write_to_disk( - &self, - timeline: &LayeredTimeline, - reconstruct_pages: bool, - ) -> Result { - trace!( - "write_to_disk {} get_end_lsn is {}", - self.filename().display(), - self.get_end_lsn() - ); - + pub fn write_to_disk(&self) -> Result { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the // write lock on it, so we shouldn't block anyone. There's one exception @@ -700,105 +358,30 @@ impl InMemoryLayer { // rare though, so we just accept the potential latency hit for now. let inner = self.inner.read().unwrap(); - // Since `end_lsn` is exclusive, subtract 1 to calculate the last LSN - // that is included. - let end_lsn_exclusive = inner.end_lsn.unwrap(); - let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1); + let mut delta_layer_writer = DeltaLayerWriter::new( + self.conf, + self.timelineid, + self.tenantid, + Key::MIN, + self.start_lsn..inner.end_lsn.unwrap(), + )?; - // Figure out if we should create a delta layer, image layer, or both. - let image_lsn: Option; - let delta_end_lsn: Option; - if self.is_dropped() || !reconstruct_pages { - // The segment was dropped. Create just a delta layer containing all the - // changes up to and including the drop. - delta_end_lsn = Some(end_lsn_exclusive); - image_lsn = None; - } else if self.start_lsn == end_lsn_inclusive { - // The layer contains exactly one LSN. It's enough to write an image - // layer at that LSN. - delta_end_lsn = None; - image_lsn = Some(end_lsn_inclusive); - } else { - // Create a delta layer with all the changes up to the end LSN, - // and an image layer at the end LSN. - // - // Note that we the delta layer does *not* include the page versions - // at the end LSN. They are included in the image layer, and there's - // no need to store them twice. 
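The lifecycle enforced here is worth spelling out: a layer is born open and writeable, freeze(end_lsn) seals it exactly once with an exclusive end bound, and only a frozen layer may be written out to disk. A compressed sketch of that state machine (names and types are illustrative, not the crate's):

enum LayerState {
    /// Accepting writes; the end LSN is not known yet.
    Open,
    /// Read-only; the layer covers start_lsn..end_lsn, end exclusive.
    Frozen { end_lsn: u64 },
}

struct Layer {
    start_lsn: u64,
    state: LayerState,
}

impl Layer {
    fn freeze(&mut self, end_lsn: u64) {
        assert!(matches!(self.state, LayerState::Open), "freeze may only be called once");
        assert!(self.start_lsn < end_lsn); // every layer covers at least one LSN
        self.state = LayerState::Frozen { end_lsn };
    }
}

fn main() {
    let mut layer = Layer { start_lsn: 100, state: LayerState::Open };
    layer.freeze(250); // now covers 100..250 and is read-only
    if let LayerState::Frozen { end_lsn } = layer.state {
        assert_eq!(end_lsn, 250);
    }
}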
- delta_end_lsn = Some(end_lsn_inclusive); - image_lsn = Some(end_lsn_inclusive); - } - - let mut delta_layers = Vec::new(); - let mut image_layers = Vec::new(); - - if let Some(delta_end_lsn) = delta_end_lsn { - let mut delta_layer_writer = DeltaLayerWriter::new( - self.conf, - self.timelineid, - self.tenantid, - self.seg, - self.start_lsn, - delta_end_lsn, - self.is_dropped(), - )?; - - // Write all page versions, in block + LSN order - let mut buf: Vec = Vec::new(); - - let pv_iter = inner.page_versions.iter(); - let mut pages: Vec<(&SegmentBlk, &VecMap)> = pv_iter.collect(); - pages.sort_by_key(|(blknum, _vec_map)| *blknum); - for (blknum, vec_map) in pages { + let mut do_steps = || -> Result<()> { + for (key, vec_map) in inner.index.iter() { + // Write all page versions for (lsn, pos) in vec_map.as_slice() { - if *lsn < delta_end_lsn { - let len = inner.read_pv_bytes(*pos, &mut buf)?; - delta_layer_writer.put_page_version(*blknum, *lsn, &buf[..len])?; - } + let val = Value::des(&utils::read_blob(&inner.file, *pos)?)?; + delta_layer_writer.put_value(*key, *lsn, val)?; } } - - // Create seg_sizes - let seg_sizes = if delta_end_lsn == end_lsn_exclusive { - inner.seg_sizes.clone() - } else { - inner.seg_sizes.split_at(&end_lsn_exclusive).0 - }; - - let delta_layer = delta_layer_writer.finish(seg_sizes)?; - delta_layers.push(delta_layer); + Ok(()) + }; + if let Err(err) = do_steps() { + delta_layer_writer.abort(); + return Err(err); } - drop(inner); - - // Write a new base image layer at the cutoff point - if let Some(image_lsn) = image_lsn { - let size = if self.seg.rel.is_blocky() { - self.get_seg_size(image_lsn)? - } else { - 1 - }; - let mut image_layer_writer = ImageLayerWriter::new( - self.conf, - self.timelineid, - self.tenantid, - self.seg, - image_lsn, - size, - )?; - - for blknum in 0..size { - let img = timeline.materialize_page(self.seg, blknum, image_lsn, &*self)?; - - image_layer_writer.put_page_image(&img)?; - } - let image_layer = image_layer_writer.finish()?; - image_layers.push(image_layer); - } - - Ok(LayersOnDisk { - delta_layers, - image_layers, - }) + let delta_layer = delta_layer_writer.finish(Key::MAX)?; + Ok(delta_layer) } } diff --git a/pageserver/src/layered_repository/interval_tree.rs b/pageserver/src/layered_repository/interval_tree.rs deleted file mode 100644 index 978ecd837e..0000000000 --- a/pageserver/src/layered_repository/interval_tree.rs +++ /dev/null @@ -1,468 +0,0 @@ -/// -/// IntervalTree is data structure for holding intervals. It is generic -/// to make unit testing possible, but the only real user of it is the layer map, -/// -/// It's inspired by the "segment tree" or a "statistic tree" as described in -/// https://en.wikipedia.org/wiki/Segment_tree. However, we use a B-tree to hold -/// the points instead of a binary tree. This is called an "interval tree" instead -/// of "segment tree" because the term "segment" is already using Zenith to mean -/// something else. To add to the confusion, there is another data structure known -/// as "interval tree" out there (see https://en.wikipedia.org/wiki/Interval_tree), -/// for storing intervals, but this isn't that. -/// -/// The basic idea is to have a B-tree of "interesting Points". At each Point, -/// there is a list of intervals that contain the point. The Points are formed -/// from the start bounds of each interval; there is a Point for each distinct -/// start bound. 
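The do_steps closure above is a small but deliberate pattern: by funneling every fallible step through one closure, a single error-handling site can abort the half-written delta layer before the error propagates. Reduced to a self-contained form (Writer stands in for DeltaLayerWriter):

struct Writer;

impl Writer {
    fn put(&mut self, v: u32) -> Result<(), String> {
        if v == 13 { Err("unlucky value".into()) } else { Ok(()) }
    }
    fn abort(self) { /* remove the partially written file */ }
    fn finish(self) -> Result<(), String> { Ok(()) }
}

fn write_all(mut w: Writer, vals: &[u32]) -> Result<(), String> {
    // Every `?` early-return inside the closure still passes through the
    // abort/finish decision below exactly once.
    let mut do_steps = || -> Result<(), String> {
        for v in vals {
            w.put(*v)?;
        }
        Ok(())
    };
    if let Err(err) = do_steps() {
        w.abort();
        return Err(err);
    }
    w.finish()
}

fn main() {
    assert!(write_all(Writer, &[1, 2, 3]).is_ok());
    assert!(write_all(Writer, &[1, 13]).is_err()); // aborted, not finished
}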
-/// -/// Operations: -/// -/// To find intervals that contain a given point, you search the b-tree to find -/// the nearest Point <= search key. Then you just return the list of intervals. -/// -/// To insert an interval, find the Point with start key equal to the inserted item. -/// If the Point doesn't exist yet, create it, by copying all the items from the -/// previous Point that cover the new Point. Then walk right, inserting the new -/// interval to all the Points that are contained by the new interval (including the -/// newly created Point). -/// -/// To remove an interval, you scan the tree for all the Points that are contained by -/// the removed interval, and remove it from the list in each Point. -/// -/// Requirements and assumptions: -/// -/// - Can store overlapping items -/// - But there are not many overlapping items -/// - The interval bounds don't change after it is added to the tree -/// - Intervals are uniquely identified by pointer equality. You must not be insert the -/// same interval object twice, and `remove` uses pointer equality to remove the right -/// interval. It is OK to have two intervals with the same bounds, however. -/// -use std::collections::BTreeMap; -use std::fmt::Debug; -use std::ops::Range; -use std::sync::Arc; - -pub struct IntervalTree -where - I: IntervalItem, -{ - points: BTreeMap>, -} - -struct Point { - /// All intervals that contain this point, in no particular order. - /// - /// We assume that there aren't a lot of overlappingg intervals, so that this vector - /// never grows very large. If that assumption doesn't hold, we could keep this ordered - /// by the end bound, to speed up `search`. But as long as there are only a few elements, - /// a linear search is OK. - elements: Vec>, -} - -/// Abstraction for an interval that can be stored in the tree -/// -/// The start bound is inclusive and the end bound is exclusive. End must be greater -/// than start. -pub trait IntervalItem { - type Key: Ord + Copy + Debug + Sized; - - fn start_key(&self) -> Self::Key; - fn end_key(&self) -> Self::Key; - - fn bounds(&self) -> Range { - self.start_key()..self.end_key() - } -} - -impl IntervalTree -where - I: IntervalItem, -{ - /// Return an element that contains 'key', or precedes it. - /// - /// If there are multiple candidates, returns the one with the highest 'end' key. - pub fn search(&self, key: I::Key) -> Option> { - // Find the greatest point that precedes or is equal to the search key. If there is - // none, returns None. 
- let (_, p) = self.points.range(..=key).next_back()?; - - // Find the element with the highest end key at this point - let highest_item = p - .elements - .iter() - .reduce(|a, b| { - // starting with Rust 1.53, could use `std::cmp::min_by_key` here - if a.end_key() > b.end_key() { - a - } else { - b - } - }) - .unwrap(); - Some(Arc::clone(highest_item)) - } - - /// Iterate over all items with start bound >= 'key' - pub fn iter_newer(&self, key: I::Key) -> IntervalIter { - IntervalIter { - point_iter: self.points.range(key..), - elem_iter: None, - } - } - - /// Iterate over all items - pub fn iter(&self) -> IntervalIter { - IntervalIter { - point_iter: self.points.range(..), - elem_iter: None, - } - } - - pub fn insert(&mut self, item: Arc) { - let start_key = item.start_key(); - let end_key = item.end_key(); - assert!(start_key < end_key); - let bounds = start_key..end_key; - - // Find the starting point and walk forward from there - let mut found_start_point = false; - let iter = self.points.range_mut(bounds); - for (point_key, point) in iter { - if *point_key == start_key { - found_start_point = true; - // It is an error to insert the same item to the tree twice. - assert!( - !point.elements.iter().any(|x| Arc::ptr_eq(x, &item)), - "interval is already in the tree" - ); - } - point.elements.push(Arc::clone(&item)); - } - if !found_start_point { - // Create a new Point for the starting point - - // Look at the previous point, and copy over elements that overlap with this - // new point - let mut new_elements: Vec> = Vec::new(); - if let Some((_, prev_point)) = self.points.range(..start_key).next_back() { - let overlapping_prev_elements = prev_point - .elements - .iter() - .filter(|x| x.bounds().contains(&start_key)) - .cloned(); - - new_elements.extend(overlapping_prev_elements); - } - new_elements.push(item); - - let new_point = Point { - elements: new_elements, - }; - self.points.insert(start_key, new_point); - } - } - - pub fn remove(&mut self, item: &Arc) { - // range search points - let start_key = item.start_key(); - let end_key = item.end_key(); - let bounds = start_key..end_key; - - let mut points_to_remove: Vec = Vec::new(); - let mut found_start_point = false; - for (point_key, point) in self.points.range_mut(bounds) { - if *point_key == start_key { - found_start_point = true; - } - let len_before = point.elements.len(); - point.elements.retain(|other| !Arc::ptr_eq(other, item)); - let len_after = point.elements.len(); - assert_eq!(len_after + 1, len_before); - if len_after == 0 { - points_to_remove.push(*point_key); - } - } - assert!(found_start_point); - - for k in points_to_remove { - self.points.remove(&k).unwrap(); - } - } -} - -pub struct IntervalIter<'a, I: ?Sized> -where - I: IntervalItem, -{ - point_iter: std::collections::btree_map::Range<'a, I::Key, Point>, - elem_iter: Option<(I::Key, std::slice::Iter<'a, Arc>)>, -} - -impl<'a, I> Iterator for IntervalIter<'a, I> -where - I: IntervalItem + ?Sized, -{ - type Item = Arc; - - fn next(&mut self) -> Option { - // Iterate over all elements in all the points in 'point_iter'. To avoid - // returning the same element twice, we only return each element at its - // starting point. - loop { - // Return next remaining element from the current point - if let Some((point_key, elem_iter)) = &mut self.elem_iter { - for elem in elem_iter { - if elem.start_key() == *point_key { - return Some(Arc::clone(elem)); - } - } - } - // No more elements at this point. Move to next point. 
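A side note on the reduce in search() above: it keeps the element with the greatest end key (ties go to the later element), so the std shorthand it gestures at is max_by_key rather than min_by_key. The equivalent one-liner, assuming the same p.elements slice:

let highest_item = p
    .elements
    .iter()
    .max_by_key(|e| e.end_key())
    .unwrap();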
- if let Some((point_key, point)) = self.point_iter.next() { - self.elem_iter = Some((*point_key, point.elements.iter())); - continue; - } else { - // No more points, all done - return None; - } - } - } -} - -impl Default for IntervalTree -where - I: IntervalItem, -{ - fn default() -> Self { - IntervalTree { - points: BTreeMap::new(), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::fmt; - - #[derive(Debug)] - struct MockItem { - start_key: u32, - end_key: u32, - val: String, - } - impl IntervalItem for MockItem { - type Key = u32; - - fn start_key(&self) -> u32 { - self.start_key - } - fn end_key(&self) -> u32 { - self.end_key - } - } - impl MockItem { - fn new(start_key: u32, end_key: u32) -> Self { - MockItem { - start_key, - end_key, - val: format!("{}-{}", start_key, end_key), - } - } - fn new_str(start_key: u32, end_key: u32, val: &str) -> Self { - MockItem { - start_key, - end_key, - val: format!("{}-{}: {}", start_key, end_key, val), - } - } - } - impl fmt::Display for MockItem { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}", self.val) - } - } - #[rustfmt::skip] - fn assert_search( - tree: &IntervalTree, - key: u32, - expected: &[&str], - ) -> Option> { - if let Some(v) = tree.search(key) { - let vstr = v.to_string(); - - assert!(!expected.is_empty(), "search with {} returned {}, expected None", key, v); - assert!( - expected.contains(&vstr.as_str()), - "search with {} returned {}, expected one of: {:?}", - key, v, expected, - ); - - Some(v) - } else { - assert!( - expected.is_empty(), - "search with {} returned None, expected one of {:?}", - key, expected - ); - None - } - } - - fn assert_contents(tree: &IntervalTree, expected: &[&str]) { - let mut contents: Vec = tree.iter().map(|e| e.to_string()).collect(); - contents.sort(); - assert_eq!(contents, expected); - } - - fn dump_tree(tree: &IntervalTree) { - for (point_key, point) in tree.points.iter() { - print!("{}:", point_key); - for e in point.elements.iter() { - print!(" {}", e); - } - println!(); - } - } - - #[test] - fn test_interval_tree_simple() { - let mut tree: IntervalTree = IntervalTree::default(); - - // Simple, non-overlapping ranges. 
- tree.insert(Arc::new(MockItem::new(10, 11))); - tree.insert(Arc::new(MockItem::new(11, 12))); - tree.insert(Arc::new(MockItem::new(12, 13))); - tree.insert(Arc::new(MockItem::new(18, 19))); - tree.insert(Arc::new(MockItem::new(17, 18))); - tree.insert(Arc::new(MockItem::new(15, 16))); - - assert_search(&tree, 9, &[]); - assert_search(&tree, 10, &["10-11"]); - assert_search(&tree, 11, &["11-12"]); - assert_search(&tree, 12, &["12-13"]); - assert_search(&tree, 13, &["12-13"]); - assert_search(&tree, 14, &["12-13"]); - assert_search(&tree, 15, &["15-16"]); - assert_search(&tree, 16, &["15-16"]); - assert_search(&tree, 17, &["17-18"]); - assert_search(&tree, 18, &["18-19"]); - assert_search(&tree, 19, &["18-19"]); - assert_search(&tree, 20, &["18-19"]); - - // remove a few entries and search around them again - tree.remove(&assert_search(&tree, 10, &["10-11"]).unwrap()); // first entry - tree.remove(&assert_search(&tree, 12, &["12-13"]).unwrap()); // entry in the middle - tree.remove(&assert_search(&tree, 18, &["18-19"]).unwrap()); // last entry - assert_search(&tree, 9, &[]); - assert_search(&tree, 10, &[]); - assert_search(&tree, 11, &["11-12"]); - assert_search(&tree, 12, &["11-12"]); - assert_search(&tree, 14, &["11-12"]); - assert_search(&tree, 15, &["15-16"]); - assert_search(&tree, 17, &["17-18"]); - assert_search(&tree, 18, &["17-18"]); - } - - #[test] - fn test_interval_tree_overlap() { - let mut tree: IntervalTree = IntervalTree::default(); - - // Overlapping items - tree.insert(Arc::new(MockItem::new(22, 24))); - tree.insert(Arc::new(MockItem::new(23, 25))); - let x24_26 = Arc::new(MockItem::new(24, 26)); - tree.insert(Arc::clone(&x24_26)); - let x26_28 = Arc::new(MockItem::new(26, 28)); - tree.insert(Arc::clone(&x26_28)); - tree.insert(Arc::new(MockItem::new(25, 27))); - - assert_search(&tree, 22, &["22-24"]); - assert_search(&tree, 23, &["22-24", "23-25"]); - assert_search(&tree, 24, &["23-25", "24-26"]); - assert_search(&tree, 25, &["24-26", "25-27"]); - assert_search(&tree, 26, &["25-27", "26-28"]); - assert_search(&tree, 27, &["26-28"]); - assert_search(&tree, 28, &["26-28"]); - assert_search(&tree, 29, &["26-28"]); - - tree.remove(&x24_26); - tree.remove(&x26_28); - assert_search(&tree, 23, &["22-24", "23-25"]); - assert_search(&tree, 24, &["23-25"]); - assert_search(&tree, 25, &["25-27"]); - assert_search(&tree, 26, &["25-27"]); - assert_search(&tree, 27, &["25-27"]); - assert_search(&tree, 28, &["25-27"]); - assert_search(&tree, 29, &["25-27"]); - } - - #[test] - fn test_interval_tree_nested() { - let mut tree: IntervalTree = IntervalTree::default(); - - // Items containing other items - tree.insert(Arc::new(MockItem::new(31, 39))); - tree.insert(Arc::new(MockItem::new(32, 34))); - tree.insert(Arc::new(MockItem::new(33, 35))); - tree.insert(Arc::new(MockItem::new(30, 40))); - - assert_search(&tree, 30, &["30-40"]); - assert_search(&tree, 31, &["30-40", "31-39"]); - assert_search(&tree, 32, &["30-40", "32-34", "31-39"]); - assert_search(&tree, 33, &["30-40", "32-34", "33-35", "31-39"]); - assert_search(&tree, 34, &["30-40", "33-35", "31-39"]); - assert_search(&tree, 35, &["30-40", "31-39"]); - assert_search(&tree, 36, &["30-40", "31-39"]); - assert_search(&tree, 37, &["30-40", "31-39"]); - assert_search(&tree, 38, &["30-40", "31-39"]); - assert_search(&tree, 39, &["30-40"]); - assert_search(&tree, 40, &["30-40"]); - assert_search(&tree, 41, &["30-40"]); - } - - #[test] - fn test_interval_tree_duplicates() { - let mut tree: IntervalTree = IntervalTree::default(); - - // 
Duplicate keys - let item_a = Arc::new(MockItem::new_str(55, 56, "a")); - tree.insert(Arc::clone(&item_a)); - let item_b = Arc::new(MockItem::new_str(55, 56, "b")); - tree.insert(Arc::clone(&item_b)); - let item_c = Arc::new(MockItem::new_str(55, 56, "c")); - tree.insert(Arc::clone(&item_c)); - let item_d = Arc::new(MockItem::new_str(54, 56, "d")); - tree.insert(Arc::clone(&item_d)); - let item_e = Arc::new(MockItem::new_str(55, 57, "e")); - tree.insert(Arc::clone(&item_e)); - - dump_tree(&tree); - - assert_search( - &tree, - 55, - &["55-56: a", "55-56: b", "55-56: c", "54-56: d", "55-57: e"], - ); - tree.remove(&item_b); - dump_tree(&tree); - - assert_contents(&tree, &["54-56: d", "55-56: a", "55-56: c", "55-57: e"]); - - tree.remove(&item_d); - dump_tree(&tree); - assert_contents(&tree, &["55-56: a", "55-56: c", "55-57: e"]); - } - - #[test] - #[should_panic] - fn test_interval_tree_insert_twice() { - let mut tree: IntervalTree = IntervalTree::default(); - - // Inserting the same item twice is not cool - let item = Arc::new(MockItem::new(1, 2)); - tree.insert(Arc::clone(&item)); - tree.insert(Arc::clone(&item)); // fails assertion - } -} diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index fe82fd491c..1c8a4c6860 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -3,30 +3,26 @@ //! //! When the timeline is first accessed, the server lists of all layer files //! in the timelines/ directory, and populates this map with -//! ImageLayer and DeltaLayer structs corresponding to each file. When new WAL -//! is received, we create InMemoryLayers to hold the incoming records. Now and -//! then, in the checkpoint() function, the in-memory layers are frozen, forming -//! new image and delta layers and corresponding files are written to disk. +//! ImageLayer and DeltaLayer structs corresponding to each file. When the first +//! new WAL record is received, we create an InMemoryLayer to hold the incoming +//! records. Now and then, in the checkpoint() function, the in-memory layer is +//! are frozen, and it is split up into new image and delta layers and the +//! corresponding files are written to disk. //! -use crate::layered_repository::interval_tree::{IntervalItem, IntervalIter, IntervalTree}; -use crate::layered_repository::storage_layer::{Layer, SegmentTag}; +use crate::layered_repository::storage_layer::range_overlaps; +use crate::layered_repository::storage_layer::Layer; use crate::layered_repository::InMemoryLayer; -use crate::relish::*; +use crate::repository::Key; use anyhow::Result; use lazy_static::lazy_static; -use std::cmp::Ordering; -use std::collections::{BinaryHeap, HashMap}; +use std::ops::Range; use std::sync::Arc; +use tracing::*; use zenith_metrics::{register_int_gauge, IntGauge}; use zenith_utils::lsn::Lsn; -use super::global_layer_map::{LayerId, GLOBAL_LAYER_MAP}; - lazy_static! { - static ref NUM_INMEMORY_LAYERS: IntGauge = - register_int_gauge!("pageserver_inmemory_layers", "Number of layers in memory") - .expect("failed to define a metric"); static ref NUM_ONDISK_LAYERS: IntGauge = register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk") .expect("failed to define a metric"); @@ -37,108 +33,147 @@ lazy_static! { /// #[derive(Default)] pub struct LayerMap { - /// All the layers keyed by segment tag - segs: HashMap, + // + // 'open_layer' holds the current InMemoryLayer that is accepting new + // records. 
If it is None, 'next_open_layer_at' will be set instead, indicating + // where the start LSN of the next InMemoryLayer that is to be created. + // + pub open_layer: Option>, + pub next_open_layer_at: Option, - /// All in-memory layers, ordered by 'oldest_lsn' and generation - /// of each layer. This allows easy access to the in-memory layer that - /// contains the oldest WAL record. - open_layers: BinaryHeap, + /// + /// The frozen layer, if any, contains WAL older than the current 'open_layer' + /// or 'next_open_layer_at', but newer than any historic layer. The frozen + /// layer is during checkpointing, when an InMemoryLayer is being written out + /// to disk. + /// + pub frozen_layer: Option>, - /// Generation number, used to distinguish newly inserted entries in the - /// binary heap from older entries during checkpoint. - current_generation: u64, + /// All the historic layers are kept here + + /// TODO: This is a placeholder implementation of a data structure + /// to hold information about all the layer files on disk and in + /// S3. Currently, it's just a vector and all operations perform a + /// linear scan over it. That obviously becomes slow as the + /// number of layers grows. I'm imagining that an R-tree or some + /// other 2D data structure would be the long-term solution here. + historic_layers: Vec>, +} + +pub struct SearchResult { + pub layer: Arc, + pub lsn_floor: Lsn, } impl LayerMap { - /// - /// Look up a layer using the given segment tag and LSN. This differs from a - /// plain key-value lookup in that if there is any layer that covers the - /// given LSN, or precedes the given LSN, it is returned. In other words, - /// you don't need to know the exact start LSN of the layer. - /// - pub fn get(&self, tag: &SegmentTag, lsn: Lsn) -> Option> { - let segentry = self.segs.get(tag)?; + pub fn search(&self, key: Key, lsn: Lsn) -> Result> { + // linear search + // Find the latest image layer that covers the given key + let mut latest_img: Option> = None; + let mut latest_img_lsn: Option = None; + for l in self.historic_layers.iter() { + if l.is_incremental() { + continue; + } + if !l.get_key_range().contains(&key) { + continue; + } + let img_lsn = l.get_lsn_range().start; - segentry.get(lsn) - } + if img_lsn > lsn { + // too new + continue; + } + if img_lsn == lsn { + // found exact match + return Ok(Some(SearchResult { + layer: Arc::clone(l), + lsn_floor: lsn, + })); + } + if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) { + latest_img = Some(Arc::clone(l)); + latest_img_lsn = Some(img_lsn); + } + } - /// - /// Get the open layer for given segment for writing. Or None if no open - /// layer exists. - /// - pub fn get_open(&self, tag: &SegmentTag) -> Option> { - let segentry = self.segs.get(tag)?; - - segentry - .open_layer_id - .and_then(|layer_id| GLOBAL_LAYER_MAP.read().unwrap().get(&layer_id)) - } - - /// - /// Insert an open in-memory layer - /// - pub fn insert_open(&mut self, layer: Arc) { - let segentry = self.segs.entry(layer.get_seg_tag()).or_default(); - - let layer_id = segentry.update_open(Arc::clone(&layer)); - - let oldest_lsn = layer.get_oldest_lsn(); - - // After a crash and restart, 'oldest_lsn' of the oldest in-memory - // layer becomes the WAL streaming starting point, so it better not point - // in the middle of a WAL record. 
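To show how search() and lsn_floor are meant to be consumed, here is a rough sketch of the read loop that would sit in the timeline code (not part of this patch; it assumes a ValueReconstructState primed with the requested key and LSN, and uses anyhow's bail! as the surrounding crate does):

let mut state = ValueReconstructState { key, lsn, img: None, records: Vec::new() };
loop {
    match layer_map.search(state.key, state.lsn)? {
        Some(SearchResult { layer, lsn_floor }) => {
            // The layer either completes the reconstruction, or lowers
            // state.lsn below lsn_floor so the next search goes further back.
            match layer.get_value_reconstruct_data(lsn_floor, &mut state)? {
                ValueReconstructResult::Complete => break,
                ValueReconstructResult::Continue => continue,
                ValueReconstructResult::Missing => bail!("key {} not found", state.key),
            }
        }
        None => bail!("no layer covers key {} at {}", state.key, state.lsn),
    }
}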
- assert!(oldest_lsn.is_aligned()); - - // Also add it to the binary heap - let open_layer_entry = OpenLayerEntry { - oldest_lsn: layer.get_oldest_lsn(), - layer_id, - generation: self.current_generation, - }; - self.open_layers.push(open_layer_entry); - - NUM_INMEMORY_LAYERS.inc(); - } - - /// Remove an open in-memory layer - pub fn remove_open(&mut self, layer_id: LayerId) { - // Note: we don't try to remove the entry from the binary heap. - // It will be removed lazily by peek_oldest_open() when it's made it to - // the top of the heap. - - let layer_opt = { - let mut global_map = GLOBAL_LAYER_MAP.write().unwrap(); - let layer_opt = global_map.get(&layer_id); - global_map.remove(&layer_id); - // TODO it's bad that a ref can still exist after being evicted from cache - layer_opt - }; - - if let Some(layer) = layer_opt { - let mut segentry = self.segs.get_mut(&layer.get_seg_tag()).unwrap(); - - if segentry.open_layer_id == Some(layer_id) { - // Also remove it from the SegEntry of this segment - segentry.open_layer_id = None; - } else { - // We could have already updated segentry.open for - // dropped (non-writeable) layer. This is fine. - assert!(!layer.is_writeable()); - assert!(layer.is_dropped()); + // Search the delta layers + let mut latest_delta: Option> = None; + for l in self.historic_layers.iter() { + if !l.is_incremental() { + continue; + } + if !l.get_key_range().contains(&key) { + continue; } - NUM_INMEMORY_LAYERS.dec(); + if l.get_lsn_range().start > lsn { + // too new + continue; + } + + if l.get_lsn_range().end > lsn { + // this layer contains the requested point in the key/lsn space. + // No need to search any further + info!( + "found layer {} for request on {} at {}", + l.filename().display(), + key, + lsn + ); + latest_delta.replace(Arc::clone(l)); + break; + } + // this layer's end LSN is smaller than the requested point. If there's + // nothing newer, this is what we need to return. Remember this. + if let Some(ref old_candidate) = latest_delta { + if l.get_lsn_range().end > old_candidate.get_lsn_range().end { + latest_delta.replace(Arc::clone(l)); + } + } else { + latest_delta.replace(Arc::clone(l)); + } } + if let Some(l) = latest_delta { + info!( + "found (old) layer {} for request on {} at {}", + l.filename().display(), + key, + lsn + ); + Ok(Some(SearchResult { + lsn_floor: latest_img_lsn.unwrap_or(l.get_lsn_range().start), + layer: l, + })) + } else if let Some(l) = latest_img { + info!("found img layer and no deltas for request on {} at {}", key, lsn); + Ok(Some(SearchResult { + lsn_floor: latest_img_lsn.unwrap(), + layer: l, + })) + } else { + info!("no layer found for request on {} at {}", key, lsn); + Ok(None) + } + } + + pub fn image_exists(&self, key_range: &Range, lsn: Lsn) -> bool { + for l in self.historic_layers.iter() { + if !l.is_incremental() + && l.get_key_range() == *key_range + && l.get_lsn_range().start == lsn + { + return true; + } + } + false } /// /// Insert an on-disk layer /// pub fn insert_historic(&mut self, layer: Arc) { - let segentry = self.segs.entry(layer.get_seg_tag()).or_default(); - segentry.insert_historic(layer); - + self.historic_layers.push(layer); NUM_ONDISK_LAYERS.inc(); } @@ -147,61 +182,63 @@ impl LayerMap { /// /// This should be called when the corresponding file on disk has been deleted. 
/// + #[allow(dead_code)] pub fn remove_historic(&mut self, layer: Arc) { - let tag = layer.get_seg_tag(); + let len_before = self.historic_layers.len(); - if let Some(segentry) = self.segs.get_mut(&tag) { - segentry.historic.remove(&layer); - } + // FIXME: ptr_eq might fail to return true for 'dyn' + // references. Clippy complains about this. In practice it + // seems to work, the assertion below would be triggered + // otherwise but this ought to be fixed. + #[allow(clippy::vtable_address_comparisons)] + self.historic_layers + .retain(|other| !Arc::ptr_eq(other, &layer)); + + assert_eq!(self.historic_layers.len(), len_before - 1); NUM_ONDISK_LAYERS.dec(); } - // List relations along with a flag that marks if they exist at the given lsn. - // spcnode 0 and dbnode 0 have special meanings and mean all tabespaces/databases. - // Pass Tag if we're only interested in some relations. - pub fn list_relishes(&self, tag: Option, lsn: Lsn) -> Result> { - let mut rels: HashMap = HashMap::new(); - - for (seg, segentry) in self.segs.iter() { - match seg.rel { - RelishTag::Relation(reltag) => { - if let Some(request_rel) = tag { - if (request_rel.spcnode == 0 || reltag.spcnode == request_rel.spcnode) - && (request_rel.dbnode == 0 || reltag.dbnode == request_rel.dbnode) - { - if let Some(exists) = segentry.exists_at_lsn(lsn)? { - rels.insert(seg.rel, exists); - } - } - } - } - _ => { - if tag == None { - if let Some(exists) = segentry.exists_at_lsn(lsn)? { - rels.insert(seg.rel, exists); - } - } - } - } - } - Ok(rels) - } - /// Is there a newer image layer for given segment? /// /// This is used for garbage collection, to determine if an old layer can /// be deleted. /// We ignore segments newer than disk_consistent_lsn because they will be removed at restart + /// We also only look at historic layers + //#[allow(dead_code)] pub fn newer_image_layer_exists( &self, - seg: SegmentTag, + key_range: &Range, lsn: Lsn, disk_consistent_lsn: Lsn, - ) -> bool { - if let Some(segentry) = self.segs.get(&seg) { - segentry.newer_image_layer_exists(lsn, disk_consistent_lsn) - } else { - false + ) -> Result { + + let mut range_remain = key_range.clone(); + + loop { + let mut made_progress = false; + for l in self.historic_layers.iter() { + if l.is_incremental() { + continue; + } + let img_lsn = l.get_lsn_range().start; + if !l.is_incremental() && + l.get_key_range().contains(&range_remain.start) && + img_lsn > lsn && + img_lsn < disk_consistent_lsn + { + made_progress = true; + let img_key_end = l.get_key_range().end; + + if img_key_end >= range_remain.end { + return Ok(true); + } + range_remain.start = img_key_end; + } + } + + if !made_progress { + return Ok(false); + } } } @@ -211,284 +248,139 @@ impl LayerMap { /// used for garbage collection, to determine if some alive layer /// exists at the lsn. If so, we shouldn't delete a newer dropped layer /// to avoid incorrectly making it visible. - pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result { - Ok(if let Some(segentry) = self.segs.get(&seg) { - segentry.exists_at_lsn(lsn)?.unwrap_or(false) - } else { - false - }) + /* + pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result { + Ok(if let Some(segentry) = self.historic_layers.get(&seg) { + segentry.exists_at_lsn(seg, lsn)?.unwrap_or(false) + } else { + false + }) + } + */ + + pub fn iter_historic_layers(&self) -> std::slice::Iter> { + self.historic_layers.iter() } - /// Return the oldest in-memory layer, along with its generation number. 
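newer_image_layer_exists above is a coverage check with a progress loop: each pass over the layers extends the covered prefix of the key range, and a pass that makes no progress means there is a gap. The same loop on plain integer ranges (self-contained, and quadratic in the same way the placeholder layer map is):

use std::ops::Range;

/// Do the given pieces jointly cover `remain`?
fn covered(mut remain: Range<u32>, pieces: &[Range<u32>]) -> bool {
    loop {
        let mut made_progress = false;
        for p in pieces {
            if p.contains(&remain.start) {
                if p.end >= remain.end {
                    return true; // covered all the way to the end
                }
                remain.start = p.end; // extend the covered prefix
                made_progress = true;
            }
        }
        if !made_progress {
            return false; // gap at remain.start
        }
    }
}

fn main() {
    assert!(covered(0..10, &[5..10, 0..6]));
    assert!(!covered(0..10, &[0..4, 5..10])); // nothing covers key 4
}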
- pub fn peek_oldest_open(&mut self) -> Option<(LayerId, Arc, u64)> { - let global_map = GLOBAL_LAYER_MAP.read().unwrap(); + fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option> { + // Find the last image layer that covers the key + let mut candidate_lsn = Lsn(0); + let mut candidate = None; + for l in self.historic_layers.iter() { + if l.is_incremental() { + continue; + } - while let Some(oldest_entry) = self.open_layers.peek() { - if let Some(layer) = global_map.get(&oldest_entry.layer_id) { - return Some((oldest_entry.layer_id, layer, oldest_entry.generation)); - } else { - self.open_layers.pop(); + if !l.get_key_range().contains(&key) { + continue; + } + + let this_lsn = l.get_lsn_range().start; + if this_lsn > lsn { + continue; + } + if this_lsn < candidate_lsn { + // our previous candidate was better + continue; + } + candidate_lsn = this_lsn; + candidate = Some(Arc::clone(l)); + } + + candidate + } + + /// + /// Divide the whole given range of keys into sub-ranges based on the latest + /// image layer that covers each range. (This is used when creating new + /// image layers) + /// + // FIXME: clippy complains that the result type is very complex. She's probably + // right... + #[allow(clippy::type_complexity)] + pub fn image_coverage( + &self, + key_range: &Range, + lsn: Lsn, + ) -> Result, Option>)>> { + let mut points: Vec; + + points = vec![key_range.start]; + for l in self.historic_layers.iter() { + if l.get_lsn_range().start > lsn { + continue; + } + let range = l.get_key_range(); + if key_range.contains(&range.start) { + points.push(l.get_key_range().start); + } + if key_range.contains(&range.end) { + points.push(l.get_key_range().end); } } - None - } + points.push(key_range.end); - /// Increment the generation number used to stamp open in-memory layers. Layers - /// added with `insert_open` after this call will be associated with the new - /// generation. Returns the new generation number. 
- pub fn increment_generation(&mut self) -> u64 { - self.current_generation += 1; - self.current_generation - } + points.sort(); + points.dedup(); - pub fn iter_historic_layers(&self) -> HistoricLayerIter { - HistoricLayerIter { - seg_iter: self.segs.iter(), - iter: None, + // Ok, we now have a list of "interesting" points in the key space + + // For each range between the points, find the latest image + let mut start = *points.first().unwrap(); + let mut ranges = Vec::new(); + for end in points[1..].iter() { + let img = self.find_latest_image(start, lsn); + + ranges.push((start..*end, img)); + + start = *end; } + Ok(ranges) + } + + pub fn get_deltas( + &self, + key_range: &Range, + lsn_range: &Range, + ) -> Result>> { + let mut deltas = Vec::new(); + for l in self.historic_layers.iter() { + if !l.is_incremental() { + continue; + } + if !range_overlaps(&l.get_lsn_range(), lsn_range) { + continue; + } + if !range_overlaps(&l.get_key_range(), key_range) { + continue; + } + deltas.push(Arc::clone(l)); + } + Ok(deltas) + } + + pub fn get_level0_deltas(&self) -> Result>> { + let mut deltas = Vec::new(); + for l in self.historic_layers.iter() { + if !l.is_incremental() { + continue; + } + if l.get_key_range() != (Key::MIN..Key::MAX) { + continue; + } + deltas.push(Arc::clone(l)); + } + Ok(deltas) } /// debugging function to print out the contents of the layer map #[allow(unused)] pub fn dump(&self) -> Result<()> { println!("Begin dump LayerMap"); - for (seg, segentry) in self.segs.iter() { - if let Some(open) = &segentry.open_layer_id { - if let Some(layer) = GLOBAL_LAYER_MAP.read().unwrap().get(open) { - layer.dump()?; - } else { - println!("layer not found in global map"); - } - } - - for layer in segentry.historic.iter() { - layer.dump()?; - } + for layer in self.historic_layers.iter() { + layer.dump()?; } println!("End dump LayerMap"); Ok(()) } } - -impl IntervalItem for dyn Layer { - type Key = Lsn; - - fn start_key(&self) -> Lsn { - self.get_start_lsn() - } - fn end_key(&self) -> Lsn { - self.get_end_lsn() - } -} - -/// -/// Per-segment entry in the LayerMap::segs hash map. Holds all the layers -/// associated with the segment. -/// -/// The last layer that is open for writes is always an InMemoryLayer, -/// and is kept in a separate field, because there can be only one for -/// each segment. The older layers, stored on disk, are kept in an -/// IntervalTree. -#[derive(Default)] -struct SegEntry { - open_layer_id: Option, - historic: IntervalTree, -} - -impl SegEntry { - /// Does the segment exist at given LSN? - /// Return None if object is not found in this SegEntry. - fn exists_at_lsn(&self, lsn: Lsn) -> Result> { - if let Some(layer) = self.get(lsn) { - Ok(Some(layer.get_seg_exists(lsn)?)) - } else { - Ok(None) - } - } - - pub fn get(&self, lsn: Lsn) -> Option> { - if let Some(open_layer_id) = &self.open_layer_id { - let open_layer = GLOBAL_LAYER_MAP.read().unwrap().get(open_layer_id)?; - if open_layer.get_start_lsn() <= lsn { - return Some(open_layer); - } - } - - self.historic.search(lsn) - } - - pub fn newer_image_layer_exists(&self, lsn: Lsn, disk_consistent_lsn: Lsn) -> bool { - // We only check on-disk layers, because - // in-memory layers are not durable - - // The end-LSN is exclusive, while disk_consistent_lsn is - // inclusive. For example, if disk_consistent_lsn is 100, it is - // OK for a delta layer to have end LSN 101, but if the end LSN - // is 102, then it might not have been fully flushed to disk - // before crash. 
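`image_coverage` above partitions the requested key range at every image-layer boundary, then resolves the latest covering image once per sub-range; the sort/dedup turns overlapping boundaries into a clean partition, and between consecutive points the covering image cannot change. A compact self-contained version of the same idea, again with a hypothetical `ImgLayer` in place of `Arc<dyn Layer>`:

```rust
use std::ops::Range;

#[derive(Clone)]
struct ImgLayer {
    keys: Range<u32>,
    lsn: u64,
}

/// Partition `key_range` at every image boundary, then find the latest image
/// covering each sub-range, as image_coverage does above.
fn image_coverage(
    layers: &[ImgLayer],
    key_range: Range<u32>,
    lsn: u64,
) -> Vec<(Range<u32>, Option<ImgLayer>)> {
    // Collect the "interesting" points: boundaries of images taken at or before `lsn`.
    let mut points = vec![key_range.start];
    for l in layers.iter().filter(|l| l.lsn <= lsn) {
        if key_range.contains(&l.keys.start) {
            points.push(l.keys.start);
        }
        if key_range.contains(&l.keys.end) {
            points.push(l.keys.end);
        }
    }
    points.push(key_range.end);
    points.sort_unstable();
    points.dedup();

    // One latest-image lookup per sub-range start is enough.
    points
        .windows(2)
        .map(|w| {
            let img = layers
                .iter()
                .filter(|l| l.keys.contains(&w[0]) && l.lsn <= lsn)
                .max_by_key(|l| l.lsn)
                .cloned();
            (w[0]..w[1], img)
        })
        .collect()
}

fn main() {
    let layers = [ImgLayer { keys: 0..50, lsn: 20 }];
    let cov = image_coverage(&layers, 0..100, 30);
    assert_eq!(cov.len(), 2); // 0..50 has an image, 50..100 has none
    assert!(cov[0].1.is_some());
    assert!(cov[1].1.is_none());
}
```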
- self.historic - .iter_newer(lsn) - .any(|layer| !layer.is_incremental() && layer.get_end_lsn() <= disk_consistent_lsn + 1) - } - - // Set new open layer for a SegEntry. - // It's ok to rewrite previous open layer, - // but only if it is not writeable anymore. - pub fn update_open(&mut self, layer: Arc) -> LayerId { - if let Some(prev_open_layer_id) = &self.open_layer_id { - if let Some(prev_open_layer) = GLOBAL_LAYER_MAP.read().unwrap().get(prev_open_layer_id) - { - assert!(!prev_open_layer.is_writeable()); - } - } - let open_layer_id = GLOBAL_LAYER_MAP.write().unwrap().insert(layer); - self.open_layer_id = Some(open_layer_id); - open_layer_id - } - - pub fn insert_historic(&mut self, layer: Arc) { - self.historic.insert(layer); - } -} - -/// Entry held in LayerMap::open_layers, with boilerplate comparison routines -/// to implement a min-heap ordered by 'oldest_lsn' and 'generation' -/// -/// The generation number associated with each entry can be used to distinguish -/// recently-added entries (i.e after last call to increment_generation()) from older -/// entries with the same 'oldest_lsn'. -struct OpenLayerEntry { - oldest_lsn: Lsn, // copy of layer.get_oldest_lsn() - generation: u64, - layer_id: LayerId, -} -impl Ord for OpenLayerEntry { - fn cmp(&self, other: &Self) -> Ordering { - // BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here - // to get that. Entries with identical oldest_lsn are ordered by generation - other - .oldest_lsn - .cmp(&self.oldest_lsn) - .then_with(|| other.generation.cmp(&self.generation)) - } -} -impl PartialOrd for OpenLayerEntry { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} -impl PartialEq for OpenLayerEntry { - fn eq(&self, other: &Self) -> bool { - self.cmp(other) == Ordering::Equal - } -} -impl Eq for OpenLayerEntry {} - -/// Iterator returned by LayerMap::iter_historic_layers() -pub struct HistoricLayerIter<'a> { - seg_iter: std::collections::hash_map::Iter<'a, SegmentTag, SegEntry>, - iter: Option>, -} - -impl<'a> Iterator for HistoricLayerIter<'a> { - type Item = Arc; - - fn next(&mut self) -> std::option::Option<::Item> { - loop { - if let Some(x) = &mut self.iter { - if let Some(x) = x.next() { - return Some(Arc::clone(&x)); - } - } - if let Some((_tag, segentry)) = self.seg_iter.next() { - self.iter = Some(segentry.historic.iter()); - continue; - } else { - return None; - } - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::config::PageServerConf; - use std::str::FromStr; - use zenith_utils::zid::{ZTenantId, ZTimelineId}; - - /// Arbitrary relation tag, for testing. - const TESTREL_A: RelishTag = RelishTag::Relation(RelTag { - spcnode: 0, - dbnode: 111, - relnode: 1000, - forknum: 0, - }); - - lazy_static! 
{ - static ref DUMMY_TIMELINEID: ZTimelineId = - ZTimelineId::from_str("00000000000000000000000000000000").unwrap(); - static ref DUMMY_TENANTID: ZTenantId = - ZTenantId::from_str("00000000000000000000000000000000").unwrap(); - } - - /// Construct a dummy InMemoryLayer for testing - fn dummy_inmem_layer( - conf: &'static PageServerConf, - segno: u32, - start_lsn: Lsn, - oldest_lsn: Lsn, - ) -> Arc { - Arc::new( - InMemoryLayer::create( - conf, - *DUMMY_TIMELINEID, - *DUMMY_TENANTID, - SegmentTag { - rel: TESTREL_A, - segno, - }, - start_lsn, - oldest_lsn, - ) - .unwrap(), - ) - } - - #[test] - fn test_open_layers() -> Result<()> { - let conf = PageServerConf::dummy_conf(PageServerConf::test_repo_dir("dummy_inmem_layer")); - let conf = Box::leak(Box::new(conf)); - std::fs::create_dir_all(conf.timeline_path(&DUMMY_TIMELINEID, &DUMMY_TENANTID))?; - - let mut layers = LayerMap::default(); - - let gen1 = layers.increment_generation(); - layers.insert_open(dummy_inmem_layer(conf, 0, Lsn(0x100), Lsn(0x100))); - layers.insert_open(dummy_inmem_layer(conf, 1, Lsn(0x100), Lsn(0x200))); - layers.insert_open(dummy_inmem_layer(conf, 2, Lsn(0x100), Lsn(0x120))); - layers.insert_open(dummy_inmem_layer(conf, 3, Lsn(0x100), Lsn(0x110))); - - let gen2 = layers.increment_generation(); - layers.insert_open(dummy_inmem_layer(conf, 4, Lsn(0x100), Lsn(0x110))); - layers.insert_open(dummy_inmem_layer(conf, 5, Lsn(0x100), Lsn(0x100))); - - // A helper function (closure) to pop the next oldest open entry from the layer map, - // and assert that it is what we'd expect - let mut assert_pop_layer = |expected_segno: u32, expected_generation: u64| { - let (layer_id, l, generation) = layers.peek_oldest_open().unwrap(); - assert!(l.get_seg_tag().segno == expected_segno); - assert!(generation == expected_generation); - layers.remove_open(layer_id); - }; - - assert_pop_layer(0, gen1); // 0x100 - assert_pop_layer(5, gen2); // 0x100 - assert_pop_layer(3, gen1); // 0x110 - assert_pop_layer(4, gen2); // 0x110 - assert_pop_layer(2, gen1); // 0x120 - assert_pop_layer(1, gen1); // 0x200 - - Ok(()) - } -} diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index 8976491fc0..eb31a9c415 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -2,75 +2,34 @@ //! Common traits and structs for layers //! -use crate::relish::RelishTag; -use crate::repository::{BlockNumber, ZenithWalRecord}; +use crate::repository::{Key, Value}; +use crate::walrecord::ZenithWalRecord; use crate::{ZTenantId, ZTimelineId}; use anyhow::Result; use bytes::Bytes; -use serde::{Deserialize, Serialize}; -use std::fmt; +use std::collections::HashSet; +use std::ops::Range; use std::path::PathBuf; use zenith_utils::lsn::Lsn; -// Size of one segment in pages (10 MB) -pub const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192; +// in # of key-value pairs +// FIXME Size of one segment in pages (128 MB) +pub const TARGET_FILE_SIZE_BYTES: u64 = 128 * 1024 * 1024; +pub const TARGET_FILE_SIZE: u32 = (TARGET_FILE_SIZE_BYTES / 8192) as u32; -/// -/// Each relish stored in the repository is divided into fixed-sized "segments", -/// with 10 MB of key-space, or 1280 8k pages each. -/// -#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)] -pub struct SegmentTag { - pub rel: RelishTag, - pub segno: u32, -} - -/// SegmentBlk represents a block number within a segment, or the size of segment. 
-/// -/// This is separate from BlockNumber, which is used for block number within the -/// whole relish. Since this is just a type alias, the compiler will let you mix -/// them freely, but we use the type alias as documentation to make it clear -/// which one we're dealing with. -/// -/// (We could turn this into "struct SegmentBlk(u32)" to forbid accidentally -/// assigning a BlockNumber to SegmentBlk or vice versa, but that makes -/// operations more verbose). -pub type SegmentBlk = u32; - -impl fmt::Display for SegmentTag { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}.{}", self.rel, self.segno) +pub fn range_overlaps(a: &Range, b: &Range) -> bool +where + T: PartialOrd, +{ + if a.start < b.start { + a.end > b.start + } else { + b.end > a.start } } -impl SegmentTag { - /// Given a relish and block number, calculate the corresponding segment and - /// block number within the segment. - pub const fn from_blknum(rel: RelishTag, blknum: BlockNumber) -> (SegmentTag, SegmentBlk) { - ( - SegmentTag { - rel, - segno: blknum / RELISH_SEG_SIZE, - }, - blknum % RELISH_SEG_SIZE, - ) - } -} - -/// -/// Represents a version of a page at a specific LSN. The LSN is the key of the -/// entry in the 'page_versions' hash, it is not duplicated here. -/// -/// A page version can be stored as a full page image, or as WAL record that needs -/// to be applied over the previous page version to reconstruct this version. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub enum PageVersion { - Page(Bytes), - Wal(ZenithWalRecord), -} - -/// +/// FIXME /// Struct used to communicate across calls to 'get_page_reconstruct_data'. /// /// Before first call to get_page_reconstruct_data, you can fill in 'page_img' @@ -88,25 +47,32 @@ pub enum PageVersion { /// the same PageReconstructData struct in the next 'get_page_reconstruct_data' /// call, to collect more records. /// -pub struct PageReconstructData { +#[derive(Debug)] +pub struct ValueReconstructState { + pub key: Key, + pub lsn: Lsn, pub records: Vec<(Lsn, ZenithWalRecord)>, - pub page_img: Option<(Lsn, Bytes)>, + pub img: Option<(Lsn, Bytes)>, + + pub request_lsn: Lsn, // original request's LSN, for debugging purposes } /// Return value from Layer::get_page_reconstruct_data -pub enum PageReconstructResult { +#[derive(Debug)] +pub enum ValueReconstructResult { /// Got all the data needed to reconstruct the requested page Complete, /// This layer didn't contain all the required data, the caller should look up /// the predecessor layer at the returned LSN and collect more data from there. - Continue(Lsn), + Continue, + /// This layer didn't contain data needed to reconstruct the page version at /// the returned LSN. This is usually considered an error, but might be OK /// in some circumstances. - Missing(Lsn), + Missing, } -/// +/// FIXME /// A Layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs. /// There are two kinds of layers, in-memory and on-disk layers. In-memory /// layers are used to ingest incoming WAL, and provide fast access @@ -120,21 +86,17 @@ pub trait Layer: Send + Sync { /// Identify the timeline this relish belongs to fn get_timeline_id(&self) -> ZTimelineId; - /// Identify the relish segment - fn get_seg_tag(&self) -> SegmentTag; + /// Range of segments that this layer covers + fn get_key_range(&self) -> Range; + /// FIXME /// Inclusive start bound of the LSN range that this layer holds - fn get_start_lsn(&self) -> Lsn; - /// Exclusive end bound of the LSN range that this layer holds. 
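The new `range_overlaps` helper above tests half-open ranges. For non-empty ranges, the two-branch form it uses reduces to the usual symmetric predicate: two ranges overlap if and only if each one starts before the other ends. A quick check of the equivalent one-liner, assuming plain integer bounds:

```rust
use std::ops::Range;

/// For non-empty half-open ranges this is equivalent to the branchy
/// range_overlaps above: each range starts before the other one ends.
fn range_overlaps<T: PartialOrd>(a: &Range<T>, b: &Range<T>) -> bool {
    a.start < b.end && b.start < a.end
}

fn main() {
    assert!(range_overlaps(&(0u32..10), &(5..15)));   // partial overlap
    assert!(range_overlaps(&(0u32..10), &(3..4)));    // containment counts too
    assert!(!range_overlaps(&(0u32..10), &(10..20))); // touching ends do not
}
```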
/// /// - For an open in-memory layer, this is MAX_LSN. /// - For a frozen in-memory layer or a delta layer, this is a valid end bound. /// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1 - fn get_end_lsn(&self) -> Lsn; - - /// Is the segment represented by this layer dropped by PostgreSQL? - fn is_dropped(&self) -> bool; + fn get_lsn_range(&self) -> Range; /// Filename used to store this layer on disk. (Even in-memory layers /// implement this, to print a handy unique identifier for the layer for @@ -153,18 +115,11 @@ pub trait Layer: Send + Sync { /// is available. If this returns PageReconstructResult::Continue, look up /// the predecessor layer and call again with the same 'reconstruct_data' to /// collect more data. - fn get_page_reconstruct_data( + fn get_value_reconstruct_data( &self, - blknum: SegmentBlk, - lsn: Lsn, - reconstruct_data: &mut PageReconstructData, - ) -> Result; - - /// Return size of the segment at given LSN. (Only for blocky relations.) - fn get_seg_size(&self, lsn: Lsn) -> Result; - - /// Does the segment exist at given LSN? Or was it dropped before it. - fn get_seg_exists(&self, lsn: Lsn) -> Result; + lsn_floor: Lsn, + reconstruct_data: &mut ValueReconstructState, + ) -> Result; /// Does this layer only contain some data for the segment (incremental), /// or does it contain a version of every page? This is important to know @@ -175,6 +130,11 @@ pub trait Layer: Send + Sync { /// Returns true for layers that are represented in memory. fn is_in_memory(&self) -> bool; + fn iter(&self) -> Box> + '_>; + + /// Return a set of all distinct Keys present in this layer + fn collect_keys(&self, key_range: &Range, keys: &mut HashSet) -> Result<()>; + /// Release memory used by this layer. There is no corresponding 'load' /// function, that's done implicitly when you call one of the get-functions. fn unload(&self) -> Result<()>; diff --git a/pageserver/src/layered_repository/utils.rs b/pageserver/src/layered_repository/utils.rs new file mode 100644 index 0000000000..de6303ce35 --- /dev/null +++ b/pageserver/src/layered_repository/utils.rs @@ -0,0 +1,48 @@ +// Utilities for reading and writing Values +use std::io::{Error, Write}; +use std::os::unix::fs::FileExt; + +use bookfile::BoundedReader; + +pub fn read_blob(file: &F, off: u64) -> Result, Error> { + // read length + let mut len_buf = [0u8; 4]; + file.read_exact_at(&mut len_buf, off)?; + + let len = u32::from_ne_bytes(len_buf); + + let mut buf: Vec = Vec::new(); + buf.resize(len as usize, 0); + file.read_exact_at(&mut buf.as_mut_slice(), off + 4)?; + + Ok(buf) +} + +pub fn read_blob_from_chapter( + file: &BoundedReader<&F>, + off: u64, +) -> Result, Error> { + // read length + let mut len_buf = [0u8; 4]; + file.read_exact_at(&mut len_buf, off)?; + + let len = u32::from_ne_bytes(len_buf); + + let mut buf: Vec = Vec::new(); + buf.resize(len as usize, 0); + file.read_exact_at(&mut buf.as_mut_slice(), off + 4)?; + + Ok(buf) +} + +pub fn write_blob(writer: &mut W, buf: &[u8]) -> Result { + let val_len = buf.len() as u32; + + // write the 'length' field and kind byte. 
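The new `utils.rs` fixes a simple framing format for values: a 4-byte native-endian `u32` length, then the payload. A hedged round-trip sketch of the same framing, using `std::io::Cursor` with `Read + Seek` in place of `FileExt`/`BoundedReader` so it runs in memory:

```rust
use std::io::{Cursor, Error, Read, Seek, SeekFrom, Write};

/// write_blob's framing: a native-endian u32 length, then the payload.
fn write_blob<W: Write>(w: &mut W, buf: &[u8]) -> Result<u64, Error> {
    w.write_all(&(buf.len() as u32).to_ne_bytes())?;
    w.write_all(buf)?;
    Ok(4 + buf.len() as u64)
}

/// read_blob's inverse, seeking to the blob's offset and reading the
/// length header before the payload.
fn read_blob<R: Read + Seek>(r: &mut R, off: u64) -> Result<Vec<u8>, Error> {
    r.seek(SeekFrom::Start(off))?;
    let mut len_buf = [0u8; 4];
    r.read_exact(&mut len_buf)?;
    let mut buf = vec![0u8; u32::from_ne_bytes(len_buf) as usize];
    r.read_exact(&mut buf)?;
    Ok(buf)
}

fn main() -> Result<(), Error> {
    let mut file = Cursor::new(Vec::new());
    let len = write_blob(&mut file, b"hello")?;
    assert_eq!(len, 9); // 4-byte header + 5 bytes of payload
    assert_eq!(read_blob(&mut file, 0)?, b"hello".to_vec());
    Ok(())
}
```

Since the length is native-endian, these blobs are presumably meant to stay node-local; they would not be portable across machines of different endianness.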
+ let lenbuf = u32::to_ne_bytes(val_len); + + writer.write_all(&lenbuf)?; + writer.write_all(buf)?; + + Ok(4 + val_len as u64) +} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index a8a878c448..af4548ebe9 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -6,6 +6,7 @@ pub mod import_datadir; pub mod layered_repository; pub mod page_cache; pub mod page_service; +pub mod pgdatadir_mapping; pub mod relish; pub mod remote_storage; pub mod repository; @@ -22,7 +23,8 @@ use lazy_static::lazy_static; use zenith_metrics::{register_int_gauge_vec, IntGaugeVec}; use zenith_utils::zid::{ZTenantId, ZTimelineId}; -use layered_repository::{LayeredRepository, LayeredTimeline}; +use layered_repository::LayeredRepository; +use pgdatadir_mapping::DatadirTimeline; lazy_static! { static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!( @@ -47,5 +49,5 @@ pub enum CheckpointConfig { } pub type RepositoryImpl = LayeredRepository; -pub type TimelineImpl = LayeredTimeline; +pub type DatadirTimelineImpl = DatadirTimeline; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index bf20cfb0db..94398becff 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -32,8 +32,10 @@ use zenith_utils::zid::{ZTenantId, ZTimelineId}; use crate::basebackup; use crate::config::PageServerConf; +use crate::pgdatadir_mapping::DatadirTimeline; use crate::relish::*; -use crate::repository::{Repository, Timeline}; +use crate::repository::Repository; +use crate::repository::Timeline; use crate::tenant_mgr; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; @@ -395,8 +397,8 @@ impl PageServerHandler { /// In either case, if the page server hasn't received the WAL up to the /// requested LSN yet, we will wait for it to arrive. The return value is /// the LSN that should be used to look up the page versions. - fn wait_or_get_last_lsn( - timeline: &T, + fn wait_or_get_last_lsn( + timeline: &DatadirTimeline, mut lsn: Lsn, latest: bool, latest_gc_cutoff_lsn: &RwLockReadGuard, @@ -423,7 +425,7 @@ impl PageServerHandler { if lsn <= last_record_lsn { lsn = last_record_lsn; } else { - timeline.wait_lsn(lsn)?; + timeline.tline.wait_lsn(lsn)?; // Since we waited for 'lsn' to arrive, that is now the last // record LSN. 
(Or close enough for our purposes; the // last-record LSN can advance immediately after we return @@ -433,7 +435,7 @@ impl PageServerHandler { if lsn == Lsn(0) { bail!("invalid LSN(0) in request"); } - timeline.wait_lsn(lsn)?; + timeline.tline.wait_lsn(lsn)?; } ensure!( lsn >= **latest_gc_cutoff_lsn, @@ -443,54 +445,47 @@ impl PageServerHandler { Ok(lsn) } - fn handle_get_rel_exists_request( + fn handle_get_rel_exists_request( &self, - timeline: &T, + timeline: &DatadirTimeline, req: &PagestreamExistsRequest, ) -> Result { let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered(); - let tag = RelishTag::Relation(req.rel); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; - let exists = timeline.get_rel_exists(tag, lsn)?; + let exists = timeline.get_rel_exists(req.rel, lsn)?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { exists, })) } - fn handle_get_nblocks_request( + fn handle_get_nblocks_request( &self, - timeline: &T, + timeline: &DatadirTimeline, req: &PagestreamNblocksRequest, ) -> Result { let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered(); - let tag = RelishTag::Relation(req.rel); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; - let n_blocks = timeline.get_relish_size(tag, lsn)?; - - // Return 0 if relation is not found. - // This is what postgres smgr expects. - let n_blocks = n_blocks.unwrap_or(0); + let n_blocks = timeline.get_rel_size(req.rel, lsn)?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, })) } - fn handle_get_page_at_lsn_request( + fn handle_get_page_at_lsn_request( &self, - timeline: &T, + timeline: &DatadirTimeline, req: &PagestreamGetPageRequest, ) -> Result { let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn) .entered(); - let tag = RelishTag::Relation(req.rel); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?; /* // Add a 1s delay to some requests. 
The delayed causes the requests to @@ -500,7 +495,7 @@ impl PageServerHandler { std::thread::sleep(std::time::Duration::from_millis(1000)); } */ - let page = timeline.get_page_at_lsn(tag, req.blkno, lsn)?; + let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn)?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page, @@ -520,7 +515,7 @@ impl PageServerHandler { // check that the timeline exists let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid) .context("Cannot handle basebackup request for a remote timeline")?; - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) @@ -699,67 +694,19 @@ impl postgres_backend::Handler for PageServerHandler { let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?; pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::int8_col(b"layer_relfiles_total"), - RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"), - RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"), - RowDescriptor::int8_col(b"layer_relfiles_not_updated"), - RowDescriptor::int8_col(b"layer_relfiles_needed_as_tombstone"), - RowDescriptor::int8_col(b"layer_relfiles_removed"), - RowDescriptor::int8_col(b"layer_relfiles_dropped"), - RowDescriptor::int8_col(b"layer_nonrelfiles_total"), - RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"), - RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"), - RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"), - RowDescriptor::int8_col(b"layer_nonrelfiles_needed_as_tombstone"), - RowDescriptor::int8_col(b"layer_nonrelfiles_removed"), - RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"), + RowDescriptor::int8_col(b"layers_total"), + RowDescriptor::int8_col(b"layers_needed_by_cutoff"), + RowDescriptor::int8_col(b"layers_needed_by_branches"), + RowDescriptor::int8_col(b"layers_not_updated"), + RowDescriptor::int8_col(b"layers_removed"), RowDescriptor::int8_col(b"elapsed"), ]))? 
.write_message_noflush(&BeMessage::DataRow(&[ - Some(result.ondisk_relfiles_total.to_string().as_bytes()), - Some( - result - .ondisk_relfiles_needed_by_cutoff - .to_string() - .as_bytes(), - ), - Some( - result - .ondisk_relfiles_needed_by_branches - .to_string() - .as_bytes(), - ), - Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()), - Some( - result - .ondisk_relfiles_needed_as_tombstone - .to_string() - .as_bytes(), - ), - Some(result.ondisk_relfiles_removed.to_string().as_bytes()), - Some(result.ondisk_relfiles_dropped.to_string().as_bytes()), - Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()), - Some( - result - .ondisk_nonrelfiles_needed_by_cutoff - .to_string() - .as_bytes(), - ), - Some( - result - .ondisk_nonrelfiles_needed_by_branches - .to_string() - .as_bytes(), - ), - Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()), - Some( - result - .ondisk_nonrelfiles_needed_as_tombstone - .to_string() - .as_bytes(), - ), - Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()), - Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()), + Some(result.layers_total.to_string().as_bytes()), + Some(result.layers_needed_by_cutoff.to_string().as_bytes()), + Some(result.layers_needed_by_branches.to_string().as_bytes()), + Some(result.layers_not_updated.to_string().as_bytes()), + Some(result.layers_removed.to_string().as_bytes()), Some(result.elapsed.as_millis().to_string().as_bytes()), ]))? .write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; @@ -779,7 +726,7 @@ impl postgres_backend::Handler for PageServerHandler { let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid) .context("Failed to fetch local timeline for checkpoint request")?; - timeline.checkpoint(CheckpointConfig::Forced)?; + timeline.tline.checkpoint(CheckpointConfig::Forced)?; pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs new file mode 100644 index 0000000000..829533d65f --- /dev/null +++ b/pageserver/src/pgdatadir_mapping.rs @@ -0,0 +1,1173 @@ +//! +//! This provides an abstraction to store PostgreSQL relations and other files +//! in the key-value store +//! +//! (TODO: The line between PUT-functions here and walingest.rs is a bit blurry, as +//! walingest.rs handles a few things like implicit relation creation and extension. +//! Clarify that) +//! + +use crate::relish::*; +use crate::repository::*; +use crate::repository::{Repository, Timeline}; +use crate::walrecord::ZenithWalRecord; +use anyhow::{bail, Result}; +use bytes::{Buf, Bytes}; +use postgres_ffi::{pg_constants, Oid, TransactionId}; +use serde::{Deserialize, Serialize}; +use std::collections::{HashMap, HashSet}; +use std::ops::Range; +use std::sync::{Arc, RwLockReadGuard}; +use tracing::{debug, info, warn}; +use zenith_utils::bin_ser::BeSer; +use zenith_utils::lsn::{Lsn, RecordLsn}; + +/// Block number within a relation or SRU. This matches PostgreSQL's BlockNumber type. +pub type BlockNumber = u32; + +pub struct DatadirTimeline +where + R: Repository, +{ + pub tline: Arc, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct DbDirectory { + // (dbnode, spcnode) + dbs: HashSet<(Oid, Oid)>, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct TwoPhaseDirectory { + xids: HashSet, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct RelDirectory { + // Set of relations that exist. 
(relfilenode, forknum) + // + // TODO: Store it as a btree or radix tree or something else that spans multiple + // key-value pairs, if you have a lot of relations + rels: HashSet<(Oid, u8)>, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct RelSizeEntry { + nblocks: u32, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct SlruSegmentDirectory { + // Set of SLRU segments that exist. + segments: HashSet, +} + +static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); + +impl DatadirTimeline { + pub fn new(tline: Arc) -> Self { + DatadirTimeline { tline } + } + + //------------------------------------------------------------------------------ + // Public GET functions + //------------------------------------------------------------------------------ + + /// Look up given page version. + pub fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result { + let nblocks = self.get_rel_size(tag, lsn)?; + if blknum >= nblocks { + debug!( + "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", + tag, blknum, lsn, nblocks + ); + return Ok(ZERO_PAGE.clone()); + } + + let key = rel_block_to_key(tag, blknum); + self.tline.get(key, lsn) + } + + /// Look up given page version. + pub fn get_slru_page_at_lsn( + &self, + kind: SlruKind, + segno: u32, + blknum: BlockNumber, + lsn: Lsn, + ) -> Result { + let key = slru_block_to_key(kind, segno, blknum); + self.tline.get(key, lsn) + } + + /// Get size of a relation file + pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result { + if (tag.forknum == pg_constants::FSM_FORKNUM + || tag.forknum == pg_constants::VISIBILITYMAP_FORKNUM) + && !self.get_rel_exists(tag, lsn)? + { + // FIXME: Postgres sometimes calls calls smgrcreate() to + // create FSM, and smgrnblocks() on it immediately + // afterwards, without extending it. Tolerate that by + // claiming that any non-existent FSM fork has size 0. + return Ok(0); + } + + let key = rel_size_to_key(tag); + let mut buf = self.tline.get(key, lsn)?; + Ok(buf.get_u32_le()) + } + + /// Get size of an SLRU segment + pub fn get_slru_segment_size( + &self, + kind: SlruKind, + segno: u32, + lsn: Lsn, + ) -> Result { + let key = slru_segment_size_to_key(kind, segno); + let mut buf = self.tline.get(key, lsn)?; + Ok(buf.get_u32_le()) + } + + /// Get size of an SLRU segment + pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result { + // fetch directory listing + let key = slru_dir_to_key(kind); + let buf = self.tline.get(key, lsn)?; + let dir = SlruSegmentDirectory::des(&buf)?; + + let exists = dir.segments.get(&segno).is_some(); + Ok(exists) + } + + /// Does relation exist? + pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result { + // fetch directory listing + let key = rel_dir_to_key(tag.spcnode, tag.dbnode); + let buf = self.tline.get(key, lsn)?; + let dir = RelDirectory::des(&buf)?; + + let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); + + info!("EXISTS: {} : {:?}", tag, exists); + + Ok(exists) + } + + /// Get a list of all existing relations in given tablespace and database. 
+ pub fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result> { + // fetch directory listing + let key = rel_dir_to_key(spcnode, dbnode); + let buf = self.tline.get(key, lsn)?; + let dir = RelDirectory::des(&buf)?; + + let rels: HashSet = + HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { + spcnode, + dbnode, + relnode: *relnode, + forknum: *forknum, + })); + + Ok(rels) + } + + /// Get a list of SLRU segments + pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result> { + // fetch directory entry + let key = slru_dir_to_key(kind); + + let buf = self.tline.get(key, lsn)?; + let dir = SlruSegmentDirectory::des(&buf)?; + + Ok(dir.segments) + } + + pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result { + let key = relmap_file_key(spcnode, dbnode); + + let buf = self.tline.get(key, lsn)?; + Ok(buf) + } + + pub fn list_relmap_files(&self, lsn: Lsn) -> Result> { + // fetch directory entry + let buf = self.tline.get(DBDIR_KEY, lsn)?; + let dir = DbDirectory::des(&buf)?; + + Ok(dir.dbs) + } + + pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result { + let key = twophase_file_key(xid); + let buf = self.tline.get(key, lsn)?; + Ok(buf) + } + + pub fn list_twophase_files(&self, lsn: Lsn) -> Result> { + // fetch directory entry + let buf = self.tline.get(TWOPHASEDIR_KEY, lsn)?; + let dir = TwoPhaseDirectory::des(&buf)?; + + Ok(dir.xids) + } + + pub fn get_control_file(&self, lsn: Lsn) -> Result { + self.tline.get(CONTROLFILE_KEY, lsn) + } + + pub fn get_checkpoint(&self, lsn: Lsn) -> Result { + self.tline.get(CHECKPOINT_KEY, lsn) + } + + //------------------------------------------------------------------------------ + // Public PUT functions, to update the repository with new page versions. + // + // These are called by the WAL receiver to digest WAL records. + //------------------------------------------------------------------------------ + + /// Atomically get both last and prev. + pub fn get_last_record_rlsn(&self) -> RecordLsn { + self.tline.get_last_record_rlsn() + } + + /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. + pub fn get_last_record_lsn(&self) -> Lsn { + self.tline.get_last_record_lsn() + } + + pub fn get_prev_record_lsn(&self) -> Lsn { + self.tline.get_prev_record_lsn() + } + + pub fn get_disk_consistent_lsn(&self) -> Lsn { + self.tline.get_disk_consistent_lsn() + } + + /// This provides a "transaction-like" interface to updating the data + /// + /// To ingest a WAL record, call begin_record(lsn) to get a writer + /// object. Use the functions in the writer-object to modify the + /// repository state, updating all the pages and metadata that the + /// WAL record affects. When you're done, call writer.finish() to + /// commit the changes. + /// + /// Note that any pending modifications you make through the writer + /// won't be visible to calls to the get functions until you finish! + /// If you update the same page twice, the last update wins. + /// + pub fn begin_record(&self, lsn: Lsn) -> DatadirTimelineWriter { + DatadirTimelineWriter { + tline: self, + lsn, + pending_updates: HashMap::new(), + pending_deletions: Vec::new(), + } + } + + /// + /// Check that it is valid to request operations with that lsn. 
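The transaction-like writer interface described above is easiest to see end to end. A hedged usage sketch, assuming the types from this patch's crate are in scope (`DatadirTimeline`, `Repository`, `RelTag`, `Bytes`, `Lsn`); the LSN and the page image are placeholders:

```rust
use anyhow::Result;

/// Ingest one "record" that creates a relation and writes its first block,
/// using begin_record()/finish() from this patch.
fn ingest_example<R: Repository>(
    tline: &DatadirTimeline<R>,
    rel: RelTag,
    img: Bytes,
) -> Result<()> {
    let mut writer = tline.begin_record(Lsn(0x20)); // placeholder LSN
    writer.put_rel_creation(rel, 0)?; // register the rel in its RelDirectory
    writer.put_rel_page_image(rel, 0, img)?; // contents of block 0
    writer.put_rel_extend(rel, 1)?; // record the new size: one block
    writer.finish() // none of the puts are visible to readers before this
}
```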
+ pub fn check_lsn_is_in_scope( + &self, + lsn: Lsn, + latest_gc_cutoff_lsn: &RwLockReadGuard, + ) -> Result<()> { + self.tline.check_lsn_is_in_scope(lsn, latest_gc_cutoff_lsn) + } + + /// Retrieve current logical size of the timeline + /// + /// NOTE: counted incrementally, includes ancestors, + /// doesnt support TwoPhase relishes yet + pub fn get_current_logical_size(&self) -> usize { + //todo!() + 0 + } + + /// Does the same as get_current_logical_size but counted on demand. + /// Used in tests to ensure that incremental and non incremental variants match. + pub fn get_current_logical_size_non_incremental(&self, _lsn: Lsn) -> Result { + //todo!() + Ok(0) + } +} + +pub struct DatadirTimelineWriter<'a, R: Repository> { + tline: &'a DatadirTimeline, + + lsn: Lsn, + pending_updates: HashMap, + pending_deletions: Vec>, +} + +// TODO Currently, Deref is used to allow easy access to read methods from this trait. +// This is probably considered a bad practice in Rust and should be fixed eventually, +// but will cause large code changes. +impl<'a, R: Repository> std::ops::Deref for DatadirTimelineWriter<'a, R> { + type Target = DatadirTimeline; + + fn deref(&self) -> &Self::Target { + self.tline + } +} + +/// Various functions to mutate the repository state. +impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { + pub fn init_empty(&mut self) -> Result<()> { + let buf = DbDirectory::ser(&DbDirectory { + dbs: HashSet::new(), + })?; + self.put(DBDIR_KEY, Value::Image(buf.into())); + + let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory { + xids: HashSet::new(), + })?; + self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); + + let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory { + segments: HashSet::new(), + })? + .into(); + self.put(slru_dir_to_key(SlruKind::Clog), Value::Image(buf.clone())); + self.put( + slru_dir_to_key(SlruKind::MultiXactMembers), + Value::Image(buf.clone()), + ); + self.put( + slru_dir_to_key(SlruKind::MultiXactOffsets), + Value::Image(buf), + ); + + Ok(()) + } + + /// Put a new page version that can be constructed from a WAL record + /// + /// NOTE: this will *not* implicitly extend the relation, if the page is beyond the + /// current end-of-file. It's up to the caller to check that the relation size + /// matches the blocks inserted! + pub fn put_rel_wal_record( + &mut self, + rel: RelTag, + blknum: BlockNumber, + rec: ZenithWalRecord, + ) -> Result<()> { + self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec)); + Ok(()) + } + + pub fn put_slru_wal_record( + &mut self, + kind: SlruKind, + segno: u32, + blknum: BlockNumber, + rec: ZenithWalRecord, + ) -> Result<()> { + self.put( + slru_block_to_key(kind, segno, blknum), + Value::WalRecord(rec), + ); + Ok(()) + } + + /// Like put_wal_record, but with ready-made image of the page. 
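The writer batches everything in `pending_updates`, keyed by `Key`, so a second `put` to the same key simply replaces the pending value and nothing is visible until `finish`. That behavior in miniature, with a hypothetical `ToyWriter` over a plain `HashMap` standing in for the timeline:

```rust
use std::collections::HashMap;

/// DatadirTimelineWriter's batching in miniature: puts land in a pending map
/// keyed by Key, so the last update to a key wins, and nothing reaches the
/// underlying store until finish().
struct ToyWriter<'a> {
    store: &'a mut HashMap<u64, String>, // stand-in for the timeline
    pending: HashMap<u64, String>,
}

impl<'a> ToyWriter<'a> {
    fn put(&mut self, key: u64, val: String) {
        self.pending.insert(key, val); // overwrites any earlier pending value
    }

    fn finish(self) {
        for (k, v) in self.pending {
            self.store.insert(k, v);
        }
    }
}

fn main() {
    let mut store = HashMap::new();
    let mut w = ToyWriter {
        store: &mut store,
        pending: HashMap::new(),
    };
    w.put(1, "first".into());
    w.put(1, "second".into()); // last update wins
    w.finish();
    assert_eq!(store.get(&1).map(String::as_str), Some("second"));
}
```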
+ pub fn put_rel_page_image( + &mut self, + rel: RelTag, + blknum: BlockNumber, + img: Bytes, + ) -> Result<()> { + self.put(rel_block_to_key(rel, blknum), Value::Image(img)); + Ok(()) + } + + pub fn put_slru_page_image( + &mut self, + kind: SlruKind, + segno: u32, + blknum: BlockNumber, + img: Bytes, + ) -> Result<()> { + self.put(slru_block_to_key(kind, segno, blknum), Value::Image(img)); + Ok(()) + } + + pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> Result<()> { + // Add it to the directory (if it doesn't exist already) + let buf = self.get(DBDIR_KEY)?; + let mut dir = DbDirectory::des(&buf)?; + if dir.dbs.insert((spcnode, dbnode)) { + let buf = DbDirectory::ser(&dir)?; + self.put(DBDIR_KEY, Value::Image(buf.into())); + } + + self.put(relmap_file_key(spcnode, dbnode), Value::Image(img)); + Ok(()) + } + + pub fn put_twophase_file(&mut self, xid: TransactionId, img: Bytes) -> Result<()> { + // Add it to the directory entry + let buf = self.get(TWOPHASEDIR_KEY)?; + let mut dir = TwoPhaseDirectory::des(&buf)?; + if !dir.xids.insert(xid) { + bail!("twophase file for xid {} already exists", xid); + } + self.put( + TWOPHASEDIR_KEY, + Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), + ); + + self.put(twophase_file_key(xid), Value::Image(img)); + Ok(()) + } + + pub fn put_control_file(&mut self, img: Bytes) -> Result<()> { + self.put(CONTROLFILE_KEY, Value::Image(img)); + Ok(()) + } + + pub fn put_checkpoint(&mut self, img: Bytes) -> Result<()> { + self.put(CHECKPOINT_KEY, Value::Image(img)); + Ok(()) + } + + pub fn put_dbdir_creation(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> { + // Create RelDirectory + let dir_key = rel_dir_to_key(spcnode, dbnode); + + let dir = RelDirectory { + rels: HashSet::new(), + }; + let buf: Bytes = RelDirectory::ser(&dir)?.into(); + self.put(dir_key, Value::Image(buf)); + Ok(()) + } + + pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> { + // Remove entry from dbdir + let buf = self.get(DBDIR_KEY)?; + let mut dir = DbDirectory::des(&buf)?; + if dir.dbs.remove(&(spcnode, dbnode)) { + let buf = DbDirectory::ser(&dir)?; + self.put(DBDIR_KEY, Value::Image(buf.into())); + } else { + warn!( + "dropped dbdir for spcnode {} dbnode {} did not exist in db directory", + spcnode, dbnode + ); + } + + // Delete all relations and metadata files for the spcnode/dnode + self.delete(dbdir_key_range(spcnode, dbnode)); + Ok(()) + } + + // When a new relish is created: + // - create/update the directory entry to remember that it exists + // - create relish header to indicate the size (0) + + // When a relish is extended: + // - update relish header with new size + // - insert the block + + // when a relish is truncated: + // - delete truncated blocks + // - update relish header with size + + pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { + info!("CREAT: {}", rel); + // Add it to the directory entry + let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); + let buf = self.get(dir_key)?; + let mut dir = RelDirectory::des(&buf)?; + + if !dir.rels.insert((rel.relnode, rel.forknum)) { + bail!("rel {} already exists", rel); + } + self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?))); + + // Put size + let size_key = rel_size_to_key(rel); + let buf = nblocks.to_le_bytes(); + self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + + // even if nblocks > 0, we don't insert any actual blocks here + + Ok(()) + } + + /// Truncate relation + pub fn put_rel_truncation(&mut 
self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { + // Put size + let size_key = rel_size_to_key(rel); + let buf = nblocks.to_le_bytes(); + self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + Ok(()) + } + + pub fn put_slru_segment_creation( + &mut self, + kind: SlruKind, + segno: u32, + nblocks: BlockNumber, + ) -> Result<()> { + // Add it to the directory entry + let dir_key = slru_dir_to_key(kind); + let buf = self.get(dir_key)?; + let mut dir = SlruSegmentDirectory::des(&buf)?; + + if !dir.segments.insert(segno) { + bail!("slru segment {:?}/{} already exists", kind, segno); + } + self.put( + dir_key, + Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), + ); + + // Put size + let size_key = slru_segment_size_to_key(kind, segno); + let buf = nblocks.to_le_bytes(); + self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + + // even if nblocks > 0, we don't insert any actual blocks here + + Ok(()) + } + + /// Extend SLRU segment + pub fn put_slru_extend( + &mut self, + kind: SlruKind, + segno: u32, + nblocks: BlockNumber, + ) -> Result<()> { + // Put size + let size_key = slru_segment_size_to_key(kind, segno); + let buf = nblocks.to_le_bytes(); + self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + Ok(()) + } + + /// Extend relation + pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { + // Put size + let size_key = rel_size_to_key(rel); + let buf = nblocks.to_le_bytes(); + self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + Ok(()) + } + + /// This method is used for marking dropped relations and truncated SLRU files and aborted two phase records + pub fn put_rel_drop(&mut self, rel: RelTag) -> Result<()> { + // Remove it from the directory entry + let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); + let buf = self.get(dir_key)?; + let mut dir = RelDirectory::des(&buf)?; + + if dir.rels.remove(&(rel.relnode, rel.forknum)) { + self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?))); + } else { + warn!("dropped rel {} did not exist in rel directory", rel); + } + + // Delete size entry, as well as all blocks + self.delete(rel_key_range(rel)); + + Ok(()) + } + + /// This method is used for marking dropped relations and truncated SLRU files and aborted two phase records + pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<()> { + // TODO + Ok(()) + } + + /// This method is used for marking truncated SLRU files + pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> Result<()> { + // Remove it from the directory entry + let dir_key = slru_dir_to_key(kind); + let buf = self.get(dir_key)?; + let mut dir = SlruSegmentDirectory::des(&buf)?; + + if !dir.segments.remove(&segno) { + warn!("slru segment {:?}/{} does not exist", kind, segno); + } + self.put( + dir_key, + Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)), + ); + + // Delete size entry, as well as all blocks + self.delete(slru_segment_key_range(kind, segno)); + + Ok(()) + } + + /// This method is used for marking truncated SLRU files + pub fn drop_twophase_file(&mut self, xid: TransactionId) -> Result<()> { + // Remove it from the directory entry + let buf = self.get(TWOPHASEDIR_KEY)?; + let mut dir = TwoPhaseDirectory::des(&buf)?; + + if !dir.xids.remove(&xid) { + warn!("twophase file for xid {} does not exist", xid); + } + self.put( + TWOPHASEDIR_KEY, + Value::Image(Bytes::from(TwoPhaseDirectory::ser(&dir)?)), + ); + + // Delete it + self.delete(twophase_key_range(xid)); + + Ok(()) + } + 
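All the `put_*`/`drop_*` functions above share one pattern: fetch a directory value, deserialize it into a set, mutate it, re-serialize, and put it back within the same batched record. A stand-alone sketch of that round trip; `bincode` and a `RelDir` alias are illustrative stand-ins for `zenith_utils::bin_ser` and `RelDirectory` (assumes the `bincode` and `anyhow` crates):

```rust
use std::collections::HashSet;

/// RelDirectory in miniature: the set of (relnode, forknum) pairs that exist.
type RelDir = HashSet<(u32, u8)>;

/// Add a relation to a serialized directory, failing if it already exists,
/// like put_rel_creation above.
fn add_rel(dir_bytes: &[u8], relnode: u32, forknum: u8) -> anyhow::Result<Vec<u8>> {
    let mut dir: RelDir = bincode::deserialize(dir_bytes)?;
    if !dir.insert((relnode, forknum)) {
        anyhow::bail!("rel {}/{} already exists", relnode, forknum);
    }
    Ok(bincode::serialize(&dir)?)
}

fn main() -> anyhow::Result<()> {
    let empty = bincode::serialize(&RelDir::new())?;
    let one = add_rel(&empty, 1000, 0)?;
    assert!(add_rel(&one, 1000, 0).is_err()); // duplicate create is an error
    Ok(())
}
```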
+ pub fn finish(self) -> Result<()> { + let writer = self.tline.tline.writer(); + + for (key, value) in self.pending_updates { + writer.put(key, self.lsn, value)?; + } + for key_range in self.pending_deletions { + writer.delete(key_range, self.lsn)?; + } + + writer.advance_last_record_lsn(self.lsn); + + Ok(()) + } + + // Internal helper functions to batch the modifications + + fn get(&self, key: Key) -> Result { + // Note: we don't check pending_deletions. It is an error to request a value + // that has been removed, deletion only avoids leaking storage. + + if let Some(value) = self.pending_updates.get(&key) { + if let Value::Image(img) = value { + Ok(img.clone()) + } else { + // Currently, we never need to read back a WAL record that we + // inserted in the same "transaction". All the metadata updates + // work directly with Images, and we never need to read actual + // data pages. We could handle this if we had to, by calling + // the walredo manager, but let's keep it simple for now. + bail!("unexpected pending WAL record"); + } + } else { + let last_lsn = self.tline.get_last_record_lsn(); + self.tline.tline.get(key, last_lsn) + } + } + + fn put(&mut self, key: Key, val: Value) { + self.pending_updates.insert(key, val); + } + + fn delete(&mut self, key_range: Range) { + info!("DELETE {}-{}", key_range.start, key_range.end); + self.pending_deletions.push(key_range); + } +} + +// Utilities to pack stuff in Key + +// +// Key space: +// +// blocky stuff: relations and SLRUs +// +// DbDir () -> (dbnode, spcnode) +// +// Filenodemap +// +// RelDir -> relnode forknum +// +// RelBlocks +// +// RelSize +// +// Slrus +// +// SlruDir kind +// +// SlruSegBlocks segno +// +// SlruSegSize +// +// pg_twophase +// +// controlfile +// checkpoint +// + +// DbDir: +// 00 00000000 00000000 00000000 00 00000000 +// +// Filenodemap: +// 00 SPCNODE DBNODE 00000000 00 00000000 +// +// RelDir: +// 00 SPCNODE DBNODE 00000000 00 00000001 +// +// RelBlock: +// 00 SPCNODE DBNODE RELNODE FORK BLKNUM +// +// RelSize: +// 00 SPCNODE DBNODE RELNODE FORK FFFFFFFF +// +// SlruDir: +// 01 kind 00000000 00000000 00 00000000 +// +// SlruSegBlock: +// 01 kind 00000001 SEGNO 00 BLKNUM +// +// SlruSegSize: +// 01 kind 00000001 SEGNO 00 FFFFFFFF +// +// TwoPhaseDir: +// 02 00000000 00000000 00000000 00 00000000 +// +// TwoPhaseFile: +// 02 00000000 00000000 00000000 00 XID +// +// ControlFile: +// 03 00000000 00000000 00000000 00 00000000 +// +// Checkpoint: +// 03 00000000 00000000 00000000 00 00000001 + +const DBDIR_KEY: Key = Key { + field1: 0x00, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, +}; + +const TWOPHASEDIR_KEY: Key = Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, +}; + +const CONTROLFILE_KEY: Key = Key { + field1: 0x03, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 0, +}; + +const CHECKPOINT_KEY: Key = Key { + field1: 0x03, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: 1, +}; + +pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: blknum, + } +} + +pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key { + Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 1, + } +} + +pub fn rel_size_to_key(rel: RelTag) -> Key { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: 
0xffffffff, + } +} + +pub fn slru_dir_to_key(kind: SlruKind) -> Key { + Key { + field1: 0x01, + field2: match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }, + field3: 0, + field4: 0, + field5: 0, + field6: 0, + } +} + +pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key { + Key { + field1: 0x01, + field2: match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }, + field3: 1, + field4: segno, + field5: 0, + field6: blknum, + } +} + +pub fn rel_key_range(rel: RelTag) -> Range { + Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum, + field6: 0, + }..Key { + field1: 0x00, + field2: rel.spcnode, + field3: rel.dbnode, + field4: rel.relnode, + field5: rel.forknum + 1, + field6: 0, + } +} + +pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key { + Key { + field1: 0x01, + field2: match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }, + field3: 1, + field4: segno, + field5: 0, + field6: 0xffffffff, + } +} + +pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range { + let field2 = match kind { + SlruKind::Clog => 0x00, + SlruKind::MultiXactMembers => 0x01, + SlruKind::MultiXactOffsets => 0x02, + }; + + Key { + field1: 0x01, + field2, + field3: segno, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: 0x01, + field2, + field3: segno, + field4: 0, + field5: 1, + field6: 0, + } +} + +pub fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key { + Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 0, + } +} + +pub fn twophase_file_key(xid: TransactionId) -> Key { + Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: xid, + } +} + +pub fn twophase_key_range(xid: TransactionId) -> Range { + let (next_xid, overflowed) = xid.overflowing_add(1); + + Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: 0, + field6: xid, + }..Key { + field1: 0x02, + field2: 0, + field3: 0, + field4: 0, + field5: if overflowed { 1 } else { 0 }, + field6: next_xid, + } +} + +pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> { + Ok(match key.field1 { + 0x00 => ( + RelTag { + spcnode: key.field2, + dbnode: key.field3, + relnode: key.field4, + forknum: key.field5, + }, + key.field6, + ), + _ => bail!("unexpected value kind 0x{:02x}", key.field1), + }) +} + +pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> { + Ok(match key.field1 { + 0x01 => { + let kind = match key.field2 { + 0x00 => SlruKind::Clog, + 0x01 => SlruKind::MultiXactMembers, + 0x02 => SlruKind::MultiXactOffsets, + _ => bail!("unrecognized slru kind 0x{:02x}", key.field2), + }; + let segno = key.field4; + let blknum = key.field6; + + (kind, segno, blknum) + } + _ => bail!("unexpected value kind 0x{:02x}", key.field1), + }) +} + +pub fn key_to_relish_block(key: Key) -> Result<(RelishTag, BlockNumber)> { + // FIXME: there's got to be a bitfields crate or something out there to do this for us.. + + // This only works for keys for blocks that are handled by WalRedo manager. 
+ // TODO: assert that the other fields are zero + + Ok(match key.field1 { + 0x00 => ( + RelishTag::Relation(RelTag { + spcnode: key.field2, + dbnode: key.field3, + relnode: key.field4, + forknum: key.field5, + }), + key.field6, + ), + + 0x01 => ( + RelishTag::Slru { + slru: match key.field2 { + 0x00 => SlruKind::Clog, + 0x01 => SlruKind::MultiXactMembers, + 0x02 => SlruKind::MultiXactOffsets, + _ => bail!("unrecognized slru kind 0x{:02x}", key.field2), + }, + segno: key.field4, + }, + key.field6, + ), + + _ => bail!("unrecognized value kind 0x{:02x}", key.field1), + }) +} + +pub fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range { + Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0, + field5: 0, + field6: 0, + }..Key { + field1: 0x00, + field2: spcnode, + field3: dbnode, + field4: 0xffffffff, + field5: 0xff, + field6: 0xffffffff, + } +} + +/// +/// Tests that should work the same with any Repository/Timeline implementation. +/// + +#[cfg(test)] +pub fn create_test_timeline( + repo: R, + timeline_id: zenith_utils::zid::ZTimelineId, +) -> Result>> { + let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?; + let tline = DatadirTimeline::new(tline); + let mut writer = tline.begin_record(Lsn(8)); + writer.init_empty()?; + + writer.put_dbdir_creation(0, 111)?; + + writer.finish()?; + Ok(Arc::new(tline)) +} + +#[allow(clippy::bool_assert_comparison)] +#[cfg(test)] +mod tests { + //use super::repo_harness::*; + //use super::*; + + /* + fn assert_current_logical_size(timeline: &DatadirTimeline, lsn: Lsn) { + let incremental = timeline.get_current_logical_size(); + let non_incremental = timeline + .get_current_logical_size_non_incremental(lsn) + .unwrap(); + assert_eq!(incremental, non_incremental); + } + */ + + /* + /// + /// Test list_rels() function, with branches and dropped relations + /// + #[test] + fn test_list_rels_drop() -> Result<()> { + let repo = RepoHarness::create("test_list_rels_drop")?.load(); + let tline = create_empty_timeline(repo, TIMELINE_ID)?; + const TESTDB: u32 = 111; + + // Import initial dummy checkpoint record, otherwise the get_timeline() call + // after branching fails below + let mut writer = tline.begin_record(Lsn(0x10)); + writer.put_checkpoint(ZERO_CHECKPOINT.clone())?; + writer.finish()?; + + // Create a relation on the timeline + let mut writer = tline.begin_record(Lsn(0x20)); + writer.put_rel_page_image(TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; + writer.finish()?; + + let writer = tline.begin_record(Lsn(0x00)); + writer.finish()?; + + // Check that list_rels() lists it after LSN 2, but no before it + assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&TESTREL_A)); + assert!(tline.list_rels(0, TESTDB, Lsn(0x20))?.contains(&TESTREL_A)); + assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A)); + + // Create a branch, check that the relation is visible there + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; + let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { + Some(timeline) => timeline, + None => panic!("Should have a local timeline"), + }; + let newtline = DatadirTimelineImpl::new(newtline); + assert!(newtline + .list_rels(0, TESTDB, Lsn(0x30))? + .contains(&TESTREL_A)); + + // Drop it on the branch + let mut new_writer = newtline.begin_record(Lsn(0x40)); + new_writer.drop_relation(TESTREL_A)?; + new_writer.finish()?; + + // Check that it's no longer listed on the branch after the point where it was dropped + assert!(newtline + .list_rels(0, TESTDB, Lsn(0x30))? 
+ .contains(&TESTREL_A)); + assert!(!newtline + .list_rels(0, TESTDB, Lsn(0x40))? + .contains(&TESTREL_A)); + + // Run checkpoint and garbage collection and check that it's still not visible + newtline.tline.checkpoint(CheckpointConfig::Forced)?; + repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?; + + assert!(!newtline + .list_rels(0, TESTDB, Lsn(0x40))? + .contains(&TESTREL_A)); + + Ok(()) + } + */ + + /* + #[test] + fn test_read_beyond_eof() -> Result<()> { + let repo = RepoHarness::create("test_read_beyond_eof")?.load(); + let tline = create_test_timeline(repo, TIMELINE_ID)?; + + make_some_layers(&tline, Lsn(0x20))?; + let mut writer = tline.begin_record(Lsn(0x60)); + walingest.put_rel_page_image( + &mut writer, + TESTREL_A, + 0, + TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x60))), + )?; + writer.finish()?; + + // Test read before rel creation. Should error out. + assert!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x10)).is_err()); + + // Read block beyond end of relation at different points in time. + // These reads should fall into different delta, image, and in-memory layers. + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x20))?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x25))?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x30))?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x35))?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40))?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x45))?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50))?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x55))?, ZERO_PAGE); + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60))?, ZERO_PAGE); + + // Test on an in-memory layer with no preceding layer + let mut writer = tline.begin_record(Lsn(0x70)); + walingest.put_rel_page_image( + &mut writer, + TESTREL_B, + 0, + TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x70))), + )?; + writer.finish()?; + + assert_eq!(tline.get_rel_page_at_lsn(TESTREL_B, 1, Lsn(0x70))?, ZERO_PAGE); + + Ok(()) + } + */ +} diff --git a/pageserver/src/remote_storage/local_fs.rs b/pageserver/src/remote_storage/local_fs.rs index 01f6028d17..b6ebd004b5 100644 --- a/pageserver/src/remote_storage/local_fs.rs +++ b/pageserver/src/remote_storage/local_fs.rs @@ -641,7 +641,7 @@ mod fs_tests { } async fn upload_dummy_file( - harness: &RepoHarness, + harness: &RepoHarness<'_>, storage: &LocalFs, name: &str, ) -> anyhow::Result { diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs index 6b588c8e5f..f6fb112610 100644 --- a/pageserver/src/remote_storage/storage_sync.rs +++ b/pageserver/src/remote_storage/storage_sync.rs @@ -881,7 +881,7 @@ mod test_utils { #[track_caller] pub async fn ensure_correct_timeline_upload( - harness: &RepoHarness, + harness: &RepoHarness<'_>, remote_assets: Arc<(LocalFs, RwLock)>, timeline_id: ZTimelineId, new_upload: NewCheckpoint, diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 49aa04ea7c..b1aedd984a 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -1,19 +1,128 @@ -use crate::relish::*; -use crate::walrecord::MultiXactMember; +use crate::walrecord::ZenithWalRecord; use crate::CheckpointConfig; -use anyhow::Result; +use anyhow::{bail, Result}; use bytes::Bytes; -use postgres_ffi::{MultiXactId, MultiXactOffset, TransactionId}; use serde::{Deserialize, Serialize}; 
-use std::collections::HashSet;
-use std::ops::AddAssign;
+use std::fmt;
+use std::ops::{AddAssign, Range};
 use std::sync::{Arc, RwLockReadGuard};
 use std::time::Duration;
 use zenith_utils::lsn::{Lsn, RecordLsn};
 use zenith_utils::zid::ZTimelineId;
 
-/// Block number within a relish. This matches PostgreSQL's BlockNumber type.
-pub type BlockNumber = u32;
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
+/// Key used in the Repository kv-store.
+///
+/// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs
+/// for what we actually store in these fields.
+pub struct Key {
+    pub field1: u8,
+    pub field2: u32,
+    pub field3: u32,
+    pub field4: u32,
+    pub field5: u8,
+    pub field6: u32,
+}
+
+impl Key {
+
+    pub fn next(&self) -> Key {
+        let mut key = self.clone();
+
+        let x = key.field6.overflowing_add(1);
+        key.field6 = x.0;
+        if x.1 {
+            let x = key.field5.overflowing_add(1);
+            key.field5 = x.0;
+            if x.1 {
+                let x = key.field4.overflowing_add(1);
+                key.field4 = x.0;
+                if x.1 {
+                    let x = key.field3.overflowing_add(1);
+                    key.field3 = x.0;
+                    if x.1 {
+                        let x = key.field2.overflowing_add(1);
+                        key.field2 = x.0;
+                        if x.1 {
+                            let x = key.field1.overflowing_add(1);
+                            key.field1 = x.0;
+                            assert!(!x.1);
+                        }
+                    }
+                }
+            }
+        }
+        key
+    }
+}
+
+impl fmt::Display for Key {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            f,
+            "{:02X}{:08X}{:08X}{:08X}{:02X}{:08X}",
+            self.field1, self.field2, self.field3, self.field4, self.field5, self.field6
+        )
+    }
+}
+
+impl Key {
+    pub const MIN: Key = Key {
+        field1: u8::MIN,
+        field2: u32::MIN,
+        field3: u32::MIN,
+        field4: u32::MIN,
+        field5: u8::MIN,
+        field6: u32::MIN,
+    };
+    pub const MAX: Key = Key {
+        field1: u8::MAX,
+        field2: u32::MAX,
+        field3: u32::MAX,
+        field4: u32::MAX,
+        field5: u8::MAX,
+        field6: u32::MAX,
+    };
+
+    pub fn from_hex(s: &str) -> Result<Self> {
+        if s.len() != 36 {
+            bail!("parse error");
+        }
+        Ok(Key {
+            field1: u8::from_str_radix(&s[0..2], 16)?,
+            field2: u32::from_str_radix(&s[2..10], 16)?,
+            field3: u32::from_str_radix(&s[10..18], 16)?,
+            field4: u32::from_str_radix(&s[18..26], 16)?,
+            field5: u8::from_str_radix(&s[26..28], 16)?,
+            field6: u32::from_str_radix(&s[28..36], 16)?,
+        })
+    }
+
+    pub fn to_prefix_128(&self) -> u128 {
+        assert!(self.field1 & 0xf0 == 0);
+        (self.field1 as u128) << 124
+            | (self.field2 as u128) << 92
+            | (self.field3 as u128) << 60
+            | (self.field4 as u128) << 28
+            | (self.field5 as u128) << 20
+            | (self.field6 as u128) >> 12
+    }
+}
+
+//
+// There are two kinds of values: incremental and non-incremental
+//
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum Value {
+    Image(Bytes),
+    WalRecord(ZenithWalRecord),
+}
+
+impl Value {
+    pub fn is_image(&self) -> bool {
+        matches!(self, Value::Image(_))
+    }
+}
 
 ///
 /// A repository corresponds to one .zenith directory. One repository holds multiple
@@ -132,42 +241,22 @@ impl TimelineSyncState {
 ///
 #[derive(Default)]
 pub struct GcResult {
-    pub ondisk_relfiles_total: u64,
-    pub ondisk_relfiles_needed_by_cutoff: u64,
-    pub ondisk_relfiles_needed_by_branches: u64,
-    pub ondisk_relfiles_not_updated: u64,
-    pub ondisk_relfiles_needed_as_tombstone: u64,
-    pub ondisk_relfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
-    pub ondisk_relfiles_dropped: u64, // # of layer files removed because the relation was dropped
-
-    pub ondisk_nonrelfiles_total: u64,
-    pub ondisk_nonrelfiles_needed_by_cutoff: u64,
-    pub ondisk_nonrelfiles_needed_by_branches: u64,
-    pub ondisk_nonrelfiles_not_updated: u64,
-    pub ondisk_nonrelfiles_needed_as_tombstone: u64,
-    pub ondisk_nonrelfiles_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
-    pub ondisk_nonrelfiles_dropped: u64, // # of layer files removed because the relation was dropped
+    pub layers_total: u64,
+    pub layers_needed_by_cutoff: u64,
+    pub layers_needed_by_branches: u64,
+    pub layers_not_updated: u64,
+    pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
 
     pub elapsed: Duration,
 }
 
 impl AddAssign for GcResult {
     fn add_assign(&mut self, other: Self) {
-        self.ondisk_relfiles_total += other.ondisk_relfiles_total;
-        self.ondisk_relfiles_needed_by_cutoff += other.ondisk_relfiles_needed_by_cutoff;
-        self.ondisk_relfiles_needed_by_branches += other.ondisk_relfiles_needed_by_branches;
-        self.ondisk_relfiles_not_updated += other.ondisk_relfiles_not_updated;
-        self.ondisk_relfiles_needed_as_tombstone += other.ondisk_relfiles_needed_as_tombstone;
-        self.ondisk_relfiles_removed += other.ondisk_relfiles_removed;
-        self.ondisk_relfiles_dropped += other.ondisk_relfiles_dropped;
-
-        self.ondisk_nonrelfiles_total += other.ondisk_nonrelfiles_total;
-        self.ondisk_nonrelfiles_needed_by_cutoff += other.ondisk_nonrelfiles_needed_by_cutoff;
-        self.ondisk_nonrelfiles_needed_by_branches += other.ondisk_nonrelfiles_needed_by_branches;
-        self.ondisk_nonrelfiles_not_updated += other.ondisk_nonrelfiles_not_updated;
-        self.ondisk_nonrelfiles_needed_as_tombstone += other.ondisk_nonrelfiles_needed_as_tombstone;
-        self.ondisk_nonrelfiles_removed += other.ondisk_nonrelfiles_removed;
-        self.ondisk_nonrelfiles_dropped += other.ondisk_nonrelfiles_dropped;
+        self.layers_total += other.layers_total;
+        self.layers_needed_by_cutoff += other.layers_needed_by_cutoff;
+        self.layers_needed_by_branches += other.layers_needed_by_branches;
+        self.layers_not_updated += other.layers_not_updated;
+        self.layers_removed += other.layers_removed;
 
         self.elapsed += other.elapsed;
     }
@@ -190,23 +279,14 @@ pub trait Timeline: Send + Sync {
     fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard<Lsn>;
 
     /// Look up given page version.
-    fn get_page_at_lsn(&self, tag: RelishTag, blknum: BlockNumber, lsn: Lsn) -> Result<Bytes>;
-
-    /// Get size of a relish
-    fn get_relish_size(&self, tag: RelishTag, lsn: Lsn) -> Result<Option<BlockNumber>>;
-
-    /// Does relation exist?
-    fn get_rel_exists(&self, tag: RelishTag, lsn: Lsn) -> Result<bool>;
-
-    /// Get a list of all existing relations
-    /// Pass RelTag to get relation objects or None to get nonrels.
-    fn list_relishes(&self, tag: Option<RelTag>, lsn: Lsn) -> Result<HashSet<RelishTag>>;
-
-    /// Get a list of all existing relations in given tablespace and database.
-    fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result<HashSet<RelishTag>>;
-
-    /// Get a list of all existing non-relational objects
-    fn list_nonrels(&self, lsn: Lsn) -> Result<HashSet<RelishTag>>;
+    ///
+    /// NOTE: It is considered an error to 'get' a key that doesn't exist. The abstraction
+    /// above this needs to store suitable metadata to track what data exists with
+    /// what keys, in separate metadata entries. If a non-existent key is requested,
+    /// the Repository implementation may incorrectly return a value from an ancestor
+    /// branch, for example, or waste a lot of cycles chasing the non-existent key.
+    ///
+    fn get(&self, key: Key, lsn: Lsn) -> Result<Bytes>;
 
     /// Get the ancestor's timeline id
     fn get_ancestor_timeline_id(&self) -> Option<ZTimelineId>;
@@ -243,6 +323,8 @@ pub trait Timeline: Send + Sync {
     /// know anything about them here in the repository.
     fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>;
 
+    fn create_images(&self, threshold: usize) -> Result<()>;
+
     ///
     /// Check that it is valid to request operations with that lsn.
     fn check_lsn_is_in_scope(
         &self,
         lsn: Lsn,
         latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
     ) -> Result<()>;
-
-    /// Retrieve current logical size of the timeline
-    ///
-    /// NOTE: counted incrementally, includes ancestors,
-    /// doesnt support TwoPhase relishes yet
-    fn get_current_logical_size(&self) -> usize;
-
-    /// Does the same as get_current_logical_size but counted on demand.
-    /// Used in tests to ensure that incremental and non incremental variants match.
-    fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>;
 }
 
 /// Various functions to mutate the timeline.
@@ -271,28 +343,9 @@ pub trait TimelineWriter<'a> {
     /// Put a new page version that can be constructed from a WAL record
     ///
    /// This will implicitly extend the relation, if the page is beyond the
    /// current end-of-file.
-    fn put_wal_record(
-        &self,
-        lsn: Lsn,
-        tag: RelishTag,
-        blknum: BlockNumber,
-        rec: ZenithWalRecord,
-    ) -> Result<()>;
+    fn put(&self, key: Key, lsn: Lsn, value: Value) -> Result<()>;
 
-    /// Like put_wal_record, but with ready-made image of the page.
-    fn put_page_image(
-        &self,
-        tag: RelishTag,
-        blknum: BlockNumber,
-        lsn: Lsn,
-        img: Bytes,
-    ) -> Result<()>;
-
-    /// Truncate relation
-    fn put_truncation(&self, rel: RelishTag, lsn: Lsn, nblocks: BlockNumber) -> Result<()>;
-
-    /// This method is used for marking dropped relations and truncated SLRU files and aborted two phase records
-    fn drop_relish(&self, tag: RelishTag, lsn: Lsn) -> Result<()>;
+    fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()>;
 
     /// Track end of the latest digested WAL record.
     ///
@@ -301,53 +354,14 @@ pub trait TimelineWriter<'a> {
     fn advance_last_record_lsn(&self, lsn: Lsn);
 }
 
-/// Each update to a page is represented by a ZenithWalRecord. It can be a wrapper
-/// around a PostgreSQL WAL record, or a custom zenith-specific "record".
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
-pub enum ZenithWalRecord {
-    /// Native PostgreSQL WAL record
-    Postgres { will_init: bool, rec: Bytes },
-
-    /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
-    ClearVisibilityMapFlags {
-        new_heap_blkno: Option<u32>,
-        old_heap_blkno: Option<u32>,
-        flags: u8,
-    },
-    /// Mark transaction IDs as committed on a CLOG page
-    ClogSetCommitted { xids: Vec<TransactionId> },
-    /// Mark transaction IDs as aborted on a CLOG page
-    ClogSetAborted { xids: Vec<TransactionId> },
-    /// Extend multixact offsets SLRU
-    MultixactOffsetCreate {
-        mid: MultiXactId,
-        moff: MultiXactOffset,
-    },
-    /// Extend multixact members SLRU.
-    MultixactMembersCreate {
-        moff: MultiXactOffset,
-        members: Vec<MultiXactMember>,
-    },
-}
-
-impl ZenithWalRecord {
-    /// Does replaying this WAL record initialize the page from scratch, or does
-    /// it need to be applied over the previous image of the page?
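[Illustrative aside, not part of the patch: the NOTE on `get()` above pushes existence-tracking up to the layer above the Repository. A sketch of that bookkeeping under assumed conventions: a hypothetical `dir_key` holds a serde_json-encoded `HashSet<Key>` of the keys that exist. The real scheme lives in pgdatadir_mapping.rs and is not JSON; serde_json here is an assumption for illustration only.]

use std::collections::HashSet;

fn get_if_exists<T: Timeline>(
    tline: &T,
    dir_key: Key,
    key: Key,
    lsn: Lsn,
) -> anyhow::Result<Option<Bytes>> {
    // The directory entry itself must always exist; it lists its children.
    let dir: HashSet<Key> = serde_json::from_slice(&tline.get(dir_key, lsn)?)?;
    Ok(if dir.contains(&key) {
        Some(tline.get(key, lsn)?) // safe: the metadata says this key exists
    } else {
        None // never ask the repository for a key that was not put
    })
}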
- pub fn will_init(&self) -> bool { - match self { - ZenithWalRecord::Postgres { will_init, rec: _ } => *will_init, - - // None of the special zenith record types currently initialize the page - _ => false, - } - } -} - #[cfg(test)] pub mod repo_harness { use bytes::BytesMut; + use lazy_static::lazy_static; + use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; use std::{fs, path::PathBuf}; + use crate::RepositoryImpl; use crate::{ config::PageServerConf, layered_repository::{LayeredRepository, TIMELINES_SEGMENT_NAME}, @@ -373,13 +387,34 @@ pub mod repo_harness { buf.freeze() } - pub struct RepoHarness { - pub conf: &'static PageServerConf, - pub tenant_id: ZTenantId, + lazy_static! { + static ref LOCK: RwLock<()> = RwLock::new(()); } - impl RepoHarness { + pub struct RepoHarness<'a> { + pub conf: &'static PageServerConf, + pub tenant_id: ZTenantId, + + pub lock_guard: ( + Option>, + Option>, + ), + } + + impl<'a> RepoHarness<'a> { pub fn create(test_name: &'static str) -> Result { + Self::create_internal(test_name, false) + } + pub fn create_exclusive(test_name: &'static str) -> Result { + Self::create_internal(test_name, true) + } + fn create_internal(test_name: &'static str, exclusive: bool) -> Result { + let lock_guard = if exclusive { + (None, Some(LOCK.write().unwrap())) + } else { + (Some(LOCK.read().unwrap()), None) + }; + let repo_dir = PageServerConf::test_repo_dir(test_name); let _ = fs::remove_dir_all(&repo_dir); fs::create_dir_all(&repo_dir)?; @@ -394,18 +429,17 @@ pub mod repo_harness { fs::create_dir_all(conf.tenant_path(&tenant_id))?; fs::create_dir_all(conf.branches_path(&tenant_id))?; - Ok(Self { conf, tenant_id }) + Ok(Self { + conf, + tenant_id, + lock_guard, + }) } - pub fn load(&self) -> LayeredRepository { + pub fn load(&self) -> RepositoryImpl { let walredo_mgr = Arc::new(TestRedoManager); - LayeredRepository::new( - self.conf, - walredo_mgr, - self.tenant_id, - false, - ) + LayeredRepository::new(self.conf, walredo_mgr, self.tenant_id, false) } pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf { @@ -414,21 +448,19 @@ pub mod repo_harness { } // Mock WAL redo manager that doesn't do much - struct TestRedoManager; + pub struct TestRedoManager; impl WalRedoManager for TestRedoManager { fn request_redo( &self, - rel: RelishTag, - blknum: BlockNumber, + key: Key, lsn: Lsn, base_img: Option, records: Vec<(Lsn, ZenithWalRecord)>, ) -> Result { let s = format!( - "redo for {} blk {} to get to {}, with {} and {} records", - rel, - blknum, + "redo for {} to get to {}, with {} and {} records", + key, lsn, if base_img.is_some() { "base image" @@ -438,6 +470,7 @@ pub mod repo_harness { records.len() ); println!("{}", s); + Ok(TEST_IMG(&s)) } } @@ -451,412 +484,40 @@ pub mod repo_harness { mod tests { use super::repo_harness::*; use super::*; - use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT}; - use std::fs; - - /// Arbitrary relation tag, for testing. 
- const TESTREL_A_REL_TAG: RelTag = RelTag { - spcnode: 0, - dbnode: 111, - relnode: 1000, - forknum: 0, - }; - const TESTREL_A: RelishTag = RelishTag::Relation(TESTREL_A_REL_TAG); - const TESTREL_B: RelishTag = RelishTag::Relation(RelTag { - spcnode: 0, - dbnode: 111, - relnode: 1001, - forknum: 0, - }); - - fn assert_current_logical_size(timeline: &Arc, lsn: Lsn) { - let incremental = timeline.get_current_logical_size(); - let non_incremental = timeline - .get_current_logical_size_non_incremental(lsn) - .unwrap(); - assert_eq!(incremental, non_incremental); - } - - static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); - static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); + //use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT}; + //use std::sync::Arc; + use bytes::BytesMut; #[test] - fn test_relsize() -> Result<()> { - let repo = RepoHarness::create("test_relsize")?.load(); - // get_timeline() with non-existent timeline id should fail - //repo.get_timeline("11223344556677881122334455667788"); - - // Create timeline to work on + fn test_basic() -> Result<()> { + let repo = RepoHarness::create("test_basic")?.load(); let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + + #[allow(non_snake_case)] + let TEST_KEY: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); + let writer = tline.writer(); + writer.put(TEST_KEY, Lsn(0x10), Value::Image(TEST_IMG("foo at 0x10")))?; + writer.advance_last_record_lsn(Lsn(0x10)); + drop(writer); - writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; - writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; - writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?; - writer.put_page_image(TESTREL_A, 1, Lsn(0x40), TEST_IMG("foo blk 1 at 4"))?; - writer.put_page_image(TESTREL_A, 2, Lsn(0x50), TEST_IMG("foo blk 2 at 5"))?; - - writer.advance_last_record_lsn(Lsn(0x50)); - - assert_current_logical_size(&tline, Lsn(0x50)); - - // The relation was created at LSN 2, not visible at LSN 1 yet. 
- assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); - assert!(tline.get_relish_size(TESTREL_A, Lsn(0x10))?.is_none()); - - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(), 1); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x50))?.unwrap(), 3); - - // Check page contents at each LSN - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x20))?, - TEST_IMG("foo blk 0 at 2") - ); - - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x30))?, - TEST_IMG("foo blk 0 at 3") - ); - - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x40))?, - TEST_IMG("foo blk 0 at 3") - ); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x40))?, - TEST_IMG("foo blk 1 at 4") - ); - - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x50))?, - TEST_IMG("foo blk 0 at 3") - ); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x50))?, - TEST_IMG("foo blk 1 at 4") - ); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 2, Lsn(0x50))?, - TEST_IMG("foo blk 2 at 5") - ); - - // Truncate last block - writer.put_truncation(TESTREL_A, Lsn(0x60), 2)?; - writer.advance_last_record_lsn(Lsn(0x60)); - assert_current_logical_size(&tline, Lsn(0x60)); - - // Check reported size and contents after truncation - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 2); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x60))?, - TEST_IMG("foo blk 0 at 3") - ); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x60))?, - TEST_IMG("foo blk 1 at 4") - ); - - // should still see the truncated block with older LSN - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x50))?.unwrap(), 3); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 2, Lsn(0x50))?, - TEST_IMG("foo blk 2 at 5") - ); - - // Truncate to zero length - writer.put_truncation(TESTREL_A, Lsn(0x68), 0)?; - writer.advance_last_record_lsn(Lsn(0x68)); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x68))?.unwrap(), 0); - - // Extend from 0 to 2 blocks, leaving a gap - writer.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?; - writer.advance_last_record_lsn(Lsn(0x70)); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x70))?.unwrap(), 2); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, ZERO_PAGE); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x70))?, - TEST_IMG("foo blk 1") - ); - - // Extend a lot more, leaving a big gap that spans across segments - // FIXME: This is currently broken, see https://github.com/zenithdb/zenith/issues/500 - /* - tline.put_page_image(TESTREL_A, 1500, Lsn(0x80), TEST_IMG("foo blk 1500"))?; - tline.advance_last_record_lsn(Lsn(0x80)); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x80))?.unwrap(), 1501); - for blk in 2..1500 { - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, blk, Lsn(0x80))?, - ZERO_PAGE); - } - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 1500, Lsn(0x80))?, - TEST_IMG("foo blk 1500")); - */ - - Ok(()) - } - - // Test what happens if we dropped a relation - // and then created it again within the same layer. 
- #[test] - fn test_drop_extend() -> Result<()> { - let repo = RepoHarness::create("test_drop_extend")?.load(); - - // Create timeline to work on - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; let writer = tline.writer(); - - writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; + writer.put(TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?; writer.advance_last_record_lsn(Lsn(0x20)); + drop(writer); - // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(), 1); - - // Drop relish - writer.drop_relish(TESTREL_A, Lsn(0x30))?; - writer.advance_last_record_lsn(Lsn(0x30)); - - // Check that rel is not visible anymore - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false); - assert!(tline.get_relish_size(TESTREL_A, Lsn(0x30))?.is_none()); - - // Extend it again - writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?; - writer.advance_last_record_lsn(Lsn(0x40)); - - // Check that rel exists and size is correct - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true); - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x40))?.unwrap(), 1); + assert_eq!(tline.get(TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); Ok(()) } - // Test what happens if we truncated a relation - // so that one of its segments was dropped - // and then extended it again within the same layer. - #[test] - fn test_truncate_extend() -> Result<()> { - let repo = RepoHarness::create("test_truncate_extend")?.load(); - - // Create timeline to work on - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - let writer = tline.writer(); - - //from storage_layer.rs - const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192; - let relsize = RELISH_SEG_SIZE * 2; - - // Create relation with relsize blocks - for blkno in 0..relsize { - let lsn = Lsn(0x20); - let data = format!("foo blk {} at {}", blkno, lsn); - writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?; - } - - writer.advance_last_record_lsn(Lsn(0x20)); - - // The relation was created at LSN 2, not visible at LSN 1 yet. 
- assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); - assert!(tline.get_relish_size(TESTREL_A, Lsn(0x10))?.is_none()); - - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(), - relsize - ); - - // Check relation content - for blkno in 0..relsize { - let lsn = Lsn(0x20); - let data = format!("foo blk {} at {}", blkno, lsn); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, blkno, lsn)?, - TEST_IMG(&data) - ); - } - - // Truncate relation so that second segment was dropped - // - only leave one page - writer.put_truncation(TESTREL_A, Lsn(0x60), 1)?; - writer.advance_last_record_lsn(Lsn(0x60)); - - // Check reported size and contents after truncation - assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 1); - - for blkno in 0..1 { - let lsn = Lsn(0x20); - let data = format!("foo blk {} at {}", blkno, lsn); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x60))?, - TEST_IMG(&data) - ); - } - - // should still see all blocks with older LSN - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(0x50))?.unwrap(), - relsize - ); - for blkno in 0..relsize { - let lsn = Lsn(0x20); - let data = format!("foo blk {} at {}", blkno, lsn); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x50))?, - TEST_IMG(&data) - ); - } - - // Extend relation again. - // Add enough blocks to create second segment - for blkno in 0..relsize { - let lsn = Lsn(0x80); - let data = format!("foo blk {} at {}", blkno, lsn); - writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?; - } - writer.advance_last_record_lsn(Lsn(0x80)); - - assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true); - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(0x80))?.unwrap(), - relsize - ); - // Check relation content - for blkno in 0..relsize { - let lsn = Lsn(0x80); - let data = format!("foo blk {} at {}", blkno, lsn); - assert_eq!( - tline.get_page_at_lsn(TESTREL_A, blkno, Lsn(0x80))?, - TEST_IMG(&data) - ); - } - - Ok(()) - } - - /// Test get_relsize() and truncation with a file larger than 1 GB, so that it's - /// split into multiple 1 GB segments in Postgres. 
- #[test] - fn test_large_rel() -> Result<()> { - let repo = RepoHarness::create("test_large_rel")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - let writer = tline.writer(); - - let mut lsn = 0x10; - for blknum in 0..pg_constants::RELSEG_SIZE + 1 { - lsn += 0x10; - let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); - writer.put_page_image(TESTREL_A, blknum as BlockNumber, Lsn(lsn), img)?; - } - writer.advance_last_record_lsn(Lsn(lsn)); - - assert_current_logical_size(&tline, Lsn(lsn)); - - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(), - pg_constants::RELSEG_SIZE + 1 - ); - - // Truncate one block - lsn += 0x10; - writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?; - writer.advance_last_record_lsn(Lsn(lsn)); - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(), - pg_constants::RELSEG_SIZE - ); - assert_current_logical_size(&tline, Lsn(lsn)); - - // Truncate another block - lsn += 0x10; - writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?; - writer.advance_last_record_lsn(Lsn(lsn)); - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(), - pg_constants::RELSEG_SIZE - 1 - ); - assert_current_logical_size(&tline, Lsn(lsn)); - - // Truncate to 1500, and then truncate all the way down to 0, one block at a time - // This tests the behavior at segment boundaries - let mut size: i32 = 3000; - while size >= 0 { - lsn += 0x10; - writer.put_truncation(TESTREL_A, Lsn(lsn), size as BlockNumber)?; - writer.advance_last_record_lsn(Lsn(lsn)); - assert_eq!( - tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(), - size as BlockNumber - ); - - size -= 1; - } - assert_current_logical_size(&tline, Lsn(lsn)); - - Ok(()) - } - - /// - /// Test list_rels() function, with branches and dropped relations - /// - #[test] - fn test_list_rels_drop() -> Result<()> { - let repo = RepoHarness::create("test_list_rels_drop")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - let writer = tline.writer(); - const TESTDB: u32 = 111; - - // Import initial dummy checkpoint record, otherwise the get_timeline() call - // after branching fails below - writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?; - - // Create a relation on the timeline - writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; - - writer.advance_last_record_lsn(Lsn(0x30)); - - // Check that list_rels() lists it after LSN 2, but no before it - assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&TESTREL_A)); - assert!(tline.list_rels(0, TESTDB, Lsn(0x20))?.contains(&TESTREL_A)); - assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A)); - - // Create a branch, check that the relation is visible there - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; - let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { - Some(timeline) => timeline, - None => panic!("Should have a local timeline"), - }; - let new_writer = newtline.writer(); - - assert!(newtline - .list_rels(0, TESTDB, Lsn(0x30))? - .contains(&TESTREL_A)); - - // Drop it on the branch - new_writer.drop_relish(TESTREL_A, Lsn(0x40))?; - new_writer.advance_last_record_lsn(Lsn(0x40)); - - drop(new_writer); - - // Check that it's no longer listed on the branch after the point where it was dropped - assert!(newtline - .list_rels(0, TESTDB, Lsn(0x30))? - .contains(&TESTREL_A)); - assert!(!newtline - .list_rels(0, TESTDB, Lsn(0x40))? 
- .contains(&TESTREL_A)); - - // Run checkpoint and garbage collection and check that it's still not visible - newtline.checkpoint(CheckpointConfig::Forced)?; - repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?; - - assert!(!newtline - .list_rels(0, TESTDB, Lsn(0x40))? - .contains(&TESTREL_A)); - - Ok(()) + /// Convenience function to create a page image with given string as the only content + pub fn test_value(s: &str) -> Value { + let mut buf = BytesMut::new(); + buf.extend_from_slice(s.as_bytes()); + Value::Image(buf.freeze()) } /// @@ -867,21 +528,24 @@ mod tests { let repo = RepoHarness::create("test_branch")?.load(); let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; let writer = tline.writer(); + use std::str::from_utf8; - // Import initial dummy checkpoint record, otherwise the get_timeline() call - // after branching fails below - writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?; + #[allow(non_snake_case)] + let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); + #[allow(non_snake_case)] + let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap(); - // Create a relation on the timeline - writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?; - writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?; - writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?; - - // Create another relation - writer.put_page_image(TESTREL_B, 0, Lsn(0x20), TEST_IMG("foobar blk 0 at 2"))?; + // Insert a value on the timeline + writer.put(TEST_KEY_A, Lsn(0x20), test_value("foo at 0x20"))?; + writer.put(TEST_KEY_B, Lsn(0x20), test_value("foobar at 0x20"))?; + writer.advance_last_record_lsn(Lsn(0x20)); + writer.put(TEST_KEY_A, Lsn(0x30), test_value("foo at 0x30"))?; + writer.advance_last_record_lsn(Lsn(0x30)); + writer.put(TEST_KEY_A, Lsn(0x40), test_value("foo at 0x40"))?; writer.advance_last_record_lsn(Lsn(0x40)); - assert_current_logical_size(&tline, Lsn(0x40)); + + //assert_current_logical_size(&tline, Lsn(0x40)); // Branch the history, modify relation differently on the new timeline repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; @@ -890,246 +554,145 @@ mod tests { None => panic!("Should have a local timeline"), }; let new_writer = newtline.writer(); - - new_writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?; + new_writer.put(TEST_KEY_A, Lsn(0x40), test_value("bar at 0x40"))?; new_writer.advance_last_record_lsn(Lsn(0x40)); // Check page contents on both branches assert_eq!( - tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x40))?, - TEST_IMG("foo blk 0 at 4") + from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40))?)?, + "foo at 0x40" ); - assert_eq!( - newtline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x40))?, - TEST_IMG("bar blk 0 at 4") + from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40))?)?, + "bar at 0x40" ); - assert_eq!( - newtline.get_page_at_lsn(TESTREL_B, 0, Lsn(0x40))?, - TEST_IMG("foobar blk 0 at 2") + from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40))?)?, + "foobar at 0x20" ); - assert_eq!(newtline.get_relish_size(TESTREL_B, Lsn(0x40))?.unwrap(), 1); - - assert_current_logical_size(&tline, Lsn(0x40)); + //assert_current_logical_size(&tline, Lsn(0x40)); Ok(()) } - fn make_some_layers(tline: &Arc, start_lsn: Lsn) -> Result<()> { - let mut lsn = start_lsn; - { - let writer = tline.writer(); - // Create a relation on the timeline - writer.put_page_image( - TESTREL_A, - 0, - lsn, - TEST_IMG(&format!("foo blk 0 at {}", 
lsn)), - )?; - lsn += 0x10; - writer.put_page_image( - TESTREL_A, - 0, - lsn, - TEST_IMG(&format!("foo blk 0 at {}", lsn)), - )?; - writer.advance_last_record_lsn(lsn); - } - tline.checkpoint(CheckpointConfig::Forced)?; - { - let writer = tline.writer(); - lsn += 0x10; - writer.put_page_image( - TESTREL_A, - 0, - lsn, - TEST_IMG(&format!("foo blk 0 at {}", lsn)), - )?; - lsn += 0x10; - writer.put_page_image( - TESTREL_A, - 0, - lsn, - TEST_IMG(&format!("foo blk 0 at {}", lsn)), - )?; - writer.advance_last_record_lsn(lsn); - } - tline.checkpoint(CheckpointConfig::Forced) - } + /* // FIXME: Garbage collection is broken + #[test] + fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> { + let repo = + RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load_page_repo(); - #[test] - fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> { - let repo = - RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + make_some_layers(&tline, Lsn(0x20))?; - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - make_some_layers(&tline, Lsn(0x20))?; + // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 + repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; - // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 - repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; - - // try to branch at lsn 25, should fail because we already garbage collected the data - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) { - Ok(_) => panic!("branching should have failed"), - Err(err) => { - assert!(err.to_string().contains("invalid branch start lsn")); - assert!(err - .source() - .unwrap() - .to_string() - .contains("we might've already garbage collected needed data")) + // try to branch at lsn 25, should fail because we already garbage collected the data + match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) { + Ok(_) => panic!("branching should have failed"), + Err(err) => { + assert!(err.to_string().contains("invalid branch start lsn")); + assert!(err + .source() + .unwrap() + .to_string() + .contains("we might've already garbage collected needed data")) + } } + + Ok(()) } - Ok(()) - } + #[test] + fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> { + let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load_page_repo(); - #[test] - fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> { - let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); - - repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; - // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) { - Ok(_) => panic!("branching should have failed"), - Err(err) => { - assert!(&err.to_string().contains("invalid branch start lsn")); - assert!(&err - .source() - .unwrap() - .to_string() - .contains("is earlier than latest GC horizon")); + repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; + // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 + match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) { + Ok(_) => panic!("branching should have failed"), + Err(err) => { + assert!(&err.to_string().contains("invalid branch start lsn")); + 
assert!(&err + .source() + .unwrap() + .to_string() + .contains("is earlier than latest GC horizon")); + } } + + Ok(()) } - Ok(()) - } + #[test] + fn test_prohibit_get_page_at_lsn_for_garbage_collected_pages() -> Result<()> { + let repo = + RepoHarness::create("test_prohibit_get_page_at_lsn_for_garbage_collected_pages")? + .load_page_repo(); - #[test] - fn test_prohibit_get_page_at_lsn_for_garbage_collected_pages() -> Result<()> { - let repo = - RepoHarness::create("test_prohibit_get_page_at_lsn_for_garbage_collected_pages")? - .load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + make_some_layers(&tline, Lsn(0x20))?; - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - make_some_layers(&tline, Lsn(0x20))?; - - repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; - let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); - assert!(*latest_gc_cutoff_lsn > Lsn(0x25)); - match tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)) { - Ok(_) => panic!("request for page should have failed"), - Err(err) => assert!(err.to_string().contains("not found at")), - } - Ok(()) - } - - #[test] - fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { - let repo = - RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - - make_some_layers(&tline, Lsn(0x20))?; - - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; - let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { - Some(timeline) => timeline, - None => panic!("Should have a local timeline"), - }; - - // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 - repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; - assert!(newtline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)).is_ok()); - - Ok(()) - } - - #[test] - fn test_parent_keeps_data_forever_after_branching() -> Result<()> { - let harness = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?; - let repo = harness.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - - make_some_layers(&tline, Lsn(0x20))?; - - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; - let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { - Some(timeline) => timeline, - None => panic!("Should have a local timeline"), - }; - - make_some_layers(&newtline, Lsn(0x60))?; - - // run gc on parent - repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; - - // check that the layer in parent before the branching point is still there - let tline_dir = harness.conf.timeline_path(&TIMELINE_ID, &harness.tenant_id); - - let expected_image_layer_path = tline_dir.join(format!( - "rel_{}_{}_{}_{}_{}_{:016X}_{:016X}", - TESTREL_A_REL_TAG.spcnode, - TESTREL_A_REL_TAG.dbnode, - TESTREL_A_REL_TAG.relnode, - TESTREL_A_REL_TAG.forknum, - 0, // seg is 0 - 0x20, - 0x30, - )); - assert!(fs::metadata(&expected_image_layer_path).is_ok()); - - Ok(()) - } - - #[test] - fn test_read_beyond_eof() -> Result<()> { - let harness = RepoHarness::create("test_read_beyond_eof")?; - let repo = harness.load(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - - make_some_layers(&tline, Lsn(0x20))?; - { - let writer = tline.writer(); - writer.put_page_image( - TESTREL_A, - 0, - Lsn(0x60), - TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x50))), - )?; - writer.advance_last_record_lsn(Lsn(0x60)); + repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; + let 
latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); + assert!(*latest_gc_cutoff_lsn > Lsn(0x25)); + // FIXME: GC is currently disabled, so this still works + /* + match tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)) { + Ok(_) => panic!("request for page should have failed"), + Err(err) => assert!(err.to_string().contains("not found at")), + } + */ + Ok(()) } - // Test read before rel creation. Should error out. - assert!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x10)).is_err()); + #[test] + fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { + let repo = + RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load_page_repo(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + make_some_layers(&tline, Lsn(0x20))?; - // Read block beyond end of relation at different points in time. - // These reads should fall into different delta, image, and in-memory layers. - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x20))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x25))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x30))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x35))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x40))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x45))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x50))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x55))?, ZERO_PAGE); - assert_eq!(tline.get_page_at_lsn(TESTREL_A, 1, Lsn(0x60))?, ZERO_PAGE); + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; + let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { + Some(timeline) => timeline, + None => panic!("Should have a local timeline"), + }; - // Test on an in-memory layer with no preceding layer - { - let writer = tline.writer(); - writer.put_page_image( - TESTREL_B, - 0, - Lsn(0x70), - TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x70))), - )?; - writer.advance_last_record_lsn(Lsn(0x70)); + // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 + repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; + assert!(newtline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)).is_ok()); + + Ok(()) } - assert_eq!(tline.get_page_at_lsn(TESTREL_B, 1, Lsn(0x70))?, ZERO_PAGE); - Ok(()) - } + #[test] + fn test_parent_keeps_data_forever_after_branching() -> Result<()> { + let harness = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?; + let repo = harness.load_page_repo(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + make_some_layers(&tline, Lsn(0x20))?; + + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; + let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { + Some(timeline) => timeline, + None => panic!("Should have a local timeline"), + }; + + make_some_layers(&newtline, Lsn(0x60))?; + + // run gc on parent + repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; + + // Check that the data is still accessible on the branch. + assert_eq!( + newtline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x50))?, + TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x40))) + ); + + Ok(()) + } + + */ } diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index c0b54278cd..7adea39b6a 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -2,14 +2,15 @@ //! page server. 
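[Illustrative aside, not part of the patch: the tenant_mgr changes below add a per-tenant `timelines` map, so repeated lookups of the same timeline return clones of one cached `Arc`. A sketch of the observable behaviour; the function name `get_local_timeline_for_tenant` is from the patch, the assertion is hypothetical test code.]

fn timeline_cache_sketch(
    tenantid: ZTenantId,
    timelineid: ZTimelineId,
) -> anyhow::Result<()> {
    let a = get_local_timeline_for_tenant(tenantid, timelineid)?; // builds and caches the wrapper
    let b = get_local_timeline_for_tenant(tenantid, timelineid)?; // served from `timelines`
    assert!(std::sync::Arc::ptr_eq(&a, &b)); // one shared DatadirTimelineImpl per timeline
    Ok(())
}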
 use crate::branches;
-use crate::{RepositoryImpl, TimelineImpl};
 use crate::config::PageServerConf;
 use crate::layered_repository::LayeredRepository;
-use crate::repository::{Repository, TimelineSyncState};
+use crate::repository::Repository;
+use crate::repository::TimelineSyncState;
 use crate::thread_mgr;
 use crate::thread_mgr::ThreadKind;
 use crate::walredo::PostgresRedoManager;
 use crate::CheckpointConfig;
+use crate::{DatadirTimelineImpl, RepositoryImpl};
 use anyhow::{bail, Context, Result};
 use lazy_static::lazy_static;
 use log::*;
@@ -26,6 +27,8 @@ lazy_static! {
 struct Tenant {
     state: TenantState,
     repo: Arc<RepositoryImpl>,
+
+    timelines: HashMap<ZTimelineId, Arc<DatadirTimelineImpl>>,
 }
 
 #[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
@@ -79,15 +82,17 @@ pub fn set_timeline_states(
         let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
 
         // Set up an object repository, for actual data storage.
-        let repo: Arc<RepositoryImpl> = Arc::new(LayeredRepository::new(
+        let repo = LayeredRepository::new(
             conf,
             Arc::new(walredo_mgr),
             tenant_id,
             conf.remote_storage_config.is_some(),
-        ));
+        );
+
         Tenant {
             state: TenantState::Idle,
-            repo,
+            repo: Arc::new(repo),
+            timelines: HashMap::new(),
         }
     });
     if let Err(e) = put_timelines_into_tenant(tenant, tenant_id, timeline_states) {
@@ -191,6 +196,7 @@ pub fn create_repository_for_tenant(
             v.insert(Tenant {
                 state: TenantState::Idle,
                 repo,
+                timelines: HashMap::new(),
             });
         }
     }
@@ -261,11 +267,25 @@ pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<RepositoryImpl>> {
 pub fn get_local_timeline_for_tenant(
     tenantid: ZTenantId,
     timelineid: ZTimelineId,
-) -> Result<Arc<TimelineImpl>> {
-    get_repository_for_tenant(tenantid)?
+) -> Result<Arc<DatadirTimelineImpl>> {
+    let mut m = access_tenants();
+    let tenant = m
+        .get_mut(&tenantid)
+        .with_context(|| format!("Tenant {} not found", tenantid))?;
+
+    if let Some(page_tline) = tenant.timelines.get(&timelineid) {
+        return Ok(Arc::clone(page_tline));
+    }
+    // First access to this timeline. Create a DatadirTimeline wrapper for it
+    let tline = tenant
+        .repo
         .get_timeline(timelineid)?
.local_timeline() - .with_context(|| format!("cannot fetch timeline {}", timelineid)) + .with_context(|| format!("cannot fetch timeline {}", timelineid))?; + + let page_tline = Arc::new(DatadirTimelineImpl::new(tline)); + tenant.timelines.insert(timelineid, Arc::clone(&page_tline)); + Ok(page_tline) } #[derive(Serialize, Deserialize, Clone)] diff --git a/pageserver/src/tenant_threads.rs b/pageserver/src/tenant_threads.rs index 673a92b80d..a6711f0542 100644 --- a/pageserver/src/tenant_threads.rs +++ b/pageserver/src/tenant_threads.rs @@ -14,6 +14,15 @@ use zenith_utils::zid::ZTenantId; /// Checkpointer thread's main loop /// pub fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> { + if let Err(err) = checkpoint_loop_ext(tenantid, conf) { + error!("checkpoint loop terminated with error: {:?}", err); + Err(err) + } else { + Ok(()) + } +} + +fn checkpoint_loop_ext(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> { loop { if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) { break; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 615a9960fe..92133a3fd7 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -23,14 +23,16 @@ use postgres_ffi::nonrelfile_utils::clogpage_precedes; use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment; -use std::cmp::min; use anyhow::Result; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; +use std::collections::HashMap; + +use crate::pgdatadir_mapping::*; use crate::relish::*; -use crate::repository::*; +use crate::repository::Repository; use crate::walrecord::*; use postgres_ffi::nonrelfile_utils::mx_offset_to_member_segment; use postgres_ffi::xlog_utils::*; @@ -40,22 +42,28 @@ use zenith_utils::lsn::Lsn; static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); -pub struct WalIngest { +pub struct WalIngest<'a, R: Repository> { + timeline: &'a DatadirTimeline, + checkpoint: CheckPoint, checkpoint_modified: bool, + + relsize_cache: HashMap, } -impl WalIngest { - pub fn new(timeline: &T, startpoint: Lsn) -> Result { +impl<'a, R: Repository> WalIngest<'a, R> { + pub fn new(timeline: &DatadirTimeline, startpoint: Lsn) -> Result> { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. - let checkpoint_bytes = timeline.get_page_at_lsn(RelishTag::Checkpoint, 0, startpoint)?; + let checkpoint_bytes = timeline.get_checkpoint(startpoint)?; let checkpoint = CheckPoint::decode(&checkpoint_bytes)?; trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); Ok(WalIngest { + timeline, checkpoint, checkpoint_modified: false, + relsize_cache: HashMap::new(), }) } @@ -66,13 +74,14 @@ impl WalIngest { /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the /// relations/pages that the record affects. 
/// - pub fn ingest_record( + pub fn ingest_record( &mut self, - timeline: &T, - writer: &dyn TimelineWriter, + timeline: &DatadirTimeline, recdata: Bytes, lsn: Lsn, ) -> Result<()> { + let mut writer = timeline.begin_record(lsn); + let mut decoded = decode_wal_record(recdata); let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -87,48 +96,34 @@ impl WalIngest { if decoded.xl_rmid == pg_constants::RM_HEAP_ID || decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - self.ingest_heapam_record(&mut buf, writer, lsn, &mut decoded)?; + self.ingest_heapam_record(&mut buf, &mut writer, &mut decoded)?; } // Handle other special record types if decoded.xl_rmid == pg_constants::RM_SMGR_ID + && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) + == pg_constants::XLOG_SMGR_CREATE + { + let create = XlSmgrCreate::decode(&mut buf); + self.ingest_xlog_smgr_create(&mut writer, &create)?; + } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE { let truncate = XlSmgrTruncate::decode(&mut buf); - self.ingest_xlog_smgr_truncate(writer, lsn, &truncate)?; + self.ingest_xlog_smgr_truncate(&mut writer, &truncate)?; } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_CREATE { let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(timeline, writer, lsn, &createdb)?; + self.ingest_xlog_dbase_create(&mut writer, &createdb)?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_DBASE_DROP { let dropdb = XlDropDatabase::decode(&mut buf); - - // To drop the database, we need to drop all the relations in it. Like in - // ingest_xlog_dbase_create(), use the previous record's LSN in the list_rels() call - let req_lsn = min(timeline.get_last_record_lsn(), lsn); - for tablespace_id in dropdb.tablespace_ids { - let rels = timeline.list_rels(tablespace_id, dropdb.db_id, req_lsn)?; - for rel in rels { - writer.drop_relish(rel, lsn)?; - } - trace!( - "Drop FileNodeMap {}, {} at lsn {}", - tablespace_id, - dropdb.db_id, - lsn - ); - writer.drop_relish( - RelishTag::FileNodeMap { - spcnode: tablespace_id, - dbnode: dropdb.db_id, - }, - lsn, - )?; + trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); + writer.drop_dbdir(tablespace_id, dropdb.db_id)?; } } } else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID { @@ -139,19 +134,17 @@ impl WalIngest { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - writer.put_page_image( - RelishTag::Slru { - slru: SlruKind::Clog, - segno, - }, + self.put_slru_page_image( + &mut writer, + SlruKind::Clog, + segno, rpageno, - lsn, ZERO_PAGE.clone(), )?; } else { assert!(info == pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(&mut buf); - self.ingest_clog_truncate_record(timeline, writer, lsn, &xlrec)?; + self.ingest_clog_truncate_record(&mut writer, &xlrec)?; } } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; @@ -159,8 +152,7 @@ impl WalIngest { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); self.ingest_xact_record( - writer, - lsn, + &mut writer, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, )?; @@ -170,8 +162,7 @@ impl WalIngest { let parsed_xact = XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, 
decoded.xl_info); self.ingest_xact_record( - writer, - lsn, + &mut writer, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, )?; @@ -180,23 +171,11 @@ impl WalIngest { "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}", decoded.xl_xid, parsed_xact.xid, - lsn + lsn, ); - writer.drop_relish( - RelishTag::TwoPhase { - xid: parsed_xact.xid, - }, - lsn, - )?; + writer.drop_twophase_file(parsed_xact.xid)?; } else if info == pg_constants::XLOG_XACT_PREPARE { - writer.put_page_image( - RelishTag::TwoPhase { - xid: decoded.xl_xid, - }, - 0, - lsn, - Bytes::copy_from_slice(&buf[..]), - )?; + writer.put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]))?; } } else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; @@ -205,38 +184,34 @@ impl WalIngest { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - writer.put_page_image( - RelishTag::Slru { - slru: SlruKind::MultiXactOffsets, - segno, - }, + self.put_slru_page_image( + &mut writer, + SlruKind::MultiXactOffsets, + segno, rpageno, - lsn, ZERO_PAGE.clone(), )?; } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { let pageno = buf.get_u32_le(); let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - writer.put_page_image( - RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno, - }, + self.put_slru_page_image( + &mut writer, + SlruKind::MultiXactMembers, + segno, rpageno, - lsn, ZERO_PAGE.clone(), )?; } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { let xlrec = XlMultiXactCreate::decode(&mut buf); - self.ingest_multixact_create_record(writer, lsn, &xlrec)?; + self.ingest_multixact_create_record(&mut writer, &xlrec)?; } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { let xlrec = XlMultiXactTruncate::decode(&mut buf); - self.ingest_multixact_truncate_record(writer, lsn, &xlrec)?; + self.ingest_multixact_truncate_record(&mut writer, &xlrec)?; } } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(writer, lsn, &xlrec, &decoded)?; + self.ingest_relmap_page(&mut writer, &xlrec, &decoded)?; } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_NEXTOID { @@ -271,37 +246,37 @@ impl WalIngest { // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. 
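[Illustrative aside, not part of the patch: `ingest_decoded_block()` below overwrites the first 8 bytes of a full-page image, PostgreSQL's pd_lsn, with the LSN of the record that carried the image. A standalone sketch of that hi/lo split, mirroring the `image[0..8]` writes in the hunk:]

fn stamp_pd_lsn(image: &mut [u8], lsn: u64) {
    // pd_lsn is stored as xlogid (high 32 bits) followed by xrecoff (low 32 bits).
    image[0..4].copy_from_slice(&((lsn >> 32) as u32).to_le_bytes());
    image[4..8].copy_from_slice(&(lsn as u32).to_le_bytes());
}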
for blk in decoded.blocks.iter() { - self.ingest_decoded_block(writer, lsn, &decoded, blk)?; + self.ingest_decoded_block(&mut writer, lsn, &decoded, blk)?; } // If checkpoint data was updated, store the new version in the repository if self.checkpoint_modified { let new_checkpoint_bytes = self.checkpoint.encode(); - writer.put_page_image(RelishTag::Checkpoint, 0, lsn, new_checkpoint_bytes)?; + writer.put_checkpoint(new_checkpoint_bytes)?; self.checkpoint_modified = false; } // Now that this record has been fully handled, including updating the // checkpoint data, let the repository know that it is up-to-date to this LSN - writer.advance_last_record_lsn(lsn); + writer.finish()?; Ok(()) } fn ingest_decoded_block( &mut self, - timeline: &dyn TimelineWriter, + timeline: &mut DatadirTimelineWriter, lsn: Lsn, decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, ) -> Result<()> { - let tag = RelishTag::Relation(RelTag { + let rel = RelTag { spcnode: blk.rnode_spcnode, dbnode: blk.rnode_dbnode, relnode: blk.rnode_relnode, forknum: blk.forknum as u8, - }); + }; // // Instead of storing full-page-image WAL record, @@ -331,13 +306,13 @@ impl WalIngest { image[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes()); image[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes()); assert_eq!(image.len(), pg_constants::BLCKSZ as usize); - timeline.put_page_image(tag, blk.blkno, lsn, image.freeze())?; + self.put_rel_page_image(timeline, rel, blk.blkno, image.freeze())?; } else { let rec = ZenithWalRecord::Postgres { will_init: blk.will_init || blk.apply_image, rec: decoded.record.clone(), }; - timeline.put_wal_record(lsn, tag, blk.blkno, rec)?; + self.put_rel_wal_record(timeline, rel, blk.blkno, rec)?; } Ok(()) } @@ -345,8 +320,7 @@ impl WalIngest { fn ingest_heapam_record( &mut self, buf: &mut Bytes, - timeline: &dyn TimelineWriter, - lsn: Lsn, + timeline: &mut DatadirTimelineWriter, decoded: &mut DecodedWALRecord, ) -> Result<()> { // Handle VM bit updates that are implicitly part of heap records. @@ -410,21 +384,21 @@ impl WalIngest { // Clear the VM bits if required. if new_heap_blkno.is_some() || old_heap_blkno.is_some() { - let vm_relish = RelishTag::Relation(RelTag { + let vm_rel = RelTag { forknum: pg_constants::VISIBILITYMAP_FORKNUM, spcnode: decoded.blocks[0].rnode_spcnode, dbnode: decoded.blocks[0].rnode_dbnode, relnode: decoded.blocks[0].rnode_relnode, - }); + }; let new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); let old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); if new_vm_blk == old_vm_blk { // An UPDATE record that needs to clear the bits for both old and the // new page, both of which reside on the same VM page. - timeline.put_wal_record( - lsn, - vm_relish, + self.put_rel_wal_record( + timeline, + vm_rel, new_vm_blk.unwrap(), ZenithWalRecord::ClearVisibilityMapFlags { new_heap_blkno, @@ -436,9 +410,9 @@ impl WalIngest { // Clear VM bits for one heap page, or for two pages that reside on // different VM pages. 
if let Some(new_vm_blk) = new_vm_blk { - timeline.put_wal_record( - lsn, - vm_relish, + self.put_rel_wal_record( + timeline, + vm_rel, new_vm_blk, ZenithWalRecord::ClearVisibilityMapFlags { new_heap_blkno, @@ -448,9 +422,9 @@ impl WalIngest { )?; } if let Some(old_vm_blk) = old_vm_blk { - timeline.put_wal_record( - lsn, - vm_relish, + self.put_rel_wal_record( + timeline, + vm_rel, old_vm_blk, ZenithWalRecord::ClearVisibilityMapFlags { new_heap_blkno: None, @@ -466,11 +440,9 @@ impl WalIngest { } /// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record. - fn ingest_xlog_dbase_create( + fn ingest_xlog_dbase_create( &mut self, - timeline: &T, - writer: &dyn TimelineWriter, - lsn: Lsn, + timeline: &mut DatadirTimelineWriter, rec: &XlCreateDatabase, ) -> Result<()> { let db_id = rec.db_id; @@ -483,76 +455,75 @@ impl WalIngest { // cannot pass 'lsn' to the Timeline.get_* functions, or they will block waiting for // the last valid LSN to advance up to it. So we use the previous record's LSN in the // get calls instead. - let req_lsn = min(timeline.get_last_record_lsn(), lsn); + let req_lsn = timeline.get_last_record_lsn(); let rels = timeline.list_rels(src_tablespace_id, src_db_id, req_lsn)?; - trace!("ingest_xlog_dbase_create: {} rels", rels.len()); + debug!("ingest_xlog_dbase_create: {} rels", rels.len()); + + timeline.put_dbdir_creation(tablespace_id, db_id)?; let mut num_rels_copied = 0; let mut num_blocks_copied = 0; - for rel in rels { - if let RelishTag::Relation(src_rel) = rel { - assert_eq!(src_rel.spcnode, src_tablespace_id); - assert_eq!(src_rel.dbnode, src_db_id); + for src_rel in rels { + assert_eq!(src_rel.spcnode, src_tablespace_id); + assert_eq!(src_rel.dbnode, src_db_id); - let nblocks = timeline.get_relish_size(rel, req_lsn)?.unwrap_or(0); - let dst_rel = RelTag { - spcnode: tablespace_id, - dbnode: db_id, - relnode: src_rel.relnode, - forknum: src_rel.forknum, - }; + let nblocks = timeline.get_rel_size(src_rel, req_lsn)?; + let dst_rel = RelTag { + spcnode: tablespace_id, + dbnode: db_id, + relnode: src_rel.relnode, + forknum: src_rel.forknum, + }; - // Copy content - for blknum in 0..nblocks { - let content = timeline.get_page_at_lsn(rel, blknum, req_lsn)?; + timeline.put_rel_creation(dst_rel, nblocks)?; - debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel); + // Copy content + debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks); + for blknum in 0..nblocks { + debug!("copying block {} from {} to {}", blknum, src_rel, dst_rel); - writer.put_page_image(RelishTag::Relation(dst_rel), blknum, lsn, content)?; - num_blocks_copied += 1; - } - - if nblocks == 0 { - // make sure we have some trace of the relation, even if it's empty - writer.put_truncation(RelishTag::Relation(dst_rel), lsn, 0)?; - } - - num_rels_copied += 1; + let content = timeline.get_rel_page_at_lsn(src_rel, blknum, req_lsn)?; + timeline.put_rel_page_image(dst_rel, blknum, content)?; + num_blocks_copied += 1; } + + num_rels_copied += 1; } // Copy relfilemap - // TODO This implementation is very inefficient - - // it scans all non-rels only to find FileNodeMaps - for tag in timeline.list_nonrels(req_lsn)? 
{ - if let RelishTag::FileNodeMap { spcnode, dbnode } = tag { - if spcnode == src_tablespace_id && dbnode == src_db_id { - let img = timeline.get_page_at_lsn(tag, 0, req_lsn)?; - let new_tag = RelishTag::FileNodeMap { - spcnode: tablespace_id, - dbnode: db_id, - }; - writer.put_page_image(new_tag, 0, lsn, img)?; - break; - } - } - } + let filemap = timeline.get_relmap_file(src_tablespace_id, src_db_id, req_lsn)?; + timeline.put_relmap_file(tablespace_id, db_id, filemap)?; + info!( - "Created database {}/{}, copied {} blocks in {} rels at {}", - tablespace_id, db_id, num_blocks_copied, num_rels_copied, lsn + "Created database {}/{}, copied {} blocks in {} rels", + tablespace_id, db_id, num_blocks_copied, num_rels_copied ); Ok(()) } + fn ingest_xlog_smgr_create( + &mut self, + writer: &mut DatadirTimelineWriter, + rec: &XlSmgrCreate, + ) -> Result<()> { + let rel = RelTag { + spcnode: rec.rnode.spcnode, + dbnode: rec.rnode.dbnode, + relnode: rec.rnode.relnode, + forknum: rec.forknum, + }; + writer.put_rel_creation(rel, 0)?; + Ok(()) + } + /// Subroutine of ingest_record(), to handle an XLOG_SMGR_TRUNCATE record. /// /// This is the same logic as in PostgreSQL's smgr_redo() function. fn ingest_xlog_smgr_truncate( &mut self, - timeline: &dyn TimelineWriter, - lsn: Lsn, + writer: &mut DatadirTimelineWriter, rec: &XlSmgrTruncate, ) -> Result<()> { let spcnode = rec.rnode.spcnode; @@ -566,7 +537,7 @@ impl WalIngest { relnode, forknum: pg_constants::MAIN_FORKNUM, }; - timeline.put_truncation(RelishTag::Relation(rel), lsn, rec.blkno)?; + self.put_rel_truncation(writer, rel, rec.blkno)?; } if (rec.flags & pg_constants::SMGR_TRUNCATE_FSM) != 0 { let rel = RelTag { @@ -589,7 +560,7 @@ impl WalIngest { info!("Partial truncation of FSM is not supported"); } let num_fsm_blocks = 0; - timeline.put_truncation(RelishTag::Relation(rel), lsn, num_fsm_blocks)?; + self.put_rel_truncation(writer, rel, num_fsm_blocks)?; } if (rec.flags & pg_constants::SMGR_TRUNCATE_VM) != 0 { let rel = RelTag { @@ -608,7 +579,7 @@ impl WalIngest { info!("Partial truncation of VM is not supported"); } let num_vm_blocks = 0; - timeline.put_truncation(RelishTag::Relation(rel), lsn, num_vm_blocks)?; + self.put_rel_truncation(writer, rel, num_vm_blocks)?; } Ok(()) } @@ -617,8 +588,7 @@ impl WalIngest { /// fn ingest_xact_record( &mut self, - timeline: &dyn TimelineWriter, - lsn: Lsn, + writer: &mut DatadirTimelineWriter, parsed: &XlXactParsedRecord, is_commit: bool, ) -> Result<()> { @@ -634,12 +604,9 @@ impl WalIngest { // This subxact goes to different page. Write the record // for all the XIDs on the previous page, and continue // accumulating XIDs on this new page. - timeline.put_wal_record( - lsn, - RelishTag::Slru { - slru: SlruKind::Clog, - segno, - }, + writer.put_slru_wal_record( + SlruKind::Clog, + segno, rpageno, if is_commit { ZenithWalRecord::ClogSetCommitted { xids: page_xids } @@ -654,12 +621,9 @@ impl WalIngest { rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; page_xids.push(*subxact); } - timeline.put_wal_record( - lsn, - RelishTag::Slru { - slru: SlruKind::Clog, - segno, - }, + writer.put_slru_wal_record( + SlruKind::Clog, + segno, rpageno, if is_commit { ZenithWalRecord::ClogSetCommitted { xids: page_xids } @@ -676,22 +640,23 @@ impl WalIngest { dbnode: xnode.dbnode, relnode: xnode.relnode, }; - timeline.drop_relish(RelishTag::Relation(rel), lsn)?; + let last_lsn = self.timeline.get_last_record_lsn(); + if writer.get_rel_exists(rel, last_lsn)? 
{ + self.put_rel_drop(writer, rel)?; + } } } Ok(()) } - fn ingest_clog_truncate_record( + fn ingest_clog_truncate_record( &mut self, - timeline: &T, - writer: &dyn TimelineWriter, - lsn: Lsn, + timeline: &mut DatadirTimelineWriter, xlrec: &XlClogTruncate, ) -> Result<()> { info!( - "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {} lsn {}", - xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db, lsn + "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}", + xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db ); // Here we treat oldestXid and oldestXidDB @@ -722,23 +687,17 @@ impl WalIngest { } // Iterate via SLRU CLOG segments and drop segments that we're ready to truncate - // TODO This implementation is very inefficient - - // it scans all non-rels only to find Clog // // We cannot pass 'lsn' to the Timeline.list_nonrels(), or it // will block waiting for the last valid LSN to advance up to // it. So we use the previous record's LSN in the get calls // instead. - let req_lsn = min(timeline.get_last_record_lsn(), lsn); - for obj in timeline.list_nonrels(req_lsn)? { - if let RelishTag::Slru { slru, segno } = obj { - if slru == SlruKind::Clog { - let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; - if slru_may_delete_clogsegment(segpage, xlrec.pageno) { - writer.drop_relish(RelishTag::Slru { slru, segno }, lsn)?; - trace!("Drop CLOG segment {:>04X} at lsn {}", segno, lsn); - } - } + let req_lsn = timeline.get_last_record_lsn(); + for segno in timeline.list_slru_segments(SlruKind::Clog, req_lsn)? { + let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; + if slru_may_delete_clogsegment(segpage, xlrec.pageno) { + timeline.drop_slru_segment(SlruKind::Clog, segno)?; + trace!("Drop CLOG segment {:>04X}", segno); } } @@ -747,8 +706,7 @@ impl WalIngest { fn ingest_multixact_create_record( &mut self, - timeline: &dyn TimelineWriter, - lsn: Lsn, + timeline: &mut DatadirTimelineWriter, xlrec: &XlMultiXactCreate, ) -> Result<()> { // Create WAL record for updating the multixact-offsets page @@ -756,12 +714,9 @@ impl WalIngest { let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - timeline.put_wal_record( - lsn, - RelishTag::Slru { - slru: SlruKind::MultiXactOffsets, - segno, - }, + timeline.put_slru_wal_record( + SlruKind::MultiXactOffsets, + segno, rpageno, ZenithWalRecord::MultixactOffsetCreate { mid: xlrec.mid, @@ -793,12 +748,9 @@ impl WalIngest { } let n_this_page = this_page_members.len(); - timeline.put_wal_record( - lsn, - RelishTag::Slru { - slru: SlruKind::MultiXactMembers, - segno: pageno / pg_constants::SLRU_PAGES_PER_SEGMENT, - }, + timeline.put_slru_wal_record( + SlruKind::MultiXactMembers, + pageno / pg_constants::SLRU_PAGES_PER_SEGMENT, pageno % pg_constants::SLRU_PAGES_PER_SEGMENT, ZenithWalRecord::MultixactMembersCreate { moff: offset, @@ -833,8 +785,7 @@ impl WalIngest { fn ingest_multixact_truncate_record( &mut self, - timeline: &dyn TimelineWriter, - lsn: Lsn, + timeline: &mut DatadirTimelineWriter, xlrec: &XlMultiXactTruncate, ) -> Result<()> { self.checkpoint.oldestMulti = xlrec.end_trunc_off; @@ -850,13 +801,7 @@ impl WalIngest { // Delete all the segments except the last one. The last segment can still // contain, possibly partially, valid data. 
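// Aside (illustrative sketch, not part of this patch): a standalone version of
// the wraparound-aware traversal performed by the loop below. Segment numbers
// live in a circular space, so walking from the truncation start to the end
// segment must wrap at `maxsegment` instead of overflowing. The function name
// and bounds here are hypothetical; only the traversal pattern matches the
// code that follows.

fn segments_to_delete(mut segment: u32, endsegment: u32, maxsegment: u32) -> Vec<u32> {
    let mut victims = Vec::new();
    while segment != endsegment {
        victims.push(segment);
        // move to next segment, handling wraparound correctly
        if segment == maxsegment {
            segment = 0;
        } else {
            segment += 1;
        }
    }
    victims
}

fn main() {
    // With maxsegment == 5, walking from 4 to 2 wraps through 5 and 0.
    assert_eq!(segments_to_delete(4, 2, 5), vec![4, 5, 0, 1]);
}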
        while segment != endsegment {
-            timeline.drop_relish(
-                RelishTag::Slru {
-                    slru: SlruKind::MultiXactMembers,
-                    segno: segment as u32,
-                },
-                lsn,
-            )?;
+            timeline.drop_slru_segment(SlruKind::MultiXactMembers, segment as u32)?;

             /* move to next segment, handling wraparound correctly */
             if segment == maxsegment {
@@ -874,22 +819,516 @@ impl WalIngest {

     fn ingest_relmap_page(
         &mut self,
-        timeline: &dyn TimelineWriter,
-        lsn: Lsn,
+        timeline: &mut DatadirTimelineWriter,
         xlrec: &XlRelmapUpdate,
         decoded: &DecodedWALRecord,
     ) -> Result<()> {
-        let tag = RelishTag::FileNodeMap {
-            spcnode: xlrec.tsid,
-            dbnode: xlrec.dbid,
-        };
-
         let mut buf = decoded.record.clone();
         buf.advance(decoded.main_data_offset);
         // skip xl_relmap_update
         buf.advance(12);

-        timeline.put_page_image(tag, 0, lsn, Bytes::copy_from_slice(&buf[..]))?;
+        timeline.put_relmap_file(xlrec.tsid, xlrec.dbid, Bytes::copy_from_slice(&buf[..]))?;
+
+        Ok(())
+    }
+
+    fn put_rel_page_image(
+        &mut self,
+        writer: &mut DatadirTimelineWriter,
+        rel: RelTag,
+        blknum: BlockNumber,
+        img: Bytes,
+    ) -> Result<()> {
+        self.handle_rel_extend(writer, rel, blknum)?;
+        writer.put_rel_page_image(rel, blknum, img)?;
+        Ok(())
+    }
+
+    fn put_rel_wal_record(
+        &mut self,
+        writer: &mut DatadirTimelineWriter,
+        rel: RelTag,
+        blknum: BlockNumber,
+        rec: ZenithWalRecord,
+    ) -> Result<()> {
+        self.handle_rel_extend(writer, rel, blknum)?;
+        writer.put_rel_wal_record(rel, blknum, rec)?;
+        Ok(())
+    }
+
+    fn put_rel_truncation(
+        &mut self,
+        writer: &mut DatadirTimelineWriter,
+        rel: RelTag,
+        nblocks: BlockNumber,
+    ) -> Result<()> {
+        writer.put_rel_truncation(rel, nblocks)?;
+        self.relsize_cache.insert(rel, nblocks);
+        Ok(())
+    }
+
+    fn put_rel_drop(&mut self, writer: &mut DatadirTimelineWriter, rel: RelTag) -> Result<()> {
+        writer.put_rel_drop(rel)?;
+        self.relsize_cache.remove(&rel);
+        Ok(())
+    }
+
+    fn handle_rel_extend(
+        &mut self,
+        writer: &mut DatadirTimelineWriter,
+        rel: RelTag,
+        blknum: BlockNumber,
+    ) -> Result<()> {
+        let new_nblocks = blknum + 1;
+        let old_nblocks = if let Some(nblocks) = self.relsize_cache.get(&rel) {
+            *nblocks
+        } else {
+            // Check if the relation exists. We implicitly create relations on first
+            // record.
+            // TODO: would be nice to be more explicit about it
+            let last_lsn = self.timeline.get_last_record_lsn();
+            let nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? {
+                // create it with 0 size initially, the logic below will extend it
+                writer.put_rel_creation(rel, 0)?;
+                0
+            } else {
+                self.timeline.get_rel_size(rel, last_lsn)?
+            };
+            self.relsize_cache.insert(rel, nblocks);
+            nblocks
+        };
+
+        if new_nblocks > old_nblocks {
+            //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks);
+            writer.put_rel_extend(rel, new_nblocks)?;
+
+            // fill the gap with zeros
+            for gap_blknum in old_nblocks..blknum {
+                writer.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
+            }
+            self.relsize_cache.insert(rel, new_nblocks);
+        }
+        Ok(())
+    }
+
+    fn put_slru_page_image(
+        &mut self,
+        writer: &mut DatadirTimelineWriter,
+        kind: SlruKind,
+        segno: u32,
+        blknum: BlockNumber,
+        img: Bytes,
+    ) -> Result<()> {
+        self.handle_slru_extend(writer, kind, segno, blknum)?;
+        writer.put_slru_page_image(kind, segno, blknum, img)?;
+        Ok(())
+    }
+
+    fn handle_slru_extend(
+        &mut self,
+        writer: &mut DatadirTimelineWriter,
+        kind: SlruKind,
+        segno: u32,
+        blknum: BlockNumber,
+    ) -> Result<()> {
+        // we don't use a cache for this like we do for relations.
+        // SLRUs are explicitly
+        // extended with ZEROPAGE records, not with commit records, so it happens
+        // a lot less frequently.
+
+        let new_nblocks = blknum + 1;
+        // Check if the segment exists. We implicitly create SLRU segments on first
+        // record.
+        // TODO: would be nice to be more explicit about it
+        let last_lsn = self.timeline.get_last_record_lsn();
+        let old_nblocks = if !self
+            .timeline
+            .get_slru_segment_exists(kind, segno, last_lsn)?
+        {
+            // create it with 0 size initially, the logic below will extend it
+            writer.put_slru_segment_creation(kind, segno, 0)?;
+            0
+        } else {
+            self.timeline.get_slru_segment_size(kind, segno, last_lsn)?
+        };
+
+        if new_nblocks > old_nblocks {
+            trace!(
+                "extending SLRU {:?} seg {} from {} to {} blocks",
+                kind,
+                segno,
+                old_nblocks,
+                new_nblocks
+            );
+            writer.put_slru_extend(kind, segno, new_nblocks)?;
+
+            // fill the gap with zeros
+            for gap_blknum in old_nblocks..blknum {
+                writer.put_slru_page_image(kind, segno, gap_blknum, ZERO_PAGE.clone())?;
+            }
+        }
+        Ok(())
+    }
+}
+
+///
+/// Tests that should work the same with any Repository/Timeline implementation.
+///
+#[allow(clippy::bool_assert_comparison)]
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::pgdatadir_mapping::create_test_timeline;
+    use crate::repository::repo_harness::*;
+    use postgres_ffi::pg_constants;
+
+    /// Arbitrary relation tag, for testing.
+    const TESTREL_A: RelTag = RelTag {
+        spcnode: 0,
+        dbnode: 111,
+        relnode: 1000,
+        forknum: 0,
+    };
+
+    fn assert_current_logical_size(_timeline: &DatadirTimeline, _lsn: Lsn) {
+        // TODO
+    }
+
+    static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
+
+    fn init_walingest_test<'a, R: Repository>(
+        tline: &'a DatadirTimeline,
+    ) -> Result> {
+        let mut writer = tline.begin_record(Lsn(0x10));
+        writer.put_checkpoint(ZERO_CHECKPOINT.clone())?;
+        writer.finish()?;
+        let walingest = WalIngest::new(tline, Lsn(0x10))?;
+
+        Ok(walingest)
+    }
+
+    #[test]
+    fn test_relsize() -> Result<()> {
+        let repo = RepoHarness::create("test_relsize")?.load();
+        let tline = create_test_timeline(repo, TIMELINE_ID)?;
+        let mut walingest = init_walingest_test(&tline)?;
+
+        let mut writer = tline.begin_record(Lsn(0x20));
+        walingest.put_rel_page_image(&mut writer, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?;
+        writer.finish()?;
+        let mut writer = tline.begin_record(Lsn(0x30));
+        walingest.put_rel_page_image(&mut writer, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"))?;
+        writer.finish()?;
+        let mut writer = tline.begin_record(Lsn(0x40));
+        walingest.put_rel_page_image(&mut writer, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"))?;
+        writer.finish()?;
+        let mut writer = tline.begin_record(Lsn(0x50));
+        walingest.put_rel_page_image(&mut writer, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"))?;
+        writer.finish()?;
+
+        assert_current_logical_size(&tline, Lsn(0x50));
+
+        // The relation was created at LSN 2, not visible at LSN 1 yet.
+        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false);
+
+        // FIXME: should error out?
+ //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10))?.is_none()); + + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, 1); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50))?, 3); + + // Check page contents at each LSN + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20))?, + TEST_IMG("foo blk 0 at 2") + ); + + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30))?, + TEST_IMG("foo blk 0 at 3") + ); + + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40))?, + TEST_IMG("foo blk 0 at 3") + ); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40))?, + TEST_IMG("foo blk 1 at 4") + ); + + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50))?, + TEST_IMG("foo blk 0 at 3") + ); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50))?, + TEST_IMG("foo blk 1 at 4") + ); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50))?, + TEST_IMG("foo blk 2 at 5") + ); + + // Truncate last block + let mut writer = tline.begin_record(Lsn(0x60)); + walingest.put_rel_truncation(&mut writer, TESTREL_A, 2)?; + writer.finish()?; + assert_current_logical_size(&tline, Lsn(0x60)); + + // Check reported size and contents after truncation + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60))?, 2); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60))?, + TEST_IMG("foo blk 0 at 3") + ); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60))?, + TEST_IMG("foo blk 1 at 4") + ); + + // should still see the truncated block with older LSN + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50))?, 3); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50))?, + TEST_IMG("foo blk 2 at 5") + ); + + // Truncate to zero length + let mut writer = tline.begin_record(Lsn(0x68)); + walingest.put_rel_truncation(&mut writer, TESTREL_A, 0)?; + writer.finish()?; + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68))?, 0); + + // Extend from 0 to 2 blocks, leaving a gap + let mut writer = tline.begin_record(Lsn(0x70)); + walingest.put_rel_page_image(&mut writer, TESTREL_A, 1, TEST_IMG("foo blk 1"))?; + writer.finish()?; + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70))?, 2); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, + ZERO_PAGE + ); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70))?, + TEST_IMG("foo blk 1") + ); + + // Extend a lot more, leaving a big gap that spans across segments + let mut writer = tline.begin_record(Lsn(0x80)); + walingest.put_rel_page_image(&mut writer, TESTREL_A, 1500, TEST_IMG("foo blk 1500"))?; + writer.finish()?; + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, 1501); + for blk in 2..1500 { + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80))?, + ZERO_PAGE + ); + } + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80))?, + TEST_IMG("foo blk 1500") + ); + + Ok(()) + } + + // Test what happens if we dropped a relation + // and then created it again within the same layer. 
+ #[test] + fn test_drop_extend() -> Result<()> { + let repo = RepoHarness::create("test_drop_extend")?.load(); + let tline = create_test_timeline(repo, TIMELINE_ID)?; + let mut walingest = init_walingest_test(&tline)?; + + let mut writer = tline.begin_record(Lsn(0x20)); + walingest.put_rel_page_image(&mut writer, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; + writer.finish()?; + + // Check that rel exists and size is correct + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, 1); + + // Drop relish + let mut writer = tline.begin_record(Lsn(0x30)); + walingest.put_rel_drop(&mut writer, TESTREL_A)?; + writer.finish()?; + + // Check that rel is not visible anymore + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false); + + // FIXME: should fail + //assert!(tline.get_rel_size(TESTREL_A, Lsn(0x30))?.is_none()); + + // Re-create it + let mut writer = tline.begin_record(Lsn(0x40)); + walingest.put_rel_page_image(&mut writer, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"))?; + writer.finish()?; + + // Check that rel exists and size is correct + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40))?, 1); + + Ok(()) + } + + // Test what happens if we truncated a relation + // so that one of its segments was dropped + // and then extended it again within the same layer. + #[test] + fn test_truncate_extend() -> Result<()> { + let repo = RepoHarness::create("test_truncate_extend")?.load(); + let tline = create_test_timeline(repo, TIMELINE_ID)?; + let mut walingest = init_walingest_test(&tline)?; + + //from storage_layer.rs + const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192; + let relsize = RELISH_SEG_SIZE * 2; + + // Create relation with relsize blocks + let mut writer = tline.begin_record(Lsn(0x20)); + for blkno in 0..relsize { + let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); + walingest.put_rel_page_image(&mut writer, TESTREL_A, blkno, TEST_IMG(&data))?; + } + writer.finish()?; + + // The relation was created at LSN 20, not visible at LSN 1 yet. + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false); + + // FIXME: should fail + // assert!(tline.get_rel_size(TESTREL_A, Lsn(0x10))?.is_none()); + + assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, relsize); + + // Check relation content + for blkno in 0..relsize { + let lsn = Lsn(0x20); + let data = format!("foo blk {} at {}", blkno, lsn); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, blkno, lsn)?, + TEST_IMG(&data) + ); + } + + // Truncate relation so that second segment was dropped + // - only leave one page + let mut writer = tline.begin_record(Lsn(0x60)); + walingest.put_rel_truncation(&mut writer, TESTREL_A, 1)?; + writer.finish()?; + + // Check reported size and contents after truncation + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60))?, 1); + + for blkno in 0..1 { + let lsn = Lsn(0x20); + let data = format!("foo blk {} at {}", blkno, lsn); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60))?, + TEST_IMG(&data) + ); + } + + // should still see all blocks with older LSN + assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50))?, relsize); + for blkno in 0..relsize { + let lsn = Lsn(0x20); + let data = format!("foo blk {} at {}", blkno, lsn); + assert_eq!( + tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50))?, + TEST_IMG(&data) + ); + } + + // Extend relation again. 
+        // Add enough blocks to create second segment
+        let lsn = Lsn(0x80);
+        let mut writer = tline.begin_record(lsn);
+        for blkno in 0..relsize {
+            let data = format!("foo blk {} at {}", blkno, lsn);
+            walingest.put_rel_page_image(&mut writer, TESTREL_A, blkno, TEST_IMG(&data))?;
+        }
+        writer.finish()?;
+
+        assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true);
+        assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80))?, relsize);
+        // Check relation content
+        for blkno in 0..relsize {
+            let lsn = Lsn(0x80);
+            let data = format!("foo blk {} at {}", blkno, lsn);
+            assert_eq!(
+                tline.get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80))?,
+                TEST_IMG(&data)
+            );
+        }
+
+        Ok(())
+    }
+
+    /// Test get_rel_size() and truncation with a file larger than 1 GB, so that it's
+    /// split into multiple 1 GB segments in Postgres.
+    #[test]
+    fn test_large_rel() -> Result<()> {
+        let repo = RepoHarness::create("test_large_rel")?.load();
+        let tline = create_test_timeline(repo, TIMELINE_ID)?;
+        let mut walingest = init_walingest_test(&tline)?;
+
+        let mut lsn = 0x10;
+        for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
+            lsn += 0x10;
+            let mut writer = tline.begin_record(Lsn(lsn));
+            let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
+            walingest.put_rel_page_image(&mut writer, TESTREL_A, blknum as BlockNumber, img)?;
+            writer.finish()?;
+        }
+
+        assert_current_logical_size(&tline, Lsn(lsn));
+
+        assert_eq!(
+            tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
+            pg_constants::RELSEG_SIZE + 1
+        );
+
+        // Truncate one block
+        lsn += 0x10;
+        let mut writer = tline.begin_record(Lsn(lsn));
+        walingest.put_rel_truncation(&mut writer, TESTREL_A, pg_constants::RELSEG_SIZE)?;
+        writer.finish()?;
+        assert_eq!(
+            tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
+            pg_constants::RELSEG_SIZE
+        );
+        assert_current_logical_size(&tline, Lsn(lsn));
+
+        // Truncate another block
+        lsn += 0x10;
+        let mut writer = tline.begin_record(Lsn(lsn));
+        walingest.put_rel_truncation(&mut writer, TESTREL_A, pg_constants::RELSEG_SIZE - 1)?;
+        writer.finish()?;
+        assert_eq!(
+            tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
+            pg_constants::RELSEG_SIZE - 1
+        );
+        assert_current_logical_size(&tline, Lsn(lsn));
+
+        // Truncate to 3000, and then truncate all the way down to 0, one block at a time
+        // This tests the behavior at segment boundaries
+        let mut size: i32 = 3000;
+        while size >= 0 {
+            lsn += 0x10;
+            let mut writer = tline.begin_record(Lsn(lsn));
+            walingest.put_rel_truncation(&mut writer, TESTREL_A, size as BlockNumber)?;
+            writer.finish()?;
+            assert_eq!(
+                tline.get_rel_size(TESTREL_A, Lsn(lsn))?,
+                size as BlockNumber
+            );
+
+            size -= 1;
+        }
+        assert_current_logical_size(&tline, Lsn(lsn));

         Ok(())
     }
diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs
index abf6bace22..fd318b9cb7 100644
--- a/pageserver/src/walreceiver.rs
+++ b/pageserver/src/walreceiver.rs
@@ -7,7 +7,6 @@
 use crate::config::PageServerConf;
 use crate::repository::Repository;
-use crate::repository::Timeline;
 use crate::tenant_mgr;
 use crate::thread_mgr;
 use crate::thread_mgr::ThreadKind;
@@ -255,8 +254,7 @@ fn walreceiver_main(
     // at risk of hitting a deadlock.
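// Aside (illustrative sketch, not part of this patch): why ingestion reads at
// the *previous* record's LSN. Timeline reads wait until last_record_lsn has
// advanced to the requested LSN; the record currently being ingested has not
// advanced it yet, so reading at the new record's own LSN from inside the
// ingest path could wait forever. A minimal single-threaded sketch with
// stand-in types (MockTimeline and its methods are hypothetical):

#[derive(Clone, Copy, PartialEq, PartialOrd)]
struct Lsn(u64);

struct MockTimeline {
    last_record_lsn: Lsn,
}

impl MockTimeline {
    // Reads block until `lsn` is reached; in a single thread that means any
    // request above last_record_lsn can never be satisfied.
    fn get_page(&self, lsn: Lsn) -> Result<&'static str, &'static str> {
        if lsn > self.last_record_lsn {
            return Err("would block waiting for last_record_lsn to advance");
        }
        Ok("page image")
    }
}

fn main() {
    let tline = MockTimeline { last_record_lsn: Lsn(0x10) };
    let incoming_record_lsn = Lsn(0x20);

    // Reading at the incoming record's LSN would deadlock:
    assert!(tline.get_page(incoming_record_lsn).is_err());
    // Reading at the previous record's LSN is always safe:
    assert!(tline.get_page(tline.last_record_lsn).is_ok());
}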
            assert!(lsn.is_aligned());

-            let writer = timeline.writer();
-            walingest.ingest_record(&*timeline, writer.as_ref(), recdata, lsn)?;
+            walingest.ingest_record(&timeline, recdata, lsn)?;

             fail_point!("walreceiver-after-ingest");
diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs
index ca9107cdbf..5947a0c147 100644
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -10,7 +10,47 @@ use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, Transacti
 use serde::{Deserialize, Serialize};
 use tracing::*;

-use crate::repository::ZenithWalRecord;
+/// Each update to a page is represented by a ZenithWalRecord. It can be a wrapper
+/// around a PostgreSQL WAL record, or a custom zenith-specific "record".
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub enum ZenithWalRecord {
+    /// Native PostgreSQL WAL record
+    Postgres { will_init: bool, rec: Bytes },
+
+    /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
+    ClearVisibilityMapFlags {
+        new_heap_blkno: Option,
+        old_heap_blkno: Option,
+        flags: u8,
+    },
+    /// Mark transaction IDs as committed on a CLOG page
+    ClogSetCommitted { xids: Vec },
+    /// Mark transaction IDs as aborted on a CLOG page
+    ClogSetAborted { xids: Vec },
+    /// Extend multixact offsets SLRU
+    MultixactOffsetCreate {
+        mid: MultiXactId,
+        moff: MultiXactOffset,
+    },
+    /// Extend multixact members SLRU.
+    MultixactMembersCreate {
+        moff: MultiXactOffset,
+        members: Vec,
+    },
+}
+
+impl ZenithWalRecord {
+    /// Does replaying this WAL record initialize the page from scratch, or does
+    /// it need to be applied over the previous image of the page?
+    pub fn will_init(&self) -> bool {
+        match self {
+            ZenithWalRecord::Postgres { will_init, rec: _ } => *will_init,
+
+            // None of the special zenith record types currently initialize the page
+            _ => false,
+        }
+    }
+}

 /// DecodedBkpBlock represents per-page data contained in a WAL record.
 #[derive(Default)]
@@ -87,6 +127,28 @@ impl XlRelmapUpdate {
     }
 }

+#[repr(C)]
+#[derive(Debug)]
+pub struct XlSmgrCreate {
+    pub rnode: RelFileNode,
+    // FIXME: This is ForkNumber in storage_xlog.h. That's an enum. Does it have
+    // well-defined size?
+    pub forknum: u8,
+}
+
+impl XlSmgrCreate {
+    pub fn decode(buf: &mut Bytes) -> XlSmgrCreate {
+        XlSmgrCreate {
+            rnode: RelFileNode {
+                spcnode: buf.get_u32_le(), /* tablespace */
+                dbnode: buf.get_u32_le(), /* database */
+                relnode: buf.get_u32_le(), /* relation */
+            },
+            forknum: buf.get_u32_le() as u8,
+        }
+    }
+}
+
 #[repr(C)]
 #[derive(Debug)]
 pub struct XlSmgrTruncate {
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 877b81b8d5..893efc5fba 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -42,8 +42,10 @@ use zenith_utils::nonblock::set_nonblock;
 use zenith_utils::zid::ZTenantId;

 use crate::config::PageServerConf;
+use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
 use crate::relish::*;
-use crate::repository::ZenithWalRecord;
+use crate::repository::Key;
+use crate::walrecord::ZenithWalRecord;
 use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_bitshift;
 use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_offset;
 use postgres_ffi::nonrelfile_utils::mx_offset_to_member_offset;
@@ -75,8 +77,7 @@ pub trait WalRedoManager: Send + Sync {
     /// the records.
fn request_redo( &self, - rel: RelishTag, - blknum: u32, + key: Key, lsn: Lsn, base_img: Option, records: Vec<(Lsn, ZenithWalRecord)>, @@ -92,8 +93,7 @@ pub struct DummyRedoManager {} impl crate::walredo::WalRedoManager for DummyRedoManager { fn request_redo( &self, - _rel: RelishTag, - _blknum: u32, + _key: Key, _lsn: Lsn, _base_img: Option, _records: Vec<(Lsn, ZenithWalRecord)>, @@ -152,28 +152,6 @@ fn can_apply_in_zenith(rec: &ZenithWalRecord) -> bool { } } -fn check_forknum(rel: &RelishTag, expected_forknum: u8) -> bool { - if let RelishTag::Relation(RelTag { - forknum, - spcnode: _, - dbnode: _, - relnode: _, - }) = rel - { - *forknum == expected_forknum - } else { - false - } -} - -fn check_slru_segno(rel: &RelishTag, expected_slru: SlruKind, expected_segno: u32) -> bool { - if let RelishTag::Slru { slru, segno } = rel { - *slru == expected_slru && *segno == expected_segno - } else { - false - } -} - /// An error happened in WAL redo #[derive(Debug, thiserror::Error)] pub enum WalRedoError { @@ -184,6 +162,8 @@ pub enum WalRedoError { InvalidState, #[error("cannot perform WAL redo for this request")] InvalidRequest, + #[error("cannot perform WAL redo for this record")] + InvalidRecord, } /// @@ -198,8 +178,7 @@ impl WalRedoManager for PostgresRedoManager { /// fn request_redo( &self, - rel: RelishTag, - blknum: u32, + key: Key, lsn: Lsn, base_img: Option, records: Vec<(Lsn, ZenithWalRecord)>, @@ -217,11 +196,10 @@ impl WalRedoManager for PostgresRedoManager { if rec_zenith != batch_zenith { let result = if batch_zenith { - self.apply_batch_zenith(rel, blknum, lsn, img, &records[batch_start..i]) + self.apply_batch_zenith(key, lsn, img, &records[batch_start..i]) } else { self.apply_batch_postgres( - rel, - blknum, + key, lsn, img, &records[batch_start..i], @@ -236,11 +214,10 @@ impl WalRedoManager for PostgresRedoManager { } // last batch if batch_zenith { - self.apply_batch_zenith(rel, blknum, lsn, img, &records[batch_start..]) + self.apply_batch_zenith(key, lsn, img, &records[batch_start..]) } else { self.apply_batch_postgres( - rel, - blknum, + key, lsn, img, &records[batch_start..], @@ -268,16 +245,15 @@ impl PostgresRedoManager { /// fn apply_batch_postgres( &self, - rel: RelishTag, - blknum: u32, + key: Key, lsn: Lsn, base_img: Option, records: &[(Lsn, ZenithWalRecord)], wal_redo_timeout: Duration, ) -> Result { - let start_time = Instant::now(); + let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; - let apply_result: Result; + let start_time = Instant::now(); let mut process_guard = self.process.lock().unwrap(); let lock_time = Instant::now(); @@ -291,16 +267,11 @@ impl PostgresRedoManager { WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64()); - let result = if let RelishTag::Relation(rel) = rel { - // Relational WAL records are applied using wal-redo-postgres - let buf_tag = BufferTag { rel, blknum }; - apply_result = process.apply_wal_records(buf_tag, base_img, records, wal_redo_timeout); - - apply_result.map_err(WalRedoError::IoError) - } else { - error!("unexpected non-relation relish: {:?}", rel); - Err(WalRedoError::InvalidRequest) - }; + // Relational WAL records are applied using wal-redo-postgres + let buf_tag = BufferTag { rel, blknum }; + let result = process + .apply_wal_records(buf_tag, base_img, records, wal_redo_timeout) + .map_err(WalRedoError::IoError); let end_time = Instant::now(); let duration = end_time.duration_since(lock_time); @@ -326,8 +297,7 @@ impl PostgresRedoManager { /// fn apply_batch_zenith( 
&self, - rel: RelishTag, - blknum: u32, + key: Key, lsn: Lsn, base_img: Option, records: &[(Lsn, ZenithWalRecord)], @@ -346,7 +316,7 @@ impl PostgresRedoManager { // Apply all the WAL records in the batch for (record_lsn, record) in records.iter() { - self.apply_record_zenith(rel, blknum, &mut page, *record_lsn, record)?; + self.apply_record_zenith(key, &mut page, *record_lsn, record)?; } // Success! let end_time = Instant::now(); @@ -365,8 +335,7 @@ impl PostgresRedoManager { fn apply_record_zenith( &self, - rel: RelishTag, - blknum: u32, + key: Key, page: &mut BytesMut, _record_lsn: Lsn, record: &ZenithWalRecord, @@ -382,9 +351,10 @@ impl PostgresRedoManager { flags, } => { // sanity check that this is modifying the correct relish + let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?; assert!( - check_forknum(&rel, pg_constants::VISIBILITYMAP_FORKNUM), - "ClearVisibilityMapFlags record on unexpected rel {:?}", + rel.forknum == pg_constants::VISIBILITYMAP_FORKNUM, + "ClearVisibilityMapFlags record on unexpected rel {}", rel ); if let Some(heap_blkno) = *new_heap_blkno { @@ -418,6 +388,14 @@ impl PostgresRedoManager { // Non-relational WAL records are handled here, with custom code that has the // same effects as the corresponding Postgres WAL redo function. ZenithWalRecord::ClogSetCommitted { xids } => { + let (slru_kind, segno, blknum) = + key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; + assert_eq!( + slru_kind, + SlruKind::Clog, + "ClogSetCommitted record with unexpected key {}", + key + ); for &xid in xids { let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE; let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -425,12 +403,17 @@ impl PostgresRedoManager { // Check that we're modifying the correct CLOG block. assert!( - check_slru_segno(&rel, SlruKind::Clog, expected_segno), - "ClogSetCommitted record for XID {} with unexpected rel {:?}", + segno == expected_segno, + "ClogSetCommitted record for XID {} with unexpected key {}", xid, - rel + key + ); + assert!( + blknum == expected_blknum, + "ClogSetCommitted record for XID {} with unexpected key {}", + xid, + key ); - assert!(blknum == expected_blknum); transaction_id_set_status( xid, @@ -440,6 +423,14 @@ impl PostgresRedoManager { } } ZenithWalRecord::ClogSetAborted { xids } => { + let (slru_kind, segno, blknum) = + key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; + assert_eq!( + slru_kind, + SlruKind::Clog, + "ClogSetAborted record with unexpected key {}", + key + ); for &xid in xids { let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE; let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -447,17 +438,30 @@ impl PostgresRedoManager { // Check that we're modifying the correct CLOG block. 
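// Aside (illustrative sketch, not part of this patch): the xid -> CLOG
// coordinates math that the asserts below verify, as a standalone function.
// Constants follow PostgreSQL's clog.c (2 status bits per xact, 8 KB pages,
// 32 pages per SLRU segment); in the real code they come from
// postgres_ffi::pg_constants.

const BLCKSZ: u32 = 8192;
const CLOG_BITS_PER_XACT: u32 = 2;
const CLOG_XACTS_PER_BYTE: u32 = 4;
const CLOG_XACTS_PER_PAGE: u32 = BLCKSZ * CLOG_XACTS_PER_BYTE;
const SLRU_PAGES_PER_SEGMENT: u32 = 32;

/// Returns (segno, blknum within segment, byte within page, bit shift).
fn clog_coords(xid: u32) -> (u32, u32, u32, u32) {
    let pageno = xid / CLOG_XACTS_PER_PAGE;
    let segno = pageno / SLRU_PAGES_PER_SEGMENT;
    let blknum = pageno % SLRU_PAGES_PER_SEGMENT;
    let byteno = (xid % CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_BYTE;
    let bshift = (xid % CLOG_XACTS_PER_BYTE) * CLOG_BITS_PER_XACT;
    (segno, blknum, byteno, bshift)
}

fn main() {
    // XID 100_000 lands on CLOG page 3 (100_000 / 32_768), i.e. segment 0,
    // block 3, byte 424, bit shift 0.
    assert_eq!(clog_coords(100_000), (0, 3, 424, 0));
}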
assert!( - check_slru_segno(&rel, SlruKind::Clog, expected_segno), - "ClogSetCommitted record for XID {} with unexpected rel {:?}", + segno == expected_segno, + "ClogSetAborted record for XID {} with unexpected key {}", xid, - rel + key + ); + assert!( + blknum == expected_blknum, + "ClogSetAborted record for XID {} with unexpected key {}", + xid, + key ); - assert!(blknum == expected_blknum); transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page); } } ZenithWalRecord::MultixactOffsetCreate { mid, moff } => { + let (slru_kind, segno, blknum) = + key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; + assert_eq!( + slru_kind, + SlruKind::MultiXactOffsets, + "MultixactOffsetCreate record with unexpected key {}", + key + ); // Compute the block and offset to modify. // See RecordNewMultiXact in PostgreSQL sources. let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; @@ -468,16 +472,29 @@ impl PostgresRedoManager { let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; assert!( - check_slru_segno(&rel, SlruKind::MultiXactOffsets, expected_segno), - "MultiXactOffsetsCreate record for multi-xid {} with unexpected rel {:?}", + segno == expected_segno, + "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", mid, - rel + key + ); + assert!( + blknum == expected_blknum, + "MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}", + mid, + key ); - assert!(blknum == expected_blknum); LittleEndian::write_u32(&mut page[offset..offset + 4], *moff); } ZenithWalRecord::MultixactMembersCreate { moff, members } => { + let (slru_kind, segno, blknum) = + key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?; + assert_eq!( + slru_kind, + SlruKind::MultiXactMembers, + "MultixactMembersCreate record with unexpected key {}", + key + ); for (i, member) in members.iter().enumerate() { let offset = moff + i as u32; @@ -492,12 +509,17 @@ impl PostgresRedoManager { let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; assert!( - check_slru_segno(&rel, SlruKind::MultiXactMembers, expected_segno), - "MultiXactMembersCreate record at offset {} with unexpected rel {:?}", + segno == expected_segno, + "MultiXactMembersCreate record for offset {} with unexpected key {}", moff, - rel + key + ); + assert!( + blknum == expected_blknum, + "MultiXactMembersCreate record for offset {} with unexpected key {}", + moff, + key ); - assert!(blknum == expected_blknum); let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]); flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift); diff --git a/postgres_ffi/src/pg_constants.rs b/postgres_ffi/src/pg_constants.rs index 76f837cefc..7230b841f5 100644 --- a/postgres_ffi/src/pg_constants.rs +++ b/postgres_ffi/src/pg_constants.rs @@ -24,6 +24,9 @@ pub const VISIBILITYMAP_FORKNUM: u8 = 2; pub const INIT_FORKNUM: u8 = 3; // From storage_xlog.h +pub const XLOG_SMGR_CREATE: u8 = 0x10; +pub const XLOG_SMGR_TRUNCATE: u8 = 0x20; + pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001; pub const SMGR_TRUNCATE_VM: u32 = 0x0002; pub const SMGR_TRUNCATE_FSM: u32 = 0x0004; @@ -113,7 +116,6 @@ pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4; // From pg_control.h and rmgrlist.h pub const XLOG_NEXTOID: u8 = 0x30; pub const XLOG_SWITCH: u8 = 0x40; -pub const XLOG_SMGR_TRUNCATE: u8 = 0x20; pub const XLOG_FPI_FOR_HINT: u8 = 0xA0; pub 
const XLOG_FPI: u8 = 0xB0; pub const DB_SHUTDOWNED: u32 = 1; diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 236c225bfb..58f7294eb5 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -74,8 +74,5 @@ def lsn_from_hex(lsn_hex: str) -> int: def print_gc_result(row): log.info("GC duration {elapsed} ms".format_map(row)) log.info( - " REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}" - .format_map(row)) - log.info( - " NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}" + " total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}" .format_map(row)) From 28045890eb3876fd8f70d771ee8c1d4040b3f843 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 8 Mar 2022 11:06:44 +0200 Subject: [PATCH 04/55] Work on compaction. --- pageserver/src/keyspace.rs | 87 ++++ pageserver/src/layered_repository.rs | 377 +++++++----------- .../src/layered_repository/delta_layer.rs | 15 +- .../src/layered_repository/image_layer.rs | 14 +- .../src/layered_repository/inmemory_layer.rs | 12 +- .../src/layered_repository/layer_map.rs | 43 +- .../src/layered_repository/storage_layer.rs | 12 +- pageserver/src/lib.rs | 1 + pageserver/src/pgdatadir_mapping.rs | 112 ++++-- pageserver/src/relish.rs | 132 +----- pageserver/src/repository.rs | 72 +++- 11 files changed, 398 insertions(+), 479 deletions(-) create mode 100644 pageserver/src/keyspace.rs diff --git a/pageserver/src/keyspace.rs b/pageserver/src/keyspace.rs new file mode 100644 index 0000000000..2b490d6ffe --- /dev/null +++ b/pageserver/src/keyspace.rs @@ -0,0 +1,87 @@ +use std::ops::Range; + +use crate::repository::{Key, key_range_size, singleton_range}; + +use postgres_ffi::pg_constants; + +// in # of key-value pairs +// FIXME Size of one segment in pages (128 MB) +pub const TARGET_FILE_SIZE_BYTES: u64 = 128 * 1024 * 1024; +pub const TARGET_FILE_SIZE: usize = (TARGET_FILE_SIZE_BYTES / 8192) as usize; + +/// +/// Represents a set of Keys, in a compact form. 
+/// +pub struct KeyPartitioning { + accum: Option>, + + ranges: Vec>, + + pub partitions: Vec>>, +} + +impl KeyPartitioning { + + pub fn new() -> Self { + KeyPartitioning { + accum: None, + ranges: Vec::new(), + partitions: Vec::new(), + } + } + + pub fn add_key(&mut self, key: Key) { + self.add_range(singleton_range(key)) + } + + pub fn add_range(&mut self, range: Range) { + match self.accum.as_mut() { + Some(accum) => { + if range.start == accum.end { + accum.end = range.end; + } else { + self.ranges.push(accum.clone()); + *accum = range; + } + }, + None => self.accum = Some(range), + } + } + + pub fn repartition(&mut self, target_size: u64) { + let target_nblocks = (target_size / pg_constants::BLCKSZ as u64) as usize; + if let Some(accum) = self.accum.take() { + self.ranges.push(accum); + } + + self.partitions = Vec::new(); + + let mut current_part = Vec::new(); + let mut current_part_size: usize = 0; + for range in &self.ranges { + let this_size = key_range_size(&range) as usize; + + if current_part_size + this_size > target_nblocks && + !current_part.is_empty() + { + self.partitions.push(current_part); + current_part = Vec::new(); + current_part_size = 0; + } + + let mut remain_size = this_size; + let mut start = range.start; + while remain_size > target_nblocks { + let next = start.add(target_nblocks as u32); + self.partitions.push(vec![start..next]); + start = next; + remain_size -= target_nblocks + } + current_part.push(start..range.end); + current_part_size += remain_size; + } + if !current_part.is_empty() { + self.partitions.push(current_part); + } + } +} diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index e344a79373..5169717818 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -22,7 +22,7 @@ use tracing::*; use std::cmp::{min, max, Ordering}; use std::collections::hash_map::Entry; use std::collections::BTreeSet; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::fs; use std::fs::{File, OpenOptions}; use std::io::Write; @@ -34,6 +34,7 @@ use std::time::Instant; use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}; use crate::config::PageServerConf; +use crate::keyspace::KeyPartitioning; use crate::remote_storage::{schedule_timeline_checkpoint_upload, schedule_timeline_download}; use crate::repository::{ GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncState, TimelineWriter, @@ -68,7 +69,10 @@ use filename::{DeltaFileName, ImageFileName}; use image_layer::{ImageLayer, ImageLayerWriter}; use inmemory_layer::InMemoryLayer; use layer_map::LayerMap; -use storage_layer::{Layer, ValueReconstructResult, ValueReconstructState, TARGET_FILE_SIZE,TARGET_FILE_SIZE_BYTES}; +use layer_map::SearchResult; +use storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; + +use crate::keyspace::TARGET_FILE_SIZE_BYTES; // re-export this function so that page_cache.rs can use it. pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file; @@ -738,6 +742,8 @@ pub struct LayeredTimeline { // garbage collecting data that is still needed by the child timelines. gc_info: RwLock, + partitioning: RwLock>, + // It may change across major versions so for simplicity // keep it after running initdb for a timeline. 
// It is needed in checks when we want to error on some operations @@ -794,14 +800,11 @@ impl Timeline for LayeredTimeline { debug_assert!(lsn <= self.get_last_record_lsn()); let mut reconstruct_state = ValueReconstructState { - key, - lsn, records: Vec::new(), img: None, // FIXME: check page cache and put the img here - request_lsn: lsn, }; - self.get_reconstruct_data(&mut reconstruct_state)?; + self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?; self.reconstruct_value(key, lsn, reconstruct_state) } @@ -823,11 +826,6 @@ impl Timeline for LayeredTimeline { } } - // Entry point for forced image creation. Only used by tests at the moment. - fn create_images(&self, threshold: usize) -> Result<()> { - self.create_image_layers(threshold) - } - /// /// Validate lsn against initdb_lsn and latest_gc_cutoff_lsn. /// @@ -861,6 +859,11 @@ impl Timeline for LayeredTimeline { self.disk_consistent_lsn.load() } + fn hint_partitioning(&self, partitioning: KeyPartitioning) -> Result<()> { + self.partitioning.write().unwrap().replace(partitioning); + Ok(()) + } + fn writer<'a>(&'a self) -> Box { Box::new(LayeredTimelineWriter { tl: self, @@ -909,6 +912,7 @@ impl LayeredTimeline { retain_lsns: Vec::new(), cutoff: Lsn(0), }), + partitioning: RwLock::new(None), latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()), initdb_lsn: metadata.initdb_lsn(), @@ -1002,7 +1006,7 @@ impl LayeredTimeline { /// /// This function takes the current timeline's locked LayerMap as an argument, /// so callers can avoid potential race conditions. - fn get_reconstruct_data(&self, reconstruct_state: &mut ValueReconstructState) -> Result<()> { + fn get_reconstruct_data(&self, key: Key, request_lsn: Lsn, reconstruct_state: &mut ValueReconstructState) -> Result<()> { // Start from the current timeline. let mut timeline_owned; let mut timeline = self; @@ -1013,6 +1017,7 @@ impl LayeredTimeline { let mut prev_lsn = Lsn(u64::MAX); let mut result = ValueReconstructResult::Continue; + let mut cont_lsn = Lsn(request_lsn.0 + 1); loop { // The function should have updated 'state' @@ -1020,28 +1025,28 @@ impl LayeredTimeline { match result { ValueReconstructResult::Complete => return Ok(()), ValueReconstructResult::Continue => { - if prev_lsn <= reconstruct_state.lsn { + if prev_lsn <= cont_lsn { // Didn't make any progress in last iteration. Error out to avoid // getting stuck in the loop. 
bail!("could not find layer with more data for key {} at LSN {}, request LSN {}", - reconstruct_state.key, - reconstruct_state.lsn, - reconstruct_state.request_lsn) + key, + Lsn(cont_lsn.0 - 1), + request_lsn) } - prev_lsn = reconstruct_state.lsn; + prev_lsn = cont_lsn; } ValueReconstructResult::Missing => { bail!( "could not find data for key {} at LSN {}, for request at LSN {}", - reconstruct_state.key, - reconstruct_state.lsn, - reconstruct_state.request_lsn + key, + cont_lsn, + request_lsn ) } } // Recurse into ancestor if needed - if reconstruct_state.lsn <= timeline.ancestor_lsn { + if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { //info!("going into ancestor {}", timeline.ancestor_lsn); let ancestor = timeline.get_ancestor_timeline()?; timeline_owned = ancestor; @@ -1055,33 +1060,34 @@ impl LayeredTimeline { // Check the open and frozen in-memory layers first if let Some(open_layer) = &layers.open_layer { let start_lsn = open_layer.get_lsn_range().start; - if reconstruct_state.lsn >= start_lsn { + if cont_lsn >= start_lsn { //info!("CHECKING for {} at {} on open layer {}", reconstruct_state.key, reconstruct_state.lsn, open_layer.filename().display()); - result = open_layer.get_value_reconstruct_data(open_layer.get_lsn_range().start, reconstruct_state)?; + result = open_layer.get_value_reconstruct_data(key, open_layer.get_lsn_range().start..cont_lsn, reconstruct_state)?; + cont_lsn = open_layer.get_lsn_range().start; continue; } } if let Some(frozen_layer) = &layers.frozen_layer { let start_lsn = frozen_layer.get_lsn_range().start; - if reconstruct_state.lsn >= start_lsn { + if cont_lsn >= start_lsn { //info!("CHECKING for {} at {} on frozen layer {}", reconstruct_state.key, reconstruct_state.lsn, frozen_layer.filename().display()); - result = frozen_layer.get_value_reconstruct_data(frozen_layer.get_lsn_range().start, reconstruct_state)?; + result = frozen_layer.get_value_reconstruct_data(key, frozen_layer.get_lsn_range().start..cont_lsn, reconstruct_state)?; + cont_lsn = frozen_layer.get_lsn_range().start; continue; } } - if let Some(search_result) = layers - .search(reconstruct_state.key, reconstruct_state.lsn)? + if let Some(SearchResult { lsn_floor, layer }) = layers + .search(key, cont_lsn)? { //info!("CHECKING for {} at {} on historic layer {}", reconstruct_state.key, reconstruct_state.lsn, layer.filename().display()); - result = search_result - .layer - .get_value_reconstruct_data(search_result.lsn_floor, reconstruct_state)?; + result = layer.get_value_reconstruct_data(key, lsn_floor..cont_lsn, reconstruct_state)?; + cont_lsn = lsn_floor; } else if self.ancestor_timeline.is_some() { // Nothing on this timeline. Traverse to parent result = ValueReconstructResult::Continue; - reconstruct_state.lsn = self.ancestor_lsn; + cont_lsn = Lsn(self.ancestor_lsn.0 + 1); } else { // Nothing found result = ValueReconstructResult::Missing; @@ -1241,9 +1247,9 @@ impl LayeredTimeline { // currently hard-coded at 3. It means, write out a new image layer, // if there are at least three delta layers on top of it. if false { - self.create_image_layers(3)?; + self.compact(TARGET_FILE_SIZE_BYTES as usize)?; } - self.compact_level0()?; + //self.compact_level0()?; } // TODO: We should also compact existing delta layers here. 
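// Aside (illustrative sketch, not part of this patch): how the KeyPartitioning
// from keyspace.rs is meant to be fed and consumed, with a simplified u32
// stand-in for the real Key (Key and key_range_size() live in repository.rs).
// Contiguous keys coalesce into one range; repartition() then packs the ranges
// into partitions of roughly the target size, which compact() below uses as
// image-layer boundaries. This sketch omits the real repartition()'s splitting
// of oversized single ranges.

use std::ops::Range;

struct KeyPartitioning {
    ranges: Vec<Range<u32>>,
    partitions: Vec<Vec<Range<u32>>>,
}

impl KeyPartitioning {
    fn new() -> Self {
        KeyPartitioning { ranges: Vec::new(), partitions: Vec::new() }
    }

    // Adjacent keys extend the last range instead of opening a new one.
    fn add_key(&mut self, key: u32) {
        match self.ranges.last_mut() {
            Some(last) if last.end == key => last.end = key + 1,
            _ => self.ranges.push(key..key + 1),
        }
    }

    // Greedily pack ranges into partitions of at most `target` keys.
    fn repartition(&mut self, target: u32) {
        self.partitions.clear();
        let mut part: Vec<Range<u32>> = Vec::new();
        let mut part_size = 0;
        for range in &self.ranges {
            let size = range.end - range.start;
            if part_size + size > target && !part.is_empty() {
                self.partitions.push(std::mem::take(&mut part));
                part_size = 0;
            }
            part.push(range.clone());
            part_size += size;
        }
        if !part.is_empty() {
            self.partitions.push(part);
        }
    }
}

fn main() {
    let mut kp = KeyPartitioning::new();
    for key in [0, 1, 2, 10, 11, 12, 13] {
        kp.add_key(key); // coalesces into 0..3 and 10..14
    }
    kp.repartition(4);
    assert_eq!(kp.partitions, vec![vec![0..3], vec![10..14]]);
}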
@@ -1357,7 +1363,7 @@ impl LayeredTimeline { Ok(()) } - fn compact(&self) -> Result<()> { + fn compact(&self, target_file_size: usize) -> Result<()> { // // High level strategy for compaction / image creation: // @@ -1392,10 +1398,80 @@ impl LayeredTimeline { // Below are functions compact_level0() and create_image_layers() // but they are a bit ad hoc and don't quite work like it's explained // above. Rewrite it. - todo!() + + let lsn = self.last_record_lsn.load().last; + + // 1. The partitioning was already done by the code in + // pgdatadir_mapping.rs. We just use it here. + let partitioning = self.partitioning.read().unwrap(); + if let Some(partitioning) = partitioning.as_ref() { + // 2. Create new image layers for partitions that have been modified + // "enough". + for partition in &partitioning.partitions { + if self.time_for_new_image_layer(partition, lsn, 3)? { + self.create_image_layer(partition, lsn)?; + } + } + // 3. Compact + self.compact_level0(target_file_size)?; + } else { + info!("Could not compact because no partitioning specified yet"); + } + Ok(()) } - fn compact_level0(&self) -> Result<()> { + // Is it time to create a new image layer for the given partition? + fn time_for_new_image_layer(&self, partition: &Vec>, lsn: Lsn, threshold: usize) -> Result { + let layers = self.layers.lock().unwrap(); + + for part_range in partition { + let image_coverage = layers.image_coverage(&part_range, lsn)?; + for (img_range, last_img) in image_coverage { + let img_lsn = if let Some(ref last_img) = last_img { + last_img.get_lsn_range().end + } else { + Lsn(0) + }; + + let num_deltas = layers.get_deltas(&img_range, &(img_lsn..lsn))?.len(); + + info!( + "range {}-{}, has {} deltas on this timeline", + img_range.start, img_range.end, num_deltas + ); + if num_deltas >= threshold { + return Ok(true); + } + } + } + + Ok(false) + } + + fn create_image_layer(&self, partition: &Vec>, lsn: Lsn) -> Result<()> { + let img_range = partition.first().unwrap().start..partition.last().unwrap().end; + let mut image_layer_writer = + ImageLayerWriter::new(self.conf, self.timelineid, self.tenantid, &img_range, lsn)?; + + for range in partition { + let mut key = range.start; + while key < range.end { + let img = self.get(key, lsn)?; + image_layer_writer.put_image(key, &img)?; + key = key.next(); + } + } + let image_layer = image_layer_writer.finish()?; + + let mut layers = self.layers.lock().unwrap(); + layers.insert_historic(Arc::new(image_layer)); + drop(layers); + // FIXME: need to fsync? + + Ok(()) + } + + fn compact_level0(&self, target_file_size: usize) -> Result<()> { let mut layers = self.layers.lock().unwrap(); // We compact or "shuffle" the level-0 delta layers when 10 have @@ -1433,12 +1509,13 @@ impl LayeredTimeline { }); // Merge the contents of all the input delta layers into a new set - // of delta layers. Each output layer is TARGET_FILE_SIZE_BYTES in - // size, i.e. we don't try to align the layer boundaries with the - // image layers or relation boundaries. TODO: we probably should, - // to allow garbage collection to happen earlier. + // of delta layers, based on the current partitioning. // - // TODO: we should also opportunistically garbage collect what we can. + // TODO: this actually divides the layers into fixed-size chunks, not + // based on the partitioning. + // + // TODO: we should also opportunistically materialize and + // garbage collect what we can. 
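// Aside (illustrative sketch, not part of this patch): the shape of the merge
// loop below, standalone. Consume (key, value) pairs in sorted order and roll
// over to a new output "file" once the current one exceeds the target size,
// but only at a key boundary, so all versions of one key stay in the same
// output layer. Types are simplified stand-ins for keys, values, and writers.

fn merge_into_capped_files(
    sorted: &[(u32, &str)], // (key, value), sorted by key
    target_size: usize,
) -> Vec<Vec<(u32, String)>> {
    let mut files: Vec<Vec<(u32, String)>> = Vec::new();
    let mut current: Vec<(u32, String)> = Vec::new();
    let mut current_size = 0;
    let mut prev_key: Option<u32> = None;

    for &(key, value) in sorted {
        // Roll over only when the key changes, so duplicate keys
        // (multiple versions) never straddle two files.
        if let Some(prev) = prev_key {
            if key != prev && current_size > target_size {
                files.push(std::mem::take(&mut current));
                current_size = 0;
            }
        }
        current_size += value.len();
        current.push((key, value.to_string()));
        prev_key = Some(key);
    }
    if !current.is_empty() {
        files.push(current);
    }
    files
}

fn main() {
    let input = [(1, "aaaa"), (1, "bbbb"), (2, "cccc"), (3, "dddd")];
    let files = merge_into_capped_files(&input, 6);
    // Both versions of key 1 stay together, then the file is rolled over.
    assert_eq!(files.len(), 2);
    assert_eq!(files[0].len(), 2);
}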
let mut new_layers = Vec::new(); let mut prev_key: Option = None; let mut writer: Option = None; @@ -1448,8 +1525,7 @@ impl LayeredTimeline { if let Some(prev_key) = prev_key { if key != prev_key && writer.is_some() { let size = writer.as_mut().unwrap().size(); - info!("size is now {}", size); - if size > TARGET_FILE_SIZE_BYTES as u64 { + if size > target_file_size as u64 { new_layers.push(writer.take().unwrap().finish(prev_key.next())?); writer = None; } @@ -1487,193 +1563,6 @@ impl LayeredTimeline { Ok(()) } - /// - /// Create new image layers, to allow garbage collection to remove old files. - /// - fn create_image_layers(&self, threshold: usize) -> Result<()> { - let layers = self.layers.lock().unwrap(); - let lsn = self.last_record_lsn.load().last; - let image_coverage = layers.image_coverage(&(Key::MIN..Key::MAX), lsn)?; - drop(layers); - - debug!( - "create_image_layers called with threshold {} at {}", - threshold, lsn - ); - - // For any range where there has been more than 'threshold' - // deltas on top of the last image, create new image. - // - // TODO: Invent a better heuristic. - // - // - // TODO: add heuristics to greedily include more segments in the - // image layer, if it's otherwise very small. - for (key_range, last_img) in image_coverage { - let img_lsn = if let Some(ref last_img) = last_img { - last_img.get_lsn_range().end - } else { - Lsn(0) - }; - - let layers = self.layers.lock().unwrap(); - let num_deltas = layers.get_deltas(&key_range, &(img_lsn..lsn))?.len(); - drop(layers); - - info!( - "range {}-{} has {} deltas on this timeline", - key_range.start, key_range.end, num_deltas - ); - if num_deltas >= threshold { - self.create_image_layers_for_range(&key_range, last_img, lsn)?; - } - } - - Ok(()) - } - - // Get all distinct Keys present in the given key range. - // - // This is used to figure out which parts of the overall keyspace are in use, to - // divide the keyspace into image layers. - // - // TODO: For a large database, this set could be very large. Use ranges or prefixes - // instead of individual keys. - fn collect_keys( - &self, - key_range: &Range, - img: Option>, - lsn: Lsn, - base_keys: &mut HashSet, - delta_keys: &mut HashSet, - ) -> Result<()> { - info!( - "creating image layer for key range {}-{} at {}", - key_range.start, key_range.end, lsn - ); - - let baseline_lsn = if let Some(img) = img { - // This range is covered by an image layer on this timeline. Iterate over all the keys - img.collect_keys(key_range, base_keys)?; - img.get_lsn_range().end - } else if self.ancestor_timeline.is_some() { - // Need to look at the ancestor for this range. - let ancestor = self.get_ancestor_timeline()?; - - ancestor.collect_keys_recurse(key_range, lsn, base_keys)?; - self.ancestor_lsn - } else { - self.initdb_lsn - }; - - // Ok, we have baseline list of keys from the images now - // Add all keys from all the deltas - let deltas = { - let layers = self.layers.lock().unwrap(); - layers.get_deltas(key_range, &(baseline_lsn..lsn))? 
- }; - - for delta in deltas { - delta.collect_keys(key_range, delta_keys)?; - } - - Ok(()) - } - - fn collect_keys_recurse( - &self, - key_range: &Range, - lsn: Lsn, - keys: &mut HashSet, - ) -> Result<()> { - let layers = self.layers.lock().unwrap(); - let image_coverage = layers.image_coverage(key_range, lsn)?; - drop(layers); - - for (range, last_img) in image_coverage { - let mut tmp_keys = HashSet::new(); - self.collect_keys(&range, last_img, lsn, keys, &mut tmp_keys)?; - keys.extend(tmp_keys); - } - - Ok(()) - } - - /// Create a new set of image layers for the given key range. - fn create_image_layers_for_range( - &self, - key_range: &Range, - img: Option>, - lsn: Lsn, - ) -> Result<()> { - info!( - "creating image layer for {}-{} at {}", - key_range.start, key_range.end, lsn - ); - - // If this gets called multiple times in a row, it's possible that the - // image layer already exists. - let layers = self.layers.lock().unwrap(); - if layers.image_exists(key_range, lsn) { - info!( - "skipping creation of image layer for {}-{} at {} because it already exists", - key_range.start, key_range.end, lsn - ); - return Ok(()); - } - drop(layers); - - let mut base_keys: HashSet = HashSet::new(); - let mut delta_keys: HashSet = HashSet::new(); - self.collect_keys(key_range, img, lsn, &mut base_keys, &mut delta_keys)?; - - if delta_keys.is_empty() { - // Important special case: even though there was delta layers on top of this - // key range, the delta layers didn't contain any updates within the range. - // In that case, if we wrote a new image, it would have identical contents, - // just stamped at a later LSN. Not much point in that. - return Ok(()); - } - - // Divide the key range into roughly TARGET_FILE_SIZE chunks - let mut all_keys_vec: Vec = - base_keys.iter().chain(delta_keys.iter()).cloned().collect(); - all_keys_vec.sort(); - all_keys_vec.dedup(); - - let mut start_idx = 0; - let mut start_key = key_range.start; - while start_idx < all_keys_vec.len() { - let end_idx = std::cmp::min(start_idx + TARGET_FILE_SIZE as usize, all_keys_vec.len()); - let end_key = if end_idx >= all_keys_vec.len() { - key_range.end - } else { - all_keys_vec[end_idx] - }; - - let img_range = start_key..end_key; - - let mut image_layer_writer = - ImageLayerWriter::new(self.conf, self.timelineid, self.tenantid, &img_range, lsn)?; - - for key in all_keys_vec[start_idx..end_idx].iter() { - let img = self.get(*key, lsn)?; - image_layer_writer.put_image(*key, &img)?; - } - let image_layer = image_layer_writer.finish()?; - - let mut layers = self.layers.lock().unwrap(); - layers.insert_historic(Arc::new(image_layer)); - drop(layers); - // FIXME: need to fsync? - - start_idx = end_idx; - start_key = end_key; - } - - Ok(()) - } - /// /// Garbage collect layer files on a timeline that are no longer needed. 
/// @@ -2000,6 +1889,8 @@ mod tests { Ok(()) } + const TEST_FILE_SIZE: usize = 4 * 1024 * 1024; + #[test] fn test_images() -> Result<()> { let repo = RepoHarness::create("test_images")?.load(); @@ -2014,7 +1905,7 @@ mod tests { drop(writer); tline.checkpoint(CheckpointConfig::Forced)?; - tline.create_images(1)?; + tline.compact(TEST_FILE_SIZE)?; let writer = tline.writer(); writer.put(TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?; @@ -2022,7 +1913,7 @@ mod tests { drop(writer); tline.checkpoint(CheckpointConfig::Forced)?; - tline.create_images(1)?; + tline.compact(TEST_FILE_SIZE)?; let writer = tline.writer(); writer.put(TEST_KEY, Lsn(0x30), Value::Image(TEST_IMG("foo at 0x30")))?; @@ -2030,7 +1921,7 @@ mod tests { drop(writer); tline.checkpoint(CheckpointConfig::Forced)?; - tline.create_images(1)?; + tline.compact(TEST_FILE_SIZE)?; let writer = tline.writer(); writer.put(TEST_KEY, Lsn(0x40), Value::Image(TEST_IMG("foo at 0x40")))?; @@ -2038,7 +1929,7 @@ mod tests { drop(writer); tline.checkpoint(CheckpointConfig::Forced)?; - tline.create_images(1)?; + tline.compact(TEST_FILE_SIZE)?; assert_eq!(tline.get(TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); assert_eq!(tline.get(TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); @@ -2074,7 +1965,7 @@ mod tests { blknum += 1; } tline.checkpoint(CheckpointConfig::Forced)?; - //tline.create_images(1)?; + tline.compact(TEST_FILE_SIZE)?; } Ok(()) @@ -2085,12 +1976,15 @@ mod tests { let repo = RepoHarness::create("test_random_updates")?.load(); let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - const NUM_KEYS: usize = 20000; + const NUM_KEYS: usize = 1000; let mut lsn = Lsn(0x10); let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap(); let mut blknum = 0; + + let mut parts = KeyPartitioning::new(); + for _ in 0..NUM_KEYS { test_key.field6 = blknum; let writer = tline.writer(); @@ -2102,13 +1996,18 @@ mod tests { writer.advance_last_record_lsn(lsn); drop(writer); + parts.add_key(test_key); + lsn = Lsn(lsn.0 + 0x10); blknum += 1; } - for _ in 0..100 { + parts.repartition(TEST_FILE_SIZE as u64); + tline.hint_partitioning(parts)?; + + for _ in 0..50 { for _ in 0..NUM_KEYS { - blknum = thread_rng().gen_range(0..10000); + blknum = thread_rng().gen_range(0..NUM_KEYS) as u32; test_key.field6 = blknum; let writer = tline.writer(); writer.put( @@ -2125,7 +2024,7 @@ mod tests { let cutoff = tline.get_last_record_lsn(); tline.update_gc_info(Vec::new(), cutoff); tline.checkpoint(CheckpointConfig::Forced)?; - tline.create_images(3)?; + tline.compact(TEST_FILE_SIZE)?; tline.gc()?; } diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index e7d916d1e4..3c5acbc53a 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -150,13 +150,15 @@ impl Layer for DeltaLayer { fn get_value_reconstruct_data( &self, - lsn_floor: Lsn, + key: Key, + lsn_range: Range, reconstruct_state: &mut ValueReconstructState, ) -> Result { let mut need_image = true; - assert!(self.key_range.contains(&reconstruct_state.key)); + assert!(self.key_range.contains(&key)); + /* FIXME match &reconstruct_state.img { Some((cached_lsn, _)) if &self.lsn_range.end <= cached_lsn => { reconstruct_state.lsn = *cached_lsn; @@ -164,6 +166,7 @@ impl Layer for DeltaLayer { } _ => {} } + */ { // Open the file and lock the metadata in memory @@ -175,16 +178,17 @@ impl Layer for DeltaLayer { .chapter_reader(VALUES_CHAPTER)?; // Scan the page 
versions backwards, starting from `lsn`. - if let Some(vec_map) = inner.index.get(&reconstruct_state.key) { - let slice = vec_map.slice_range(lsn_floor..=reconstruct_state.lsn); + if let Some(vec_map) = inner.index.get(&key) { + let slice = vec_map.slice_range(lsn_range); for (entry_lsn, pos) in slice.iter().rev() { + /* FIXME match &reconstruct_state.img { Some((cached_lsn, _)) if entry_lsn <= cached_lsn => { - reconstruct_state.lsn = *cached_lsn; return Ok(ValueReconstructResult::Complete); } _ => {} } + */ let val = Value::des(&utils::read_blob_from_chapter(&values_reader, *pos)?)?; match val { @@ -211,7 +215,6 @@ impl Layer for DeltaLayer { // If an older page image is needed to reconstruct the page, let the // caller know. if need_image { - reconstruct_state.lsn = Lsn(self.lsn_range.start.0 - 1); Ok(ValueReconstructResult::Continue) } else { Ok(ValueReconstructResult::Complete) diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index c13293105a..a8772632ab 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -130,13 +130,14 @@ impl Layer for ImageLayer { /// Look up given page in the file fn get_value_reconstruct_data( &self, - lsn_floor: Lsn, + key: Key, + lsn_range: Range, reconstruct_state: &mut ValueReconstructState, ) -> Result { - assert!(lsn_floor <= self.lsn); - assert!(self.key_range.contains(&reconstruct_state.key)); - assert!(reconstruct_state.lsn >= self.lsn); + assert!(self.key_range.contains(&key)); + assert!(lsn_range.end >= self.lsn); + /* FIXME match reconstruct_state.img { Some((cached_lsn, _)) if self.lsn <= cached_lsn => { reconstruct_state.lsn = cached_lsn; @@ -144,10 +145,11 @@ impl Layer for ImageLayer { } _ => {} } + */ let inner = self.load()?; - if let Some(offset) = inner.index.get(&reconstruct_state.key) { + if let Some(offset) = inner.index.get(&key) { let chapter = inner .book .as_ref() @@ -164,10 +166,8 @@ impl Layer for ImageLayer { let value = Bytes::from(blob); reconstruct_state.img = Some((self.lsn, value)); - reconstruct_state.lsn = self.lsn; Ok(ValueReconstructResult::Complete) } else { - reconstruct_state.lsn = self.lsn; Ok(ValueReconstructResult::Missing) } } diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index c3ca2fd091..b6f06d143b 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -121,17 +121,18 @@ impl Layer for InMemoryLayer { /// Look up given value in the layer. fn get_value_reconstruct_data( &self, - lsn_floor: Lsn, + key: Key, + lsn_range: Range, reconstruct_state: &mut ValueReconstructState, ) -> Result { - assert!(lsn_floor <= self.start_lsn); + assert!(lsn_range.start <= self.start_lsn); let mut need_image = true; let inner = self.inner.read().unwrap(); // Scan the page versions backwards, starting from `lsn`. 
- if let Some(vec_map) = inner.index.get(&reconstruct_state.key) { - let slice = vec_map.slice_range(lsn_floor..=reconstruct_state.lsn); + if let Some(vec_map) = inner.index.get(&key) { + let slice = vec_map.slice_range(lsn_range); for (entry_lsn, pos) in slice.iter().rev() { match &reconstruct_state.img { Some((cached_lsn, _)) if entry_lsn <= cached_lsn => { @@ -144,8 +145,6 @@ impl Layer for InMemoryLayer { match value { Value::Image(img) => { reconstruct_state.img = Some((*entry_lsn, img)); - - reconstruct_state.lsn = *entry_lsn; return Ok(ValueReconstructResult::Complete); } Value::WalRecord(rec) => { @@ -166,7 +165,6 @@ impl Layer for InMemoryLayer { // If an older page image is needed to reconstruct the page, let the // caller know. if need_image { - reconstruct_state.lsn = Lsn(self.start_lsn.0 - 1); Ok(ValueReconstructResult::Continue) } else { Ok(ValueReconstructResult::Complete) diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index 1c8a4c6860..bd687b54ab 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -66,7 +66,7 @@ pub struct SearchResult { } impl LayerMap { - pub fn search(&self, key: Key, lsn: Lsn) -> Result> { + pub fn search(&self, key: Key, end_lsn: Lsn) -> Result> { // linear search // Find the latest image layer that covers the given key let mut latest_img: Option> = None; @@ -80,15 +80,15 @@ impl LayerMap { } let img_lsn = l.get_lsn_range().start; - if img_lsn > lsn { + if img_lsn >= end_lsn { // too new continue; } - if img_lsn == lsn { + if Lsn(img_lsn.0 + 1) == end_lsn { // found exact match return Ok(Some(SearchResult { layer: Arc::clone(l), - lsn_floor: lsn, + lsn_floor: img_lsn, })); } if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) { @@ -107,19 +107,19 @@ impl LayerMap { continue; } - if l.get_lsn_range().start > lsn { + if l.get_lsn_range().start >= end_lsn { // too new continue; } - if l.get_lsn_range().end > lsn { + if l.get_lsn_range().end >= end_lsn { // this layer contains the requested point in the key/lsn space. 
// No need to search any further - info!( + trace!( "found layer {} for request on {} at {}", l.filename().display(), key, - lsn + end_lsn ); latest_delta.replace(Arc::clone(l)); break; @@ -135,40 +135,33 @@ impl LayerMap { } } if let Some(l) = latest_delta { - info!( + trace!( "found (old) layer {} for request on {} at {}", l.filename().display(), key, - lsn + end_lsn ); + let lsn_floor = if let Some(latest_img_lsn) = latest_img_lsn { + Lsn(latest_img_lsn.0 + 1) + } else { + l.get_lsn_range().start + }; Ok(Some(SearchResult { - lsn_floor: latest_img_lsn.unwrap_or(l.get_lsn_range().start), + lsn_floor, layer: l, })) } else if let Some(l) = latest_img { - info!("found img layer and no deltas for request on {} at {}", key, lsn); + trace!("found img layer and no deltas for request on {} at {}", key, end_lsn); Ok(Some(SearchResult { lsn_floor: latest_img_lsn.unwrap(), layer: l, })) } else { - info!("no layer found for request on {} at {}", key, lsn); + trace!("no layer found for request on {} at {}", key, end_lsn); Ok(None) } } - pub fn image_exists(&self, key_range: &Range, lsn: Lsn) -> bool { - for l in self.historic_layers.iter() { - if !l.is_incremental() - && l.get_key_range() == *key_range - && l.get_lsn_range().start == lsn - { - return true; - } - } - false - } - /// /// Insert an on-disk layer /// diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index eb31a9c415..c64a472a36 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -13,11 +13,6 @@ use std::path::PathBuf; use zenith_utils::lsn::Lsn; -// in # of key-value pairs -// FIXME Size of one segment in pages (128 MB) -pub const TARGET_FILE_SIZE_BYTES: u64 = 128 * 1024 * 1024; -pub const TARGET_FILE_SIZE: u32 = (TARGET_FILE_SIZE_BYTES / 8192) as u32; - pub fn range_overlaps(a: &Range, b: &Range) -> bool where T: PartialOrd, @@ -49,12 +44,8 @@ where /// #[derive(Debug)] pub struct ValueReconstructState { - pub key: Key, - pub lsn: Lsn, pub records: Vec<(Lsn, ZenithWalRecord)>, pub img: Option<(Lsn, Bytes)>, - - pub request_lsn: Lsn, // original request's LSN, for debugging purposes } /// Return value from Layer::get_page_reconstruct_data @@ -117,7 +108,8 @@ pub trait Layer: Send + Sync { /// collect more data. fn get_value_reconstruct_data( &self, - lsn_floor: Lsn, + key: Key, + lsn_range: Range, reconstruct_data: &mut ValueReconstructState, ) -> Result; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index af4548ebe9..e617316cda 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -3,6 +3,7 @@ pub mod branches; pub mod config; pub mod http; pub mod import_datadir; +pub mod keyspace; pub mod layered_repository; pub mod page_cache; pub mod page_service; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 829533d65f..1c1f37ef82 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -7,6 +7,7 @@ //! Clarify that) //! 
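The layer-map change above makes the search LSN end-exclusive and has `search()` hand back an explicit `lsn_floor` alongside the chosen layer, so the caller knows how far down that layer's data reaches before an older image layer takes over. The following is a minimal, self-contained sketch of that selection rule; `ToyLayer` and this `search` are illustrative stand-ins, not the pageserver's actual types:

    // Simplified model of LayerMap::search: `end_lsn` is exclusive, and the
    // returned `lsn_floor` is clamped to just above the newest image layer,
    // since the image supersedes anything older.
    use std::cmp::max;
    use std::ops::Range;

    type Lsn = u64;

    struct ToyLayer {
        lsn_range: Range<Lsn>, // assume an image layer covers start..start+1
        is_image: bool,
    }

    struct SearchResult<'a> {
        layer: &'a ToyLayer,
        lsn_floor: Lsn,
    }

    fn search(layers: &[ToyLayer], end_lsn: Lsn) -> Option<SearchResult<'_>> {
        // Newest image layer below the (exclusive) search point.
        let latest_img = layers
            .iter()
            .filter(|l| l.is_image && l.lsn_range.start < end_lsn)
            .max_by_key(|l| l.lsn_range.start);
        let latest_img_lsn = latest_img.map(|l| l.lsn_range.start);

        // Newest delta layer that starts below the search point.
        let latest_delta = layers
            .iter()
            .filter(|l| !l.is_image && l.lsn_range.start < end_lsn)
            .max_by_key(|l| l.lsn_range.start);

        if let Some(delta) = latest_delta {
            // Never descend below the newest image: the floor starts one past it.
            let lsn_floor = max(
                latest_img_lsn.map_or(0, |lsn| lsn + 1),
                delta.lsn_range.start,
            );
            Some(SearchResult { layer: delta, lsn_floor })
        } else {
            latest_img.map(|img| SearchResult {
                layer: img,
                lsn_floor: img.lsn_range.start,
            })
        }
    }
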
+use crate::keyspace::{KeyPartitioning, TARGET_FILE_SIZE_BYTES}; use crate::relish::*; use crate::repository::*; use crate::repository::{Repository, Timeline}; @@ -30,11 +31,12 @@ where R: Repository, { pub tline: Arc, + pub last_partitioning: Option, } #[derive(Debug, Serialize, Deserialize)] pub struct DbDirectory { - // (dbnode, spcnode) + // (spcnode, dbnode) dbs: HashSet<(Oid, Oid)>, } @@ -67,7 +69,10 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); impl DatadirTimeline { pub fn new(tline: Arc) -> Self { - DatadirTimeline { tline } + DatadirTimeline { + tline, + last_partitioning: None, + } } //------------------------------------------------------------------------------ @@ -157,7 +162,7 @@ impl DatadirTimeline { } /// Get a list of all existing relations in given tablespace and database. - pub fn list_rels(&self, spcnode: u32, dbnode: u32, lsn: Lsn) -> Result> { + pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); let buf = self.tline.get(key, lsn)?; @@ -292,6 +297,65 @@ impl DatadirTimeline { //todo!() Ok(0) } + + fn collect_keyspace(&self, lsn: Lsn) -> Result { + // Iterate through key ranges, greedily packing them into partitions + let mut result = KeyPartitioning::new(); + + // Add dbdir + result.add_key(DBDIR_KEY); + + // Fetch list of database dirs and iterate them + let buf = self.tline.get(DBDIR_KEY, lsn)?; + let dbdir = DbDirectory::des(&buf)?; + + let mut dbs: Vec<(Oid, Oid)> = dbdir.dbs.iter().cloned().collect(); + dbs.sort(); + for (spcnode, dbnode) in dbs { + result.add_key(relmap_file_key(spcnode, dbnode)); + let mut rels: Vec = self.list_rels(spcnode, dbnode, lsn)?.iter().cloned().collect(); + rels.sort(); + for rel in rels { + let relsize_key = rel_size_to_key(rel); + let mut buf = self.tline.get(relsize_key, lsn)?; + let relsize = buf.get_u32_le(); + + result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize)); + result.add_key(relsize_key); + } + } + + // Iterate SLRUs next + for kind in [SlruKind::Clog, SlruKind:: MultiXactMembers, SlruKind::MultiXactOffsets] { + let slrudir_key = slru_dir_to_key(kind); + let buf = self.tline.get(slrudir_key, lsn)?; + let dir = SlruSegmentDirectory::des(&buf)?; + let mut segments: Vec = dir.segments.iter().cloned().collect(); + segments.sort(); + for segno in segments { + let segsize_key = slru_segment_size_to_key(kind, segno); + let mut buf = self.tline.get(segsize_key, lsn)?; + let segsize = buf.get_u32_le(); + + result.add_range(slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize)); + result.add_key(segsize_key); + } + } + + // Then pg_twophase + let buf = self.tline.get(TWOPHASEDIR_KEY, lsn)?; + let twophase_dir = TwoPhaseDirectory::des(&buf)?; + let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); + xids.sort(); + for xid in xids { + result.add_key(twophase_file_key(xid)); + } + + result.add_key(CONTROLFILE_KEY); + result.add_key(CHECKPOINT_KEY); + + Ok(result) + } } pub struct DatadirTimelineWriter<'a, R: Repository> { @@ -628,6 +692,8 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { pub fn finish(self) -> Result<()> { let writer = self.tline.tline.writer(); + let last_partitioning = self.last_partitioning.unwrap_or(Lsn(0)); + for (key, value) in self.pending_updates { writer.put(key, self.lsn, value)?; } @@ -637,6 +703,12 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { writer.advance_last_record_lsn(self.lsn); + if self.lsn.0 - last_partitioning.0 > 
TARGET_FILE_SIZE_BYTES / 8 { + let mut partitioning = self.tline.collect_keyspace(self.lsn)?; + partitioning.repartition(TARGET_FILE_SIZE_BYTES); + self.tline.tline.hint_partitioning(partitioning)?; + } + Ok(()) } @@ -971,40 +1043,6 @@ pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> { }) } -pub fn key_to_relish_block(key: Key) -> Result<(RelishTag, BlockNumber)> { - // FIXME: there's got to be a bitfields crate or something out there to do this for us.. - - // This only works for keys for blocks that are handled by WalRedo manager. - // TODO: assert that the other fields are zero - - Ok(match key.field1 { - 0x00 => ( - RelishTag::Relation(RelTag { - spcnode: key.field2, - dbnode: key.field3, - relnode: key.field4, - forknum: key.field5, - }), - key.field6, - ), - - 0x01 => ( - RelishTag::Slru { - slru: match key.field2 { - 0x00 => SlruKind::Clog, - 0x01 => SlruKind::MultiXactMembers, - 0x02 => SlruKind::MultiXactOffsets, - _ => bail!("unrecognized slru kind 0x{:02x}", key.field2), - }, - segno: key.field4, - }, - key.field6, - ), - - _ => bail!("unrecognized value kind 0x{:02x}", key.field1), - }) -} - pub fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range { Key { field1: 0x00, diff --git a/pageserver/src/relish.rs b/pageserver/src/relish.rs index 9228829aef..7304c41a0e 100644 --- a/pageserver/src/relish.rs +++ b/pageserver/src/relish.rs @@ -1,4 +1,6 @@ //! +//! FIXME: relishes are obsolete +//! //! Zenith stores PostgreSQL relations, and some other files, in the //! repository. The relations (i.e. tables and indexes) take up most //! of the space in a typical installation, while the other files are @@ -27,107 +29,7 @@ use serde::{Deserialize, Serialize}; use std::fmt; use postgres_ffi::relfile_utils::forknumber_to_name; -use postgres_ffi::{Oid, TransactionId}; - -/// -/// RelishTag identifies one relish. -/// -#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] -pub enum RelishTag { - // Relations correspond to PostgreSQL relation forks. Each - // PostgreSQL relation fork is considered a separate relish. - Relation(RelTag), - - // SLRUs include pg_clog, pg_multixact/members, and - // pg_multixact/offsets. There are other SLRUs in PostgreSQL, but - // they don't need to be stored permanently (e.g. pg_subtrans), - // or we do not support them in zenith yet (pg_commit_ts). - // - // These are currently never requested directly by the compute - // nodes, although in principle that would be possible. However, - // when a new compute node is created, these are included in the - // tarball that we send to the compute node to initialize the - // PostgreSQL data directory. - // - // Each SLRU segment in PostgreSQL is considered a separate - // relish. For example, pg_clog/0000, pg_clog/0001, and so forth. - // - // SLRU segments are divided into blocks, like relations. - Slru { slru: SlruKind, segno: u32 }, - - // Miscellaneous other files that need to be included in the - // tarball at compute node creation. These are non-blocky, and are - // expected to be small. - - // - // FileNodeMap represents PostgreSQL's 'pg_filenode.map' - // files. They are needed to map catalog table OIDs to filenode - // numbers. Usually the mapping is done by looking up a relation's - // 'relfilenode' field in the 'pg_class' system table, but that - // doesn't work for 'pg_class' itself and a few other such system - // relations. See PostgreSQL relmapper.c for details. 
- // - // Each database has a map file for its local mapped catalogs, - // and there is a separate map file for shared catalogs. - // - // These files are always 512 bytes long (although we don't check - // or care about that in the page server). - // - FileNodeMap { spcnode: Oid, dbnode: Oid }, - - // - // State files for prepared transactions (e.g pg_twophase/1234) - // - TwoPhase { xid: TransactionId }, - - // The control file, stored in global/pg_control - ControlFile, - - // Special entry that represents PostgreSQL checkpoint. It doesn't - // correspond to to any physical file in PostgreSQL, but we use it - // to track fields needed to restore the checkpoint data in the - // control file, when a compute node is created. - Checkpoint, -} - -impl RelishTag { - pub const fn is_blocky(&self) -> bool { - match self { - // These relishes work with blocks - RelishTag::Relation(_) | RelishTag::Slru { slru: _, segno: _ } => true, - - // and these don't - RelishTag::FileNodeMap { - spcnode: _, - dbnode: _, - } - | RelishTag::TwoPhase { xid: _ } - | RelishTag::ControlFile - | RelishTag::Checkpoint => false, - } - } - - // Physical relishes represent files and use - // RelationSizeEntry to track existing and dropped files. - // They can be both blocky and non-blocky. - pub const fn is_physical(&self) -> bool { - match self { - // These relishes represent physical files - RelishTag::Relation(_) - | RelishTag::Slru { .. } - | RelishTag::FileNodeMap { .. } - | RelishTag::TwoPhase { .. } => true, - - // and these don't - RelishTag::ControlFile | RelishTag::Checkpoint => false, - } - } - - // convenience function to check if this relish is a normal relation. - pub const fn is_relation(&self) -> bool { - matches!(self, RelishTag::Relation(_)) - } -} +use postgres_ffi::Oid; /// /// Relation data file segment id throughout the Postgres cluster. @@ -170,34 +72,6 @@ impl fmt::Display for RelTag { } } -/// Display RelTag in the same format that's used in most PostgreSQL debug messages: -/// -/// //[_fsm|_vm|_init] -/// -impl fmt::Display for RelishTag { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - RelishTag::Relation(rel) => rel.fmt(f), - RelishTag::Slru { slru, segno } => { - // e.g. pg_clog/0001 - write!(f, "{}/{:04X}", slru.to_str(), segno) - } - RelishTag::FileNodeMap { spcnode, dbnode } => { - write!(f, "relmapper file for spc {} db {}", spcnode, dbnode) - } - RelishTag::TwoPhase { xid } => { - write!(f, "pg_twophase/{:08X}", xid) - } - RelishTag::ControlFile => { - write!(f, "control file") - } - RelishTag::Checkpoint => { - write!(f, "checkpoint") - } - } - } -} - /// /// Non-relation transaction status files (clog (a.k.a. 
pg_xact) and /// pg_multixact) in Postgres are handled by SLRU (Simple LRU) buffer, diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index b1aedd984a..ce7bfd5013 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -1,3 +1,4 @@ +use crate::keyspace::KeyPartitioning; use crate::walrecord::ZenithWalRecord; use crate::CheckpointConfig; use anyhow::{bail, Result}; @@ -27,26 +28,30 @@ pub struct Key { impl Key { pub fn next(&self) -> Key { + self.add(1) + } + + pub fn add(&self, x: u32) -> Key { let mut key = self.clone(); - let x = key.field6.overflowing_add(1); - key.field6 = x.0; - if x.1 { - let x = key.field5.overflowing_add(1); - key.field5 = x.0; - if x.1 { - let x = key.field4.overflowing_add(1); - key.field4 = x.0; - if x.1 { - let x = key.field3.overflowing_add(1); - key.field3 = x.0; - if x.1 { - let x = key.field2.overflowing_add(1); - key.field2 = x.0; - if x.1 { - let x = key.field1.overflowing_add(1); - key.field1 = x.0; - assert!(!x.1); + let r = key.field6.overflowing_add(x); + key.field6 = r.0; + if r.1 { + let r = key.field5.overflowing_add(1); + key.field5 = r.0; + if r.1 { + let r = key.field4.overflowing_add(1); + key.field4 = r.0; + if r.1 { + let r = key.field3.overflowing_add(1); + key.field3 = r.0; + if r.1 { + let r = key.field2.overflowing_add(1); + key.field2 = r.0; + if r.1 { + let r = key.field1.overflowing_add(1); + key.field1 = r.0; + assert!(!r.1); } } } @@ -56,6 +61,35 @@ impl Key { } } + + +pub fn key_range_size(key_range: &Range) -> u32 { + let start = key_range.start; + let end = key_range.end; + + if end.field1 != start.field1 || + end.field2 != start.field2 || + end.field3 != start.field3 || + end.field4 != start.field4 + { + return u32::MAX; + } + + let start = (start.field5 as u64) << 32 | start.field6 as u64; + let end = (end.field5 as u64) << 32 | end.field6 as u64; + + let diff = end - start; + if diff > u32::MAX as u64 { + u32::MAX + } else { + diff as u32 + } +} + +pub fn singleton_range(key: Key) -> Range { + key..key.next() +} + impl fmt::Display for Key { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( @@ -323,7 +357,7 @@ pub trait Timeline: Send + Sync { /// know anything about them here in the repository. fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>; - fn create_images(&self, threshold: usize) -> Result<()>; + fn hint_partitioning(&self, partitioning: KeyPartitioning) -> Result<()>; /// /// Check that it is valid to request operations with that lsn. From 798ff26fb0f1c381d022b041b322575ae1e6081b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 8 Mar 2022 13:27:42 +0200 Subject: [PATCH 05/55] More work on compaction, and resurrect some unit tests --- pageserver/src/keyspace.rs | 1 + pageserver/src/layered_repository.rs | 25 +- .../src/layered_repository/layer_map.rs | 21 +- .../src/layered_repository/storage_layer.rs | 7 + pageserver/src/repository.rs | 231 ++++++++++-------- 5 files changed, 179 insertions(+), 106 deletions(-) diff --git a/pageserver/src/keyspace.rs b/pageserver/src/keyspace.rs index 2b490d6ffe..4e1ebdd32a 100644 --- a/pageserver/src/keyspace.rs +++ b/pageserver/src/keyspace.rs @@ -12,6 +12,7 @@ pub const TARGET_FILE_SIZE: usize = (TARGET_FILE_SIZE_BYTES / 8192) as usize; /// /// Represents a set of Keys, in a compact form. 
/// +#[derive(Debug, Clone)] pub struct KeyPartitioning { accum: Option>, diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 5169717818..cd1b723651 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1433,7 +1433,7 @@ impl LayeredTimeline { Lsn(0) }; - let num_deltas = layers.get_deltas(&img_range, &(img_lsn..lsn))?.len(); + let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; info!( "range {}-{}, has {} deltas on this timeline", @@ -1889,6 +1889,10 @@ mod tests { Ok(()) } + // Target file size in the unit tests. In production, the target + // file size is much larger, maybe 1 GB. But a small size makes it + // much faster to exercise all the logic for creating the files, + // garbage collection, compaction etc. const TEST_FILE_SIZE: usize = 4 * 1024 * 1024; #[test] @@ -1940,6 +1944,10 @@ mod tests { Ok(()) } + // + // Insert a bunch of key-value pairs with increasing keys, checkpoint, + // repeat 100 times. + // #[test] fn test_bulk_insert() -> Result<()> { let repo = RepoHarness::create("test_bulk_insert")?.load(); @@ -1947,10 +1955,12 @@ mod tests { let mut lsn = Lsn(0x10); + let mut parts = KeyPartitioning::new(); + let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap(); let mut blknum = 0; - for _ in 1..100 { - for _ in 1..10000 { + for _ in 0..50 { + for _ in 0..1000 { test_key.field6 = blknum; let writer = tline.writer(); writer.put( @@ -1961,11 +1971,20 @@ mod tests { writer.advance_last_record_lsn(lsn); drop(writer); + parts.add_key(test_key); + lsn = Lsn(lsn.0 + 0x10); blknum += 1; } + + let cutoff = tline.get_last_record_lsn(); + parts.repartition(TEST_FILE_SIZE as u64); + tline.hint_partitioning(parts.clone())?; + + tline.update_gc_info(Vec::new(), cutoff); tline.checkpoint(CheckpointConfig::Forced)?; tline.compact(TEST_FILE_SIZE)?; + tline.gc()?; } Ok(()) diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index bd687b54ab..1ad6351539 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -10,7 +10,7 @@ //! corresponding files are written to disk. //! -use crate::layered_repository::storage_layer::range_overlaps; +use crate::layered_repository::storage_layer::{range_eq, range_overlaps}; use crate::layered_repository::storage_layer::Layer; use crate::layered_repository::InMemoryLayer; use crate::repository::Key; @@ -331,12 +331,12 @@ impl LayerMap { Ok(ranges) } - pub fn get_deltas( + pub fn count_deltas( &self, key_range: &Range, lsn_range: &Range, - ) -> Result>> { - let mut deltas = Vec::new(); + ) -> Result { + let mut result = 0; for l in self.historic_layers.iter() { if !l.is_incremental() { continue; @@ -347,9 +347,18 @@ impl LayerMap { if !range_overlaps(&l.get_key_range(), key_range) { continue; } - deltas.push(Arc::clone(l)); + + // We ignore level0 delta layers. 
Unless the whole keyspace fits + // into one partition + if !range_eq(key_range, &(Key::MIN..Key::MAX)) && + range_eq(&l.get_key_range(), &(Key::MIN..Key::MAX)) + { + continue; + } + + result += 1; } - Ok(deltas) + Ok(result) } pub fn get_level0_deltas(&self) -> Result>> { diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index c64a472a36..7ddaf7a581 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -24,6 +24,13 @@ where } } +pub fn range_eq(a: &Range, b: &Range) -> bool +where + T: PartialEq, +{ + a.start == b.start && a.end == b.end +} + /// FIXME /// Struct used to communicate across calls to 'get_page_reconstruct_data'. /// diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index ce7bfd5013..1abf2c365a 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -59,6 +59,17 @@ impl Key { } key } + + pub fn from_array(b: [u8; 18]) -> Self { + Key { + field1: b[0], + field2: u32::from_be_bytes(b[1..5].try_into().unwrap()), + field3: u32::from_be_bytes(b[5..9].try_into().unwrap()), + field4: u32::from_be_bytes(b[9..13].try_into().unwrap()), + field5: b[13], + field6: u32::from_be_bytes(b[14..18].try_into().unwrap()), + } + } } @@ -521,28 +532,31 @@ mod tests { //use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT}; //use std::sync::Arc; use bytes::BytesMut; + use hex_literal::hex; + use lazy_static::lazy_static; + + lazy_static! { + static ref TEST_KEY: Key = Key::from_array(hex!("112222222233333333444444445500000001")); + } #[test] fn test_basic() -> Result<()> { let repo = RepoHarness::create("test_basic")?.load(); let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - #[allow(non_snake_case)] - let TEST_KEY: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); - let writer = tline.writer(); - writer.put(TEST_KEY, Lsn(0x10), Value::Image(TEST_IMG("foo at 0x10")))?; + writer.put(*TEST_KEY, Lsn(0x10), Value::Image(TEST_IMG("foo at 0x10")))?; writer.advance_last_record_lsn(Lsn(0x10)); drop(writer); let writer = tline.writer(); - writer.put(TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?; + writer.put(*TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?; writer.advance_last_record_lsn(Lsn(0x20)); drop(writer); - assert_eq!(tline.get(TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); - assert_eq!(tline.get(TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); + assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20")); Ok(()) } @@ -610,123 +624,146 @@ mod tests { Ok(()) } - /* // FIXME: Garbage collection is broken - #[test] - fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> { - let repo = - RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load_page_repo(); + fn make_some_layers(tline: &T, start_lsn: Lsn) -> Result<()> { + let mut lsn = start_lsn; + #[allow(non_snake_case)] + { + let writer = tline.writer(); + // Create a relation on the timeline + writer.put(*TEST_KEY, lsn, Value::Image(TEST_IMG(&format!("foo at {}", lsn))))?; + writer.advance_last_record_lsn(lsn); + lsn += 0x10; + writer.put(*TEST_KEY, lsn, Value::Image(TEST_IMG(&format!("foo at {}", lsn))))?; + 
writer.advance_last_record_lsn(lsn); + lsn += 0x10; + } + tline.checkpoint(CheckpointConfig::Forced)?; + { + let writer = tline.writer(); + writer.put(*TEST_KEY, lsn, Value::Image(TEST_IMG(&format!("foo at {}", lsn))))?; + writer.advance_last_record_lsn(lsn); + lsn += 0x10; + writer.put(*TEST_KEY, lsn, Value::Image(TEST_IMG(&format!("foo at {}", lsn))))?; + writer.advance_last_record_lsn(lsn); + } + tline.checkpoint(CheckpointConfig::Forced) + } - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - make_some_layers(&tline, Lsn(0x20))?; + #[test] + fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> { + let repo = + RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + make_some_layers(tline.as_ref(), Lsn(0x20))?; - // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 - repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; + // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 + // FIXME: this doesn't actually remove any layer currently, given how the checkpointing + // and compaction works. But it does set the 'cutoff' point so that the cross check + // below should fail. + repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; - // try to branch at lsn 25, should fail because we already garbage collected the data - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) { - Ok(_) => panic!("branching should have failed"), - Err(err) => { - assert!(err.to_string().contains("invalid branch start lsn")); - assert!(err + // try to branch at lsn 25, should fail because we already garbage collected the data + match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) { + Ok(_) => panic!("branching should have failed"), + Err(err) => { + assert!(err.to_string().contains("invalid branch start lsn")); + assert!(err .source() .unwrap() .to_string() .contains("we might've already garbage collected needed data")) - } } - - Ok(()) } - #[test] - fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> { - let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load_page_repo(); + Ok(()) + } - repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; - // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 - match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) { - Ok(_) => panic!("branching should have failed"), - Err(err) => { - assert!(&err.to_string().contains("invalid branch start lsn")); - assert!(&err + #[test] + fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> { + let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load(); + + repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?; + // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 + match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) { + Ok(_) => panic!("branching should have failed"), + Err(err) => { + assert!(&err.to_string().contains("invalid branch start lsn")); + assert!(&err .source() .unwrap() .to_string() .contains("is earlier than latest GC horizon")); - } } - - Ok(()) } - #[test] - fn test_prohibit_get_page_at_lsn_for_garbage_collected_pages() -> Result<()> { - let repo = - RepoHarness::create("test_prohibit_get_page_at_lsn_for_garbage_collected_pages")? 
- .load_page_repo(); + Ok(()) + } - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - make_some_layers(&tline, Lsn(0x20))?; + /* + // FIXME: This currently fails to error out. Calling GC doesn't currently + // remove the old value, we'd need to work a little harder + #[test] + fn test_prohibit_get_for_garbage_collected_data() -> Result<()> { + let repo = + RepoHarness::create("test_prohibit_get_for_garbage_collected_data")? + .load(); - repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; - let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); - assert!(*latest_gc_cutoff_lsn > Lsn(0x25)); - // FIXME: GC is currently disabled, so this still works - /* - match tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)) { - Ok(_) => panic!("request for page should have failed"), - Err(err) => assert!(err.to_string().contains("not found at")), - } - */ - Ok(()) + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + make_some_layers(tline.as_ref(), Lsn(0x20))?; + + repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; + let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn(); + assert!(*latest_gc_cutoff_lsn > Lsn(0x25)); + match tline.get(*TEST_KEY, Lsn(0x25)) { + Ok(_) => panic!("request for page should have failed"), + Err(err) => assert!(err.to_string().contains("not found at")), } + Ok(()) + } + */ - #[test] - fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { - let repo = - RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load_page_repo(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - make_some_layers(&tline, Lsn(0x20))?; + #[test] + fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> { + let repo = + RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + make_some_layers(tline.as_ref(), Lsn(0x20))?; - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; - let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { - Some(timeline) => timeline, - None => panic!("Should have a local timeline"), - }; + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; + let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { + Some(timeline) => timeline, + None => panic!("Should have a local timeline"), + }; - // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 - repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; - assert!(newtline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)).is_ok()); + // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 + repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; + assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok()); - Ok(()) - } + Ok(()) + } + #[test] + fn test_parent_keeps_data_forever_after_branching() -> Result<()> { + let repo = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?.load(); + let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + make_some_layers(tline.as_ref(), Lsn(0x20))?; - #[test] - fn test_parent_keeps_data_forever_after_branching() -> Result<()> { - let harness = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?; - let repo = harness.load_page_repo(); - let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; - make_some_layers(&tline, Lsn(0x20))?; + repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; + let 
newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { + Some(timeline) => timeline, + None => panic!("Should have a local timeline"), + }; - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?; - let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { - Some(timeline) => timeline, - None => panic!("Should have a local timeline"), - }; + make_some_layers(newtline.as_ref(), Lsn(0x60))?; - make_some_layers(&newtline, Lsn(0x60))?; + // run gc on parent + repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; - // run gc on parent - repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?; + // Check that the data is still accessible on the branch. + assert_eq!( + newtline.get(*TEST_KEY, Lsn(0x50))?, + TEST_IMG(&format!("foo at {}", Lsn(0x40))) + ); - // Check that the data is still accessible on the branch. - assert_eq!( - newtline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x50))?, - TEST_IMG(&format!("foo blk 0 at {}", Lsn(0x40))) - ); - - Ok(()) - } - - */ + Ok(()) + } } From 356f716d390296c2d4d6f32e4244101998228540 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 8 Mar 2022 23:19:14 +0200 Subject: [PATCH 06/55] Fixes --- pageserver/src/branches.rs | 4 ++-- pageserver/src/import_datadir.rs | 4 ++-- pageserver/src/layered_repository.rs | 14 +++++++++----- pageserver/src/pgdatadir_mapping.rs | 8 +++++--- pageserver/src/walreceiver.rs | 2 +- 5 files changed, 19 insertions(+), 13 deletions(-) diff --git a/pageserver/src/branches.rs b/pageserver/src/branches.rs index e9280f3de7..246d012e8d 100644 --- a/pageserver/src/branches.rs +++ b/pageserver/src/branches.rs @@ -242,9 +242,9 @@ fn bootstrap_timeline( // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline. let timeline = repo.create_empty_timeline(tli, lsn)?; - let page_tline: DatadirTimeline = DatadirTimeline::new(timeline); + let mut page_tline: DatadirTimeline = DatadirTimeline::new(timeline); - import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &page_tline, lsn)?; + import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?; page_tline.tline.checkpoint(CheckpointConfig::Forced)?; println!( diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 5653c9a7ad..e2c646ae2a 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -30,7 +30,7 @@ use zenith_utils::lsn::Lsn; /// cluster was not shut down cleanly. pub fn import_timeline_from_postgres_datadir( path: &Path, - tline: &DatadirTimeline, + tline: &mut DatadirTimeline, lsn: Lsn, ) -> Result<()> { let mut pg_control: Option = None; @@ -335,7 +335,7 @@ fn import_slru_file( /// 'startpoint' and 'endpoint' into the repository. fn import_wal( walpath: &Path, - tline: &DatadirTimeline, + tline: &mut DatadirTimeline, startpoint: Lsn, endpoint: Lsn, ) -> Result<()> { diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index cd1b723651..35a864f2a6 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1028,10 +1028,11 @@ impl LayeredTimeline { if prev_lsn <= cont_lsn { // Didn't make any progress in last iteration. Error out to avoid // getting stuck in the loop. 
- bail!("could not find layer with more data for key {} at LSN {}, request LSN {}", + bail!("could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", key, Lsn(cont_lsn.0 - 1), - request_lsn) + request_lsn, + timeline.ancestor_lsn) } prev_lsn = cont_lsn; } @@ -1047,7 +1048,7 @@ impl LayeredTimeline { // Recurse into ancestor if needed if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { - //info!("going into ancestor {}", timeline.ancestor_lsn); + info!("going into ancestor {}, cont_lsn is {}", timeline.ancestor_lsn, cont_lsn); let ancestor = timeline.get_ancestor_timeline()?; timeline_owned = ancestor; timeline = &*timeline_owned; @@ -1060,7 +1061,7 @@ impl LayeredTimeline { // Check the open and frozen in-memory layers first if let Some(open_layer) = &layers.open_layer { let start_lsn = open_layer.get_lsn_range().start; - if cont_lsn >= start_lsn { + if cont_lsn > start_lsn { //info!("CHECKING for {} at {} on open layer {}", reconstruct_state.key, reconstruct_state.lsn, open_layer.filename().display()); result = open_layer.get_value_reconstruct_data(key, open_layer.get_lsn_range().start..cont_lsn, reconstruct_state)?; cont_lsn = open_layer.get_lsn_range().start; @@ -1069,7 +1070,7 @@ impl LayeredTimeline { } if let Some(frozen_layer) = &layers.frozen_layer { let start_lsn = frozen_layer.get_lsn_range().start; - if cont_lsn >= start_lsn { + if cont_lsn > start_lsn { //info!("CHECKING for {} at {} on frozen layer {}", reconstruct_state.key, reconstruct_state.lsn, frozen_layer.filename().display()); result = frozen_layer.get_value_reconstruct_data(key, frozen_layer.get_lsn_range().start..cont_lsn, reconstruct_state)?; cont_lsn = frozen_layer.get_lsn_range().start; @@ -1594,6 +1595,9 @@ impl LayeredTimeline { fn gc(&self) -> Result { let now = Instant::now(); let mut result: GcResult = Default::default(); + if true { + return Ok(result); + } let disk_consistent_lsn = self.get_disk_consistent_lsn(); let _checkpoint_cs = self.checkpoint_cs.lock().unwrap(); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 1c1f37ef82..fd586495dc 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -22,6 +22,7 @@ use std::sync::{Arc, RwLockReadGuard}; use tracing::{debug, info, warn}; use zenith_utils::bin_ser::BeSer; use zenith_utils::lsn::{Lsn, RecordLsn}; +use zenith_utils::lsn::AtomicLsn; /// Block number within a relation or SRU. This matches PostgreSQL's BlockNumber type. 
pub type BlockNumber = u32; @@ -31,7 +32,7 @@ where R: Repository, { pub tline: Arc, - pub last_partitioning: Option, + pub last_partitioning: AtomicLsn, } #[derive(Debug, Serialize, Deserialize)] @@ -71,7 +72,7 @@ impl DatadirTimeline { pub fn new(tline: Arc) -> Self { DatadirTimeline { tline, - last_partitioning: None, + last_partitioning: AtomicLsn::new(0), } } @@ -692,7 +693,7 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { pub fn finish(self) -> Result<()> { let writer = self.tline.tline.writer(); - let last_partitioning = self.last_partitioning.unwrap_or(Lsn(0)); + let last_partitioning = self.last_partitioning.load(); for (key, value) in self.pending_updates { writer.put(key, self.lsn, value)?; @@ -707,6 +708,7 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { let mut partitioning = self.tline.collect_keyspace(self.lsn)?; partitioning.repartition(TARGET_FILE_SIZE_BYTES); self.tline.tline.hint_partitioning(partitioning)?; + self.tline.last_partitioning.store(self.lsn); } Ok(()) diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index fd318b9cb7..17771d0151 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -242,7 +242,7 @@ fn walreceiver_main( let startlsn = Lsn::from(xlog_data.wal_start()); let endlsn = startlsn + data.len() as u64; - trace!("received XLogData between {} and {}", startlsn, endlsn); + info!("received XLogData between {} and {}", startlsn, endlsn); waldecoder.feed_bytes(data); From e096c62494a7fc370dde4aacd95d4645e513b1a1 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 9 Mar 2022 01:26:36 +0200 Subject: [PATCH 07/55] Misc fixes and stuff --- pageserver/src/keyspace.rs | 1 + pageserver/src/layered_repository.rs | 56 +++++++++++-------- .../src/layered_repository/layer_map.rs | 8 +-- .../src/layered_repository/storage_layer.rs | 2 +- pageserver/src/relish.rs | 35 +++++++++++- pageserver/src/repository.rs | 2 +- 6 files changed, 72 insertions(+), 32 deletions(-) diff --git a/pageserver/src/keyspace.rs b/pageserver/src/keyspace.rs index 4e1ebdd32a..0ad0c404ff 100644 --- a/pageserver/src/keyspace.rs +++ b/pageserver/src/keyspace.rs @@ -41,6 +41,7 @@ impl KeyPartitioning { if range.start == accum.end { accum.end = range.end; } else { + assert!(range.start > accum.end); self.ranges.push(accum.clone()); *accum = range; } diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 35a864f2a6..a319563c33 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1011,6 +1011,8 @@ impl LayeredTimeline { let mut timeline_owned; let mut timeline = self; + let mut path: Vec<(ValueReconstructResult, Lsn, Arc)> = Vec::new(); + // 'prev_lsn' tracks the last LSN that we were at in our search. It's used // to check that each iteration make some progress, to break infinite // looping if something goes wrong. @@ -1028,6 +1030,9 @@ impl LayeredTimeline { if prev_lsn <= cont_lsn { // Didn't make any progress in last iteration. Error out to avoid // getting stuck in the loop. 
+ for (r, c, l) in path { + error!("PATH: result {:?}, cont_lsn {}, layer: {}", r, c, l.filename().display()); + } bail!("could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", key, Lsn(cont_lsn.0 - 1), @@ -1062,18 +1067,20 @@ impl LayeredTimeline { if let Some(open_layer) = &layers.open_layer { let start_lsn = open_layer.get_lsn_range().start; if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on open layer {}", reconstruct_state.key, reconstruct_state.lsn, open_layer.filename().display()); + //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); result = open_layer.get_value_reconstruct_data(key, open_layer.get_lsn_range().start..cont_lsn, reconstruct_state)?; - cont_lsn = open_layer.get_lsn_range().start; + cont_lsn = start_lsn; + path.push((result, cont_lsn, open_layer.clone())); continue; } } if let Some(frozen_layer) = &layers.frozen_layer { let start_lsn = frozen_layer.get_lsn_range().start; if cont_lsn > start_lsn { - //info!("CHECKING for {} at {} on frozen layer {}", reconstruct_state.key, reconstruct_state.lsn, frozen_layer.filename().display()); + //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); result = frozen_layer.get_value_reconstruct_data(key, frozen_layer.get_lsn_range().start..cont_lsn, reconstruct_state)?; - cont_lsn = frozen_layer.get_lsn_range().start; + cont_lsn = start_lsn; + path.push((result, cont_lsn, frozen_layer.clone())); continue; } } @@ -1081,10 +1088,11 @@ impl LayeredTimeline { if let Some(SearchResult { lsn_floor, layer }) = layers .search(key, cont_lsn)? { - //info!("CHECKING for {} at {} on historic layer {}", reconstruct_state.key, reconstruct_state.lsn, layer.filename().display()); + //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display()); result = layer.get_value_reconstruct_data(key, lsn_floor..cont_lsn, reconstruct_state)?; cont_lsn = lsn_floor; + path.push((result, cont_lsn, layer)); } else if self.ancestor_timeline.is_some() { // Nothing on this timeline. Traverse to parent result = ValueReconstructResult::Continue; @@ -1247,10 +1255,7 @@ impl LayeredTimeline { // TODO: the threshold for how often we create image layers is // currently hard-coded at 3. It means, write out a new image layer, // if there are at least three delta layers on top of it. - if false { - self.compact(TARGET_FILE_SIZE_BYTES as usize)?; - } - //self.compact_level0()?; + self.compact(TARGET_FILE_SIZE_BYTES as usize)?; } // TODO: We should also compact existing delta layers here. 
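The traversal above is the heart of the read path: each iteration consults one layer, which either completes the value reconstruction or lowers `cont_lsn`, and once the search sinks past the branch point it hops to the ancestor timeline; the `path` vector only records where the search has been so a stuck lookup can be reported. A condensed sketch of that control flow, with toy types standing in for the real timeline and layer objects (all names here are illustrative, not the actual API):

    // Illustrative skeleton of get_reconstruct_data: walk layers from
    // newest to oldest, dropping cont_lsn as each layer is consumed, and
    // recurse into the ancestor timeline once past the branch point.
    type Lsn = u64;

    #[derive(Clone, Copy)]
    enum Outcome {
        Complete, // found a full image; reconstruction can stop
        Continue, // need older data below this layer
    }

    struct ToyLayer {
        start_lsn: Lsn,
        outcome: Outcome,
    }

    struct ToyTimeline {
        ancestor: Option<Box<ToyTimeline>>,
        ancestor_lsn: Lsn,
        layers: Vec<ToyLayer>,
    }

    fn get_reconstruct_data(mut tline: &ToyTimeline, mut cont_lsn: Lsn) -> Result<(), String> {
        let mut prev_lsn = Lsn::MAX;
        loop {
            // Every iteration must make progress, or the lookup is stuck.
            if prev_lsn <= cont_lsn {
                return Err(format!("no layer with more data below LSN {}", cont_lsn));
            }
            prev_lsn = cont_lsn;

            // Past the branch point: continue the search on the parent.
            if cont_lsn <= tline.ancestor_lsn + 1 {
                match &tline.ancestor {
                    Some(parent) => {
                        tline = parent.as_ref();
                        prev_lsn = Lsn::MAX; // reset the progress check
                        continue;
                    }
                    None => return Err("reached root timeline without an image".into()),
                }
            }

            // Consult the newest layer holding data below cont_lsn
            // (cont_lsn is end-exclusive, hence the strict comparison).
            let newest = tline
                .layers
                .iter()
                .filter(|l| l.start_lsn < cont_lsn)
                .max_by_key(|l| l.start_lsn);
            match newest {
                Some(layer) => match layer.outcome {
                    Outcome::Complete => return Ok(()),
                    Outcome::Continue => cont_lsn = layer.start_lsn,
                },
                // Nothing left on this timeline; fall through to the ancestor.
                None => cont_lsn = tline.ancestor_lsn + 1,
            }
        }
    }
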
@@ -1595,9 +1600,6 @@ impl LayeredTimeline { fn gc(&self) -> Result { let now = Instant::now(); let mut result: GcResult = Default::default(); - if true { - return Ok(result); - } let disk_consistent_lsn = self.get_disk_consistent_lsn(); let _checkpoint_cs = self.checkpoint_cs.lock().unwrap(); @@ -2001,15 +2003,16 @@ mod tests { const NUM_KEYS: usize = 1000; - let mut lsn = Lsn(0x10); - let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap(); - let mut blknum = 0; let mut parts = KeyPartitioning::new(); - for _ in 0..NUM_KEYS { - test_key.field6 = blknum; + let mut updated = [Lsn(0); NUM_KEYS]; + + let mut lsn = Lsn(0); + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = blknum as u32; let writer = tline.writer(); writer.put( test_key, @@ -2017,12 +2020,10 @@ mod tests { Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), )?; writer.advance_last_record_lsn(lsn); + updated[blknum] = lsn; drop(writer); parts.add_key(test_key); - - lsn = Lsn(lsn.0 + 0x10); - blknum += 1; } parts.repartition(TEST_FILE_SIZE as u64); @@ -2030,20 +2031,27 @@ mod tests { for _ in 0..50 { for _ in 0..NUM_KEYS { - blknum = thread_rng().gen_range(0..NUM_KEYS) as u32; - test_key.field6 = blknum; + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = blknum as u32; let writer = tline.writer(); writer.put( test_key, lsn, Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), )?; + println!("updating {} at {}", blknum, lsn); writer.advance_last_record_lsn(lsn); drop(writer); - - lsn = Lsn(lsn.0 + 0x10); + updated[blknum] = lsn; } + for blknum in 0..NUM_KEYS { + test_key.field6 = blknum as u32; + assert_eq!(tline.get(test_key, lsn)?, TEST_IMG(&format!("{} at {}", blknum, updated[blknum]))); + } + println!("checkpointing {}", lsn); + let cutoff = tline.get_last_record_lsn(); tline.update_gc_info(Vec::new(), cutoff); tline.checkpoint(CheckpointConfig::Forced)?; diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index 1ad6351539..4d2af960a6 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -141,11 +141,9 @@ impl LayerMap { key, end_lsn ); - let lsn_floor = if let Some(latest_img_lsn) = latest_img_lsn { - Lsn(latest_img_lsn.0 + 1) - } else { - l.get_lsn_range().start - }; + let lsn_floor = std::cmp::max( + Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1), + l.get_lsn_range().start); Ok(Some(SearchResult { lsn_floor, layer: l, diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index 7ddaf7a581..76795e1bb0 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -56,7 +56,7 @@ pub struct ValueReconstructState { } /// Return value from Layer::get_page_reconstruct_data -#[derive(Debug)] +#[derive(Clone, Copy, Debug)] pub enum ValueReconstructResult { /// Got all the data needed to reconstruct the requested page Complete, diff --git a/pageserver/src/relish.rs b/pageserver/src/relish.rs index 7304c41a0e..f775ce933a 100644 --- a/pageserver/src/relish.rs +++ b/pageserver/src/relish.rs @@ -26,6 +26,7 @@ //! use serde::{Deserialize, Serialize}; +use std::cmp::Ordering; use std::fmt; use postgres_ffi::relfile_utils::forknumber_to_name; @@ -46,7 +47,10 @@ use postgres_ffi::Oid; /// are used for the same purpose. 
/// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57). /// -#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)] +// FIXME: should move 'forknum' as last field to keep this consistent with Postgres. +// Then we could replace the custo Ord and PartialOrd implementations below with +// deriving them. +#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)] pub struct RelTag { pub forknum: u8, pub spcnode: Oid, @@ -54,6 +58,35 @@ pub struct RelTag { pub relnode: Oid, } +impl PartialOrd for RelTag { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for RelTag { + fn cmp(&self, other: &Self) -> Ordering { + let mut cmp; + + cmp = self.spcnode.cmp(&other.spcnode); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.dbnode.cmp(&other.dbnode); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.relnode.cmp(&other.relnode); + if cmp != Ordering::Equal { + return cmp; + } + cmp = self.forknum.cmp(&other.forknum); + + cmp + } +} + + /// Display RelTag in the same format that's used in most PostgreSQL debug messages: /// /// //[_fsm|_vm|_init] diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 1abf2c365a..d8f68aaa6c 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -427,7 +427,7 @@ pub mod repo_harness { pub fn TEST_IMG(s: &str) -> Bytes { let mut buf = BytesMut::new(); buf.extend_from_slice(s.as_bytes()); - buf.resize(8192, 0); + buf.resize(64, 0); buf.freeze() } From 2896d35a8b51700adb580cac2fbf33093a2ecd69 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 9 Mar 2022 15:24:23 +0200 Subject: [PATCH 08/55] rustfmt and clippy fixes --- pageserver/src/keyspace.rs | 17 ++-- pageserver/src/layered_repository.rs | 88 +++++++++++++------ .../src/layered_repository/delta_layer.rs | 25 +++--- .../src/layered_repository/image_layer.rs | 2 +- .../src/layered_repository/layer_map.rs | 30 +++---- .../src/layered_repository/storage_layer.rs | 2 +- pageserver/src/pgdatadir_mapping.rs | 26 ++++-- pageserver/src/relish.rs | 1 - pageserver/src/repository.rs | 55 +++++++----- pageserver/src/walingest.rs | 4 +- 10 files changed, 155 insertions(+), 95 deletions(-) diff --git a/pageserver/src/keyspace.rs b/pageserver/src/keyspace.rs index 0ad0c404ff..bef2fe62a2 100644 --- a/pageserver/src/keyspace.rs +++ b/pageserver/src/keyspace.rs @@ -1,6 +1,6 @@ use std::ops::Range; -use crate::repository::{Key, key_range_size, singleton_range}; +use crate::repository::{key_range_size, singleton_range, Key}; use postgres_ffi::pg_constants; @@ -22,7 +22,6 @@ pub struct KeyPartitioning { } impl KeyPartitioning { - pub fn new() -> Self { KeyPartitioning { accum: None, @@ -45,7 +44,7 @@ impl KeyPartitioning { self.ranges.push(accum.clone()); *accum = range; } - }, + } None => self.accum = Some(range), } } @@ -61,11 +60,9 @@ impl KeyPartitioning { let mut current_part = Vec::new(); let mut current_part_size: usize = 0; for range in &self.ranges { - let this_size = key_range_size(&range) as usize; + let this_size = key_range_size(range) as usize; - if current_part_size + this_size > target_nblocks && - !current_part.is_empty() - { + if current_part_size + this_size > target_nblocks && !current_part.is_empty() { self.partitions.push(current_part); current_part = Vec::new(); current_part_size = 0; @@ -87,3 +84,9 @@ impl KeyPartitioning { } } } + +impl 
Default for KeyPartitioning { + fn default() -> Self { + Self::new() + } +} diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index a319563c33..49298ff853 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -19,7 +19,7 @@ use itertools::Itertools; use lazy_static::lazy_static; use tracing::*; -use std::cmp::{min, max, Ordering}; +use std::cmp::{max, min, Ordering}; use std::collections::hash_map::Entry; use std::collections::BTreeSet; use std::collections::HashMap; @@ -1006,7 +1006,12 @@ impl LayeredTimeline { /// /// This function takes the current timeline's locked LayerMap as an argument, /// so callers can avoid potential race conditions. - fn get_reconstruct_data(&self, key: Key, request_lsn: Lsn, reconstruct_state: &mut ValueReconstructState) -> Result<()> { + fn get_reconstruct_data( + &self, + key: Key, + request_lsn: Lsn, + reconstruct_state: &mut ValueReconstructState, + ) -> Result<()> { // Start from the current timeline. let mut timeline_owned; let mut timeline = self; @@ -1031,7 +1036,12 @@ impl LayeredTimeline { // Didn't make any progress in last iteration. Error out to avoid // getting stuck in the loop. for (r, c, l) in path { - error!("PATH: result {:?}, cont_lsn {}, layer: {}", r, c, l.filename().display()); + error!( + "PATH: result {:?}, cont_lsn {}, layer: {}", + r, + c, + l.filename().display() + ); } bail!("could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}", key, @@ -1053,7 +1063,11 @@ impl LayeredTimeline { // Recurse into ancestor if needed if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { - info!("going into ancestor {}, cont_lsn is {}", timeline.ancestor_lsn, cont_lsn); + info!( + "going into ancestor {}, cont_lsn is {}", + timeline.ancestor_lsn, + cont_lsn + ); let ancestor = timeline.get_ancestor_timeline()?; timeline_owned = ancestor; timeline = &*timeline_owned; @@ -1068,7 +1082,11 @@ impl LayeredTimeline { let start_lsn = open_layer.get_lsn_range().start; if cont_lsn > start_lsn { //info!("CHECKING for {} at {} on open layer {}", key, cont_lsn, open_layer.filename().display()); - result = open_layer.get_value_reconstruct_data(key, open_layer.get_lsn_range().start..cont_lsn, reconstruct_state)?; + result = open_layer.get_value_reconstruct_data( + key, + open_layer.get_lsn_range().start..cont_lsn, + reconstruct_state, + )?; cont_lsn = start_lsn; path.push((result, cont_lsn, open_layer.clone())); continue; @@ -1078,19 +1096,25 @@ impl LayeredTimeline { let start_lsn = frozen_layer.get_lsn_range().start; if cont_lsn > start_lsn { //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); - result = frozen_layer.get_value_reconstruct_data(key, frozen_layer.get_lsn_range().start..cont_lsn, reconstruct_state)?; + result = frozen_layer.get_value_reconstruct_data( + key, + frozen_layer.get_lsn_range().start..cont_lsn, + reconstruct_state, + )?; cont_lsn = start_lsn; path.push((result, cont_lsn, frozen_layer.clone())); continue; } } - if let Some(SearchResult { lsn_floor, layer }) = layers - .search(key, cont_lsn)? - { + if let Some(SearchResult { lsn_floor, layer }) = layers.search(key, cont_lsn)? 
{ //info!("CHECKING for {} at {} on historic layer {}", key, cont_lsn, layer.filename().display()); - result = layer.get_value_reconstruct_data(key, lsn_floor..cont_lsn, reconstruct_state)?; + result = layer.get_value_reconstruct_data( + key, + lsn_floor..cont_lsn, + reconstruct_state, + )?; cont_lsn = lsn_floor; path.push((result, cont_lsn, layer)); } else if self.ancestor_timeline.is_some() { @@ -1427,11 +1451,16 @@ impl LayeredTimeline { } // Is it time to create a new image layer for the given partition? - fn time_for_new_image_layer(&self, partition: &Vec>, lsn: Lsn, threshold: usize) -> Result { + fn time_for_new_image_layer( + &self, + partition: &[Range], + lsn: Lsn, + threshold: usize, + ) -> Result { let layers = self.layers.lock().unwrap(); for part_range in partition { - let image_coverage = layers.image_coverage(&part_range, lsn)?; + let image_coverage = layers.image_coverage(part_range, lsn)?; for (img_range, last_img) in image_coverage { let img_lsn = if let Some(ref last_img) = last_img { last_img.get_lsn_range().end @@ -1454,7 +1483,7 @@ impl LayeredTimeline { Ok(false) } - fn create_image_layer(&self, partition: &Vec>, lsn: Lsn) -> Result<()> { + fn create_image_layer(&self, partition: &[Range], lsn: Lsn) -> Result<()> { let img_range = partition.first().unwrap().start..partition.last().unwrap().end; let mut image_layer_writer = ImageLayerWriter::new(self.conf, self.timelineid, self.tenantid, &img_range, lsn)?; @@ -1492,18 +1521,18 @@ impl LayeredTimeline { // FIXME: this function probably won't work correctly if there's overlap // in the deltas. - let lsn_range = level0_deltas.iter().map(|l| l.get_lsn_range()).reduce(|a, b| { - min(a.start, b.start)..max(a.end, b.end) - }).unwrap(); + let lsn_range = level0_deltas + .iter() + .map(|l| l.get_lsn_range()) + .reduce(|a, b| min(a.start, b.start)..max(a.end, b.end)) + .unwrap(); let all_values_iter = level0_deltas.iter().map(|l| l.iter()).kmerge_by(|a, b| { if let Ok((a_key, a_lsn, _)) = a { if let Ok((b_key, b_lsn, _)) = b { match a_key.cmp(b_key) { Ordering::Less => true, - Ordering::Equal => { - a_lsn <= b_lsn - } + Ordering::Equal => a_lsn <= b_lsn, Ordering::Greater => false, } } else { @@ -1678,9 +1707,15 @@ impl LayeredTimeline { // OK for a delta layer to have end LSN 101, but if the end LSN // is 102, then it might not have been fully flushed to disk // before crash. - if !layers.newer_image_layer_exists(&l.get_key_range(), l.get_lsn_range().end, disk_consistent_lsn+1)? - { - info!("keeping {} because it is the latest layer", l.filename().display()); + if !layers.newer_image_layer_exists( + &l.get_key_range(), + l.get_lsn_range().end, + disk_consistent_lsn + 1, + )? { + info!( + "keeping {} because it is the latest layer", + l.filename().display() + ); result.layers_not_updated += 1; continue 'outer; } @@ -1691,7 +1726,7 @@ impl LayeredTimeline { l.filename().display(), l.is_incremental(), ); - layers_to_remove.push(Arc::clone(&l)); + layers_to_remove.push(Arc::clone(l)); } // Actually delete the layers from disk and remove them from the map. 
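The search loop in get_reconstruct_data() above is the heart of the read path. As a rough, self-contained sketch of the same idea — MockLayer and Record are hypothetical stand-ins, a single timeline with no ancestor recursion — the traversal looks like this:

#[derive(Debug)]
enum Record {
    Image(&'static str), // a full page image
    Delta(&'static str), // an incremental WAL record
}

struct MockLayer {
    lsn_start: u64,
    records: Vec<(u64, Record)>, // (lsn, record), sorted by LSN
}

// Walk backwards from `request_lsn`, collecting deltas until an image is found.
fn reconstruct(layers: &[MockLayer], request_lsn: u64) -> Option<(String, Vec<String>)> {
    let mut cont_lsn = request_lsn + 1; // exclusive upper bound of the next search
    let mut deltas = Vec::new();
    loop {
        let prev_lsn = cont_lsn;
        // Pick the newest layer that still holds data below cont_lsn.
        if let Some(layer) = layers.iter().rev().find(|l| l.lsn_start < cont_lsn) {
            for (lsn, rec) in layer.records.iter().rev() {
                if *lsn >= cont_lsn {
                    continue;
                }
                match rec {
                    // A full image completes the reconstruction.
                    Record::Image(img) => return Some((img.to_string(), deltas)),
                    // A delta is collected; keep searching older data.
                    Record::Delta(d) => deltas.push(d.to_string()),
                }
            }
            cont_lsn = layer.lsn_start;
        }
        if cont_lsn == prev_lsn {
            // No progress: no layer holds older data for this key.
            // The real code bails out with an error here instead of looping.
            return None;
        }
    }
}

fn main() {
    let layers = vec![
        MockLayer { lsn_start: 0, records: vec![(10, Record::Image("img@10"))] },
        MockLayer { lsn_start: 20, records: vec![(20, Record::Delta("wal@20"))] },
    ];
    // A read at LSN 25 picks up wal@20, then finds img@10 underneath.
    let (img, deltas) = reconstruct(&layers, 25).unwrap();
    assert_eq!((img.as_str(), deltas.len()), ("img@10", 1));
}

The real loop additionally records every layer it visits in `path`, which is what produces the "PATH: result ..." error dump above when an iteration makes no progress.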
@@ -2046,9 +2081,12 @@ mod tests { updated[blknum] = lsn; } - for blknum in 0..NUM_KEYS { + for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; - assert_eq!(tline.get(test_key, lsn)?, TEST_IMG(&format!("{} at {}", blknum, updated[blknum]))); + assert_eq!( + tline.get(test_key, lsn)?, + TEST_IMG(&format!("{} at {}", blknum, last_lsn)) + ); } println!("checkpointing {}", lsn); diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 3c5acbc53a..d8ecf7d75e 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -237,7 +237,7 @@ impl Layer for DeltaLayer { match DeltaValueIter::new(inner) { Ok(iter) => Box::new(iter), - Err(err) => Box::new(std::iter::once(Err(err))) + Err(err) => Box::new(std::iter::once(Err(err))), } } @@ -507,12 +507,12 @@ impl DeltaLayerWriter { // Note: This overwrites any existing file. There shouldn't be any. // FIXME: throw an error instead? - let path = conf - .timeline_path(&timelineid, &tenantid) - .join(format!("{}-XXX__{:016X}-{:016X}.temp", - key_start, - u64::from(lsn_range.start), - u64::from(lsn_range.end))); + let path = conf.timeline_path(&timelineid, &tenantid).join(format!( + "{}-XXX__{:016X}-{:016X}.temp", + key_start, + u64::from(lsn_range.start), + u64::from(lsn_range.end) + )); info!("temp deltalayer path {}", path.display()); let file = VirtualFile::create(&path)?; let buf_writer = BufWriter::new(file); @@ -632,7 +632,7 @@ impl DeltaLayerWriter { pub fn abort(self) { match self.values_writer.close() { - Ok(book) => { + Ok(book) => { if let Err(err) = book.close() { error!("error while closing delta layer file: {}", err); } @@ -650,7 +650,7 @@ impl DeltaLayerWriter { struct DeltaValueIter<'a> { all_offsets: Vec<(Key, Lsn, u64)>, next_idx: usize, - + inner: RwLockReadGuard<'a, DeltaLayerInner>, } @@ -671,7 +671,6 @@ impl<'a> Iterator for DeltaValueIter<'a> { /// impl<'a> DeltaValueIter<'a> { fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result { - let mut index: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); index.sort_by_key(|x| x.0); @@ -693,12 +692,13 @@ impl<'a> DeltaValueIter<'a> { if self.next_idx < self.all_offsets.len() { let (key, lsn, off) = self.all_offsets[self.next_idx]; - let values_reader = self.inner + let values_reader = self + .inner .book .as_ref() .expect("should be loaded in load call above") .chapter_reader(VALUES_CHAPTER)?; - + let val = Value::des(&utils::read_blob_from_chapter(&values_reader, off)?)?; self.next_idx += 1; @@ -708,4 +708,3 @@ impl<'a> DeltaValueIter<'a> { } } } - diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index a8772632ab..e0cb2d8d02 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -184,7 +184,7 @@ impl Layer for ImageLayer { fn iter(&self) -> Box>> { todo!(); } - + fn unload(&self) -> Result<()> { // TODO: unload 'segs'. Or even better, don't hold it in memory but // access it directly from the file (using the buffer cache) diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index 4d2af960a6..27a3eb279a 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -10,8 +10,8 @@ //! corresponding files are written to disk. //! 
-use crate::layered_repository::storage_layer::{range_eq, range_overlaps}; use crate::layered_repository::storage_layer::Layer; +use crate::layered_repository::storage_layer::{range_eq, range_overlaps}; use crate::layered_repository::InMemoryLayer; use crate::repository::Key; use anyhow::Result; @@ -143,13 +143,18 @@ impl LayerMap { ); let lsn_floor = std::cmp::max( Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1), - l.get_lsn_range().start); + l.get_lsn_range().start, + ); Ok(Some(SearchResult { lsn_floor, layer: l, })) } else if let Some(l) = latest_img { - trace!("found img layer and no deltas for request on {} at {}", key, end_lsn); + trace!( + "found img layer and no deltas for request on {} at {}", + key, + end_lsn + ); Ok(Some(SearchResult { lsn_floor: latest_img_lsn.unwrap(), layer: l, @@ -202,7 +207,6 @@ impl LayerMap { lsn: Lsn, disk_consistent_lsn: Lsn, ) -> Result { - let mut range_remain = key_range.clone(); loop { @@ -212,10 +216,10 @@ impl LayerMap { continue; } let img_lsn = l.get_lsn_range().start; - if !l.is_incremental() && - l.get_key_range().contains(&range_remain.start) && - img_lsn > lsn && - img_lsn < disk_consistent_lsn + if !l.is_incremental() + && l.get_key_range().contains(&range_remain.start) + && img_lsn > lsn + && img_lsn < disk_consistent_lsn { made_progress = true; let img_key_end = l.get_key_range().end; @@ -329,11 +333,7 @@ impl LayerMap { Ok(ranges) } - pub fn count_deltas( - &self, - key_range: &Range, - lsn_range: &Range, - ) -> Result { + pub fn count_deltas(&self, key_range: &Range, lsn_range: &Range) -> Result { let mut result = 0; for l in self.historic_layers.iter() { if !l.is_incremental() { @@ -348,8 +348,8 @@ impl LayerMap { // We ignore level0 delta layers. Unless the whole keyspace fits // into one partition - if !range_eq(key_range, &(Key::MIN..Key::MAX)) && - range_eq(&l.get_key_range(), &(Key::MIN..Key::MAX)) + if !range_eq(key_range, &(Key::MIN..Key::MAX)) + && range_eq(&l.get_key_range(), &(Key::MIN..Key::MAX)) { continue; } diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index 76795e1bb0..d508322a81 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -130,7 +130,7 @@ pub trait Layer: Send + Sync { fn is_in_memory(&self) -> bool; fn iter(&self) -> Box> + '_>; - + /// Return a set of all distinct Keys present in this layer fn collect_keys(&self, key_range: &Range, keys: &mut HashSet) -> Result<()>; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index fd586495dc..50963a44cd 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -21,8 +21,8 @@ use std::ops::Range; use std::sync::{Arc, RwLockReadGuard}; use tracing::{debug, info, warn}; use zenith_utils::bin_ser::BeSer; -use zenith_utils::lsn::{Lsn, RecordLsn}; use zenith_utils::lsn::AtomicLsn; +use zenith_utils::lsn::{Lsn, RecordLsn}; /// Block number within a relation or SRU. This matches PostgreSQL's BlockNumber type. pub type BlockNumber = u32; @@ -311,11 +311,15 @@ impl DatadirTimeline { let dbdir = DbDirectory::des(&buf)?; let mut dbs: Vec<(Oid, Oid)> = dbdir.dbs.iter().cloned().collect(); - dbs.sort(); + dbs.sort_unstable(); for (spcnode, dbnode) in dbs { result.add_key(relmap_file_key(spcnode, dbnode)); - let mut rels: Vec = self.list_rels(spcnode, dbnode, lsn)?.iter().cloned().collect(); - rels.sort(); + let mut rels: Vec = self + .list_rels(spcnode, dbnode, lsn)? 
+ .iter() + .cloned() + .collect(); + rels.sort_unstable(); for rel in rels { let relsize_key = rel_size_to_key(rel); let mut buf = self.tline.get(relsize_key, lsn)?; @@ -327,18 +331,24 @@ impl DatadirTimeline { } // Iterate SLRUs next - for kind in [SlruKind::Clog, SlruKind:: MultiXactMembers, SlruKind::MultiXactOffsets] { + for kind in [ + SlruKind::Clog, + SlruKind::MultiXactMembers, + SlruKind::MultiXactOffsets, + ] { let slrudir_key = slru_dir_to_key(kind); let buf = self.tline.get(slrudir_key, lsn)?; let dir = SlruSegmentDirectory::des(&buf)?; let mut segments: Vec = dir.segments.iter().cloned().collect(); - segments.sort(); + segments.sort_unstable(); for segno in segments { let segsize_key = slru_segment_size_to_key(kind, segno); let mut buf = self.tline.get(segsize_key, lsn)?; let segsize = buf.get_u32_le(); - result.add_range(slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize)); + result.add_range( + slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize), + ); result.add_key(segsize_key); } } @@ -347,7 +357,7 @@ impl DatadirTimeline { let buf = self.tline.get(TWOPHASEDIR_KEY, lsn)?; let twophase_dir = TwoPhaseDirectory::des(&buf)?; let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); - xids.sort(); + xids.sort_unstable(); for xid in xids { result.add_key(twophase_file_key(xid)); } diff --git a/pageserver/src/relish.rs b/pageserver/src/relish.rs index f775ce933a..521e07e50f 100644 --- a/pageserver/src/relish.rs +++ b/pageserver/src/relish.rs @@ -86,7 +86,6 @@ impl Ord for RelTag { } } - /// Display RelTag in the same format that's used in most PostgreSQL debug messages: /// /// //[_fsm|_vm|_init] diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index d8f68aaa6c..c92dff661a 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -26,13 +26,12 @@ pub struct Key { } impl Key { - pub fn next(&self) -> Key { self.add(1) } - + pub fn add(&self, x: u32) -> Key { - let mut key = self.clone(); + let mut key = *self; let r = key.field6.overflowing_add(x); key.field6 = r.0; @@ -72,16 +71,14 @@ impl Key { } } - - pub fn key_range_size(key_range: &Range) -> u32 { let start = key_range.start; let end = key_range.end; - if end.field1 != start.field1 || - end.field2 != start.field2 || - end.field3 != start.field3 || - end.field4 != start.field4 + if end.field1 != start.field1 + || end.field2 != start.field2 + || end.field3 != start.field3 + || end.field4 != start.field4 { return u32::MAX; } @@ -630,20 +627,36 @@ mod tests { { let writer = tline.writer(); // Create a relation on the timeline - writer.put(*TEST_KEY, lsn, Value::Image(TEST_IMG(&format!("foo at {}", lsn))))?; + writer.put( + *TEST_KEY, + lsn, + Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + )?; writer.advance_last_record_lsn(lsn); lsn += 0x10; - writer.put(*TEST_KEY, lsn, Value::Image(TEST_IMG(&format!("foo at {}", lsn))))?; + writer.put( + *TEST_KEY, + lsn, + Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + )?; writer.advance_last_record_lsn(lsn); lsn += 0x10; } tline.checkpoint(CheckpointConfig::Forced)?; { let writer = tline.writer(); - writer.put(*TEST_KEY, lsn, Value::Image(TEST_IMG(&format!("foo at {}", lsn))))?; + writer.put( + *TEST_KEY, + lsn, + Value::Image(TEST_IMG(&format!("foo at {}", lsn))), + )?; writer.advance_last_record_lsn(lsn); lsn += 0x10; - writer.put(*TEST_KEY, lsn, Value::Image(TEST_IMG(&format!("foo at {}", lsn))))?; + writer.put( + *TEST_KEY, + lsn, + Value::Image(TEST_IMG(&format!("foo at {}", 
lsn))), + )?; writer.advance_last_record_lsn(lsn); } tline.checkpoint(CheckpointConfig::Forced) @@ -668,10 +681,10 @@ mod tests { Err(err) => { assert!(err.to_string().contains("invalid branch start lsn")); assert!(err - .source() - .unwrap() - .to_string() - .contains("we might've already garbage collected needed data")) + .source() + .unwrap() + .to_string() + .contains("we might've already garbage collected needed data")) } } @@ -689,10 +702,10 @@ mod tests { Err(err) => { assert!(&err.to_string().contains("invalid branch start lsn")); assert!(&err - .source() - .unwrap() - .to_string() - .contains("is earlier than latest GC horizon")); + .source() + .unwrap() + .to_string() + .contains("is earlier than latest GC horizon")); } } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 92133a3fd7..0051bf5361 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -996,9 +996,7 @@ mod tests { static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - fn init_walingest_test<'a, R: Repository>( - tline: &'a DatadirTimeline, - ) -> Result> { + fn init_walingest_test(tline: &DatadirTimeline) -> Result> { let mut writer = tline.begin_record(Lsn(0x10)); writer.put_checkpoint(ZERO_CHECKPOINT.clone())?; writer.finish()?; From 92d1322cd5e6d1c5168c40ea6eadbd81085331f0 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 9 Mar 2022 15:51:57 +0200 Subject: [PATCH 09/55] comments, other cleanup --- pageserver/src/layered_repository.rs | 10 +++++++--- pageserver/src/repository.rs | 21 +++++++++++---------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 49298ff853..43fb6c5c9a 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1986,8 +1986,8 @@ mod tests { } // - // Insert a bunch of key-value pairs with increasing keys, checkpoint, - // repeat 100 times. + // Insert 1000 key-value pairs with increasing keys, checkpoint, + // repeat 50 times. // #[test] fn test_bulk_insert() -> Result<()> { @@ -2042,6 +2042,8 @@ mod tests { let mut parts = KeyPartitioning::new(); + // Track when each page was last modified. Used to assert that + // a read sees the latest page version. let mut updated = [Lsn(0); NUM_KEYS]; let mut lsn = Lsn(0); @@ -2081,6 +2083,7 @@ mod tests { updated[blknum] = lsn; } + // Read all the blocks for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( @@ -2088,8 +2091,9 @@ mod tests { TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } - println!("checkpointing {}", lsn); + // Perform a cycle of checkpoint, compaction, and GC + println!("checkpointing {}", lsn); let cutoff = tline.get_last_record_lsn(); tline.update_gc_info(Vec::new(), cutoff); tline.checkpoint(CheckpointConfig::Forced)?; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index c92dff661a..05ff449d21 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -139,16 +139,6 @@ impl Key { field6: u32::from_str_radix(&s[28..36], 16)?, }) } - - pub fn to_prefix_128(&self) -> u128 { - assert!(self.field1 & 0xf0 == 0); - (self.field1 as u128) << 124 - | (self.field2 as u128) << 92 - | (self.field3 as u128) << 60 - | (self.field4 as u128) << 28 - | (self.field5 as u128) << 20 - | (self.field6 as u128) >> 12 - } } // @@ -365,6 +355,17 @@ pub trait Timeline: Send + Sync { /// know anything about them here in the repository. 
fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>; + /// + /// Tell the implementation how the keyspace should be partitioned. + /// + /// FIXME: This is quite a hack. The code in pgdatadir_mapping.rs knows + /// which keys exist and what is the logical grouping of them. That's why + /// the code there (and in keyspace.rs) decides the partitioning, not the + /// layered_repository.rs implementation. That's a layering violation: + /// the Repository implementation ought to be responsible for the physical + /// layout, but currently it's more convenient to do it in pgdatadir_mapping.rs + /// rather than in layered_repository.rs. + /// fn hint_partitioning(&self, partitioning: KeyPartitioning) -> Result<()>; /// From 98ec8418c463d179fe7e5f465aac3c44ee02c975 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 9 Mar 2022 16:55:07 +0200 Subject: [PATCH 10/55] Fix bug with the partitioning and GC --- pageserver/src/layered_repository.rs | 18 ++- pageserver/src/pgdatadir_mapping.rs | 4 +- pageserver/src/repository.rs | 2 +- test_runner/batch_others/test_snapfiles_gc.py | 131 ------------------ 4 files changed, 11 insertions(+), 144 deletions(-) delete mode 100644 test_runner/batch_others/test_snapfiles_gc.py diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 43fb6c5c9a..fd4d494d3b 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -742,7 +742,7 @@ pub struct LayeredTimeline { // garbage collecting data that is still needed by the child timelines. gc_info: RwLock, - partitioning: RwLock>, + partitioning: RwLock>, // It may change across major versions so for simplicity // keep it after running initdb for a timeline. @@ -859,8 +859,8 @@ impl Timeline for LayeredTimeline { self.disk_consistent_lsn.load() } - fn hint_partitioning(&self, partitioning: KeyPartitioning) -> Result<()> { - self.partitioning.write().unwrap().replace(partitioning); + fn hint_partitioning(&self, partitioning: KeyPartitioning, lsn: Lsn) -> Result<()> { + self.partitioning.write().unwrap().replace((partitioning, lsn)); Ok(()) } @@ -1429,17 +1429,15 @@ impl LayeredTimeline { // but they are a bit ad hoc and don't quite work like it's explained // above. Rewrite it. - let lsn = self.last_record_lsn.load().last; - // 1. The partitioning was already done by the code in // pgdatadir_mapping.rs. We just use it here. let partitioning = self.partitioning.read().unwrap(); - if let Some(partitioning) = partitioning.as_ref() { + if let Some((partitioning, lsn)) = partitioning.as_ref() { // 2. Create new image layers for partitions that have been modified // "enough". for partition in &partitioning.partitions { - if self.time_for_new_image_layer(partition, lsn, 3)? { - self.create_image_layer(partition, lsn)?; + if self.time_for_new_image_layer(partition, *lsn, 3)? { + self.create_image_layer(partition, *lsn)?; } } // 3. 
Compact @@ -2020,7 +2018,7 @@ mod tests { let cutoff = tline.get_last_record_lsn(); parts.repartition(TEST_FILE_SIZE as u64); - tline.hint_partitioning(parts.clone())?; + tline.hint_partitioning(parts.clone(), lsn)?; tline.update_gc_info(Vec::new(), cutoff); tline.checkpoint(CheckpointConfig::Forced)?; @@ -2064,7 +2062,7 @@ mod tests { } parts.repartition(TEST_FILE_SIZE as u64); - tline.hint_partitioning(parts)?; + tline.hint_partitioning(parts, lsn)?; for _ in 0..50 { for _ in 0..NUM_KEYS { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 50963a44cd..01f52e7a2d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -714,10 +714,10 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { writer.advance_last_record_lsn(self.lsn); - if self.lsn.0 - last_partitioning.0 > TARGET_FILE_SIZE_BYTES / 8 { + if last_partitioning == Lsn(0) || self.lsn.0 - last_partitioning.0 > TARGET_FILE_SIZE_BYTES / 8 { let mut partitioning = self.tline.collect_keyspace(self.lsn)?; partitioning.repartition(TARGET_FILE_SIZE_BYTES); - self.tline.tline.hint_partitioning(partitioning)?; + self.tline.tline.hint_partitioning(partitioning, self.lsn)?; self.tline.last_partitioning.store(self.lsn); } diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 05ff449d21..000e58f4ab 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -366,7 +366,7 @@ pub trait Timeline: Send + Sync { /// layout, but currently it's more convenient to do it in pgdatadir_mapping.rs /// rather than in layered_repository.rs. /// - fn hint_partitioning(&self, partitioning: KeyPartitioning) -> Result<()>; + fn hint_partitioning(&self, partitioning: KeyPartitioning, lsn: Lsn) -> Result<()>; /// /// Check that it is valid to request operations with that lsn. diff --git a/test_runner/batch_others/test_snapfiles_gc.py b/test_runner/batch_others/test_snapfiles_gc.py deleted file mode 100644 index c6d4512bc9..0000000000 --- a/test_runner/batch_others/test_snapfiles_gc.py +++ /dev/null @@ -1,131 +0,0 @@ -from contextlib import closing -import psycopg2.extras -import time -from fixtures.utils import print_gc_result -from fixtures.zenith_fixtures import ZenithEnv -from fixtures.log_helper import log - - -# -# Test Garbage Collection of old layer files -# -# This test is pretty tightly coupled with the current implementation of layered -# storage, in layered_repository.rs. -# -def test_layerfiles_gc(zenith_simple_env: ZenithEnv): - env = zenith_simple_env - env.zenith_cli.create_branch("test_layerfiles_gc", "empty") - pg = env.postgres.create_start('test_layerfiles_gc') - - with closing(pg.connect()) as conn: - with conn.cursor() as cur: - with closing(env.pageserver.connect()) as psconn: - with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur: - - # Get the timeline ID of our branch. We need it for the 'do_gc' command - cur.execute("SHOW zenith.zenith_timeline") - timeline = cur.fetchone()[0] - - # Create a test table - cur.execute("CREATE TABLE foo(x integer)") - cur.execute("INSERT INTO foo VALUES (1)") - - cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass") - row = cur.fetchone() - log.info(f"relfilenode is {row[0]}") - - # Run GC, to clear out any garbage left behind in the catalogs by - # the CREATE TABLE command. We want to have a clean slate with no garbage - # before running the actual tests below, otherwise the counts won't match - # what we expect. 
- # - # Also run vacuum first to make it less likely that autovacuum or pruning - # kicks in and confuses our numbers. - cur.execute("VACUUM") - - # delete the row, to update the Visibility Map. We don't want the VM - # update to confuse our numbers either. - cur.execute("DELETE FROM foo") - - log.info("Running GC before test") - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) - # remember the number of files - layer_relfiles_remain = (row['layer_relfiles_total'] - - row['layer_relfiles_removed']) - assert layer_relfiles_remain > 0 - - # Insert a row and run GC. Checkpoint should freeze the layer - # so that there is only the most recent image layer left for the rel, - # removing the old image and delta layer. - log.info("Inserting one row and running GC") - cur.execute("INSERT INTO foo VALUES (1)") - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) - assert row['layer_relfiles_total'] == layer_relfiles_remain + 2 - assert row['layer_relfiles_removed'] == 2 - assert row['layer_relfiles_dropped'] == 0 - - # Insert two more rows and run GC. - # This should create new image and delta layer file with the new contents, and - # then remove the old one image and the just-created delta layer. - log.info("Inserting two more rows and running GC") - cur.execute("INSERT INTO foo VALUES (2)") - cur.execute("INSERT INTO foo VALUES (3)") - - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) - assert row['layer_relfiles_total'] == layer_relfiles_remain + 2 - assert row['layer_relfiles_removed'] == 2 - assert row['layer_relfiles_dropped'] == 0 - - # Do it again. Should again create two new layer files and remove old ones. - log.info("Inserting two more rows and running GC") - cur.execute("INSERT INTO foo VALUES (2)") - cur.execute("INSERT INTO foo VALUES (3)") - - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) - assert row['layer_relfiles_total'] == layer_relfiles_remain + 2 - assert row['layer_relfiles_removed'] == 2 - assert row['layer_relfiles_dropped'] == 0 - - # Run GC again, with no changes in the database. Should not remove anything. - log.info("Run GC again, with nothing to do") - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) - assert row['layer_relfiles_total'] == layer_relfiles_remain - assert row['layer_relfiles_removed'] == 0 - assert row['layer_relfiles_dropped'] == 0 - - # - # Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage - # - log.info("Drop table and run GC again") - cur.execute("DROP TABLE foo") - - pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0") - row = pscur.fetchone() - print_gc_result(row) - - # We still cannot remove the latest layers - # because they serve as tombstones for earlier layers. - assert row['layer_relfiles_dropped'] == 0 - # Each relation fork is counted separately, hence 3. - assert row['layer_relfiles_needed_as_tombstone'] == 3 - - # The catalog updates also create new layer files of the catalogs, which - # are counted as 'removed' - assert row['layer_relfiles_removed'] > 0 - - # TODO Change the test to check actual CG of dropped layers. - # Each relation fork is counted separately, hence 3. 
- #assert row['layer_relfiles_dropped'] == 3 - - # TODO: perhaps we should count catalog and user relations separately, - # to make this kind of testing more robust From da8beffc9511d398c5738e408bb7d0d337188ee9 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 9 Mar 2022 19:52:04 +0200 Subject: [PATCH 11/55] Fix logical timeline size tracking --- pageserver/src/branches.rs | 38 ++++++------ pageserver/src/http/routes.rs | 3 +- pageserver/src/layered_repository.rs | 5 +- pageserver/src/pgdatadir_mapping.rs | 87 ++++++++++++++++++++++++---- pageserver/src/tenant_mgr.rs | 1 + 5 files changed, 101 insertions(+), 33 deletions(-) diff --git a/pageserver/src/branches.rs b/pageserver/src/branches.rs index 246d012e8d..912f1d7af0 100644 --- a/pageserver/src/branches.rs +++ b/pageserver/src/branches.rs @@ -23,11 +23,11 @@ use zenith_utils::{crashsafe_dir, logging}; use crate::config::PageServerConf; use crate::pgdatadir_mapping::DatadirTimeline; use crate::repository::{Repository, Timeline}; +use crate::tenant_mgr; use crate::walredo::WalRedoManager; use crate::CheckpointConfig; use crate::RepositoryImpl; use crate::{import_datadir, LOG_FILE_NAME}; -use crate::{repository::RepositoryTimeline, tenant_mgr}; #[derive(Serialize, Deserialize, Clone)] pub struct BranchInfo { @@ -42,10 +42,10 @@ pub struct BranchInfo { } impl BranchInfo { - pub fn from_path>( + pub fn from_path>( path: T, - repo: &R, - _include_non_incremental_logical_size: bool, + tenantid: ZTenantId, + include_non_incremental_logical_size: bool, ) -> Result { let path = path.as_ref(); let name = path.file_name().unwrap().to_string_lossy().to_string(); @@ -58,35 +58,35 @@ impl BranchInfo { })? .parse::()?; - let timeline = match repo.get_timeline(timeline_id)? { - RepositoryTimeline::Local(local_entry) => local_entry, - RepositoryTimeline::Remote { .. } => { - bail!("Timeline {} is remote, no branches to display", timeline_id) + let timeline = match tenant_mgr::get_timeline_for_tenant(tenantid, timeline_id) { + Ok(timeline) => timeline, + Err(err) => { + // FIXME: this was: + // bail!("Timeline {} is remote, no branches to display", timeline_id) + // + // but we cannot distinguish that from other errors now. 
Have + // get_timeline_for_tenant() return a more specific error + return Err(err); } }; // we use ancestor lsn zero if we don't have an ancestor, so turn this into an option based on timeline id - let (ancestor_id, ancestor_lsn) = match timeline.get_ancestor_timeline_id() { + let (ancestor_id, ancestor_lsn) = match timeline.tline.get_ancestor_timeline_id() { Some(ancestor_id) => ( Some(ancestor_id.to_string()), - Some(timeline.get_ancestor_lsn().to_string()), + Some(timeline.tline.get_ancestor_lsn().to_string()), ), None => (None, None), }; // non incremental size calculation can be heavy, so let it be optional // needed for tests to check size calculation - // - // FIXME - /* let current_logical_size_non_incremental = include_non_incremental_logical_size .then(|| { timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn()) }) .transpose()?; - */ - let current_logical_size_non_incremental = Some(0); - let current_logical_size = 0; + let current_logical_size = timeline.get_current_logical_size(); Ok(BranchInfo { name, @@ -94,7 +94,7 @@ impl BranchInfo { latest_valid_lsn: timeline.get_last_record_lsn(), ancestor_id, ancestor_lsn, - current_logical_size, // : timeline.get_current_logical_size(), + current_logical_size, current_logical_size_non_incremental, }) } @@ -268,8 +268,6 @@ pub(crate) fn get_branches( tenantid: &ZTenantId, include_non_incremental_logical_size: bool, ) -> Result> { - let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?; - // Each branch has a corresponding record (text file) in the refs/branches // with timeline_id. let branches_dir = conf.branches_path(tenantid); @@ -292,7 +290,7 @@ pub(crate) fn get_branches( })?; BranchInfo::from_path( dir_entry.path(), - repo.as_ref(), + *tenantid, include_non_incremental_logical_size, ) }) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 4794bf72b9..3449585b63 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -138,8 +138,7 @@ async fn branch_detail_handler(request: Request) -> Result, let response_data = tokio::task::spawn_blocking(move || { let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered(); - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - BranchInfo::from_path(path, repo.as_ref(), include_non_incremental_logical_size) + BranchInfo::from_path(path, tenantid, include_non_incremental_logical_size) }) .await .map_err(ApiError::from_err)??; diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index fd4d494d3b..c3d910549b 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -860,7 +860,10 @@ impl Timeline for LayeredTimeline { } fn hint_partitioning(&self, partitioning: KeyPartitioning, lsn: Lsn) -> Result<()> { - self.partitioning.write().unwrap().replace((partitioning, lsn)); + self.partitioning + .write() + .unwrap() + .replace((partitioning, lsn)); Ok(()) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 01f52e7a2d..fb68490d44 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -18,8 +18,9 @@ use postgres_ffi::{pg_constants, Oid, TransactionId}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::ops::Range; +use std::sync::atomic::{AtomicIsize, Ordering}; use std::sync::{Arc, RwLockReadGuard}; -use tracing::{debug, info, warn}; +use tracing::{debug, error, info, warn}; use 
zenith_utils::bin_ser::BeSer; use zenith_utils::lsn::AtomicLsn; use zenith_utils::lsn::{Lsn, RecordLsn}; @@ -33,6 +34,7 @@ where { pub tline: Arc, pub last_partitioning: AtomicLsn, + pub current_logical_size: AtomicIsize, } #[derive(Debug, Serialize, Deserialize)] @@ -73,9 +75,19 @@ impl DatadirTimeline { DatadirTimeline { tline, last_partitioning: AtomicLsn::new(0), + current_logical_size: AtomicIsize::new(0), } } + pub fn init_logical_size(&self) -> Result<()> { + let last_lsn = self.tline.get_last_record_lsn(); + self.current_logical_size.store( + self.get_current_logical_size_non_incremental(last_lsn)? as isize, + Ordering::SeqCst, + ); + Ok(()) + } + //------------------------------------------------------------------------------ // Public GET functions //------------------------------------------------------------------------------ @@ -270,6 +282,7 @@ impl DatadirTimeline { lsn, pending_updates: HashMap::new(), pending_deletions: Vec::new(), + pending_nblocks: 0, } } @@ -286,17 +299,41 @@ impl DatadirTimeline { /// Retrieve current logical size of the timeline /// /// NOTE: counted incrementally, includes ancestors, - /// doesnt support TwoPhase relishes yet pub fn get_current_logical_size(&self) -> usize { - //todo!() - 0 + let current_logical_size = self.current_logical_size.load(Ordering::Acquire); + match usize::try_from(current_logical_size) { + Ok(sz) => sz, + Err(_) => { + error!( + "current_logical_size is out of range: {}", + current_logical_size + ); + 0 + } + } } /// Does the same as get_current_logical_size but counted on demand. - /// Used in tests to ensure that incremental and non incremental variants match. - pub fn get_current_logical_size_non_incremental(&self, _lsn: Lsn) -> Result { - //todo!() - Ok(0) + /// Used to initialize the logical size tracking on startup. + /// + /// Only relation blocks are counted currently. That excludes metadata, + /// SLRUs, twophase files etc. + pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result { + // Fetch list of database dirs and iterate them + let buf = self.tline.get(DBDIR_KEY, lsn)?; + let dbdir = DbDirectory::des(&buf)?; + + let mut total_size: usize = 0; + for (spcnode, dbnode) in dbdir.dbs { + for rel in self.list_rels(spcnode, dbnode, lsn)? { + let relsize_key = rel_size_to_key(rel); + let mut buf = self.tline.get(relsize_key, lsn)?; + let relsize = buf.get_u32_le(); + + total_size += relsize as usize; + } + } + Ok(total_size * pg_constants::BLCKSZ as usize) } fn collect_keyspace(&self, lsn: Lsn) -> Result { @@ -375,6 +412,7 @@ pub struct DatadirTimelineWriter<'a, R: Repository> { lsn: Lsn, pending_updates: HashMap, pending_deletions: Vec>, + pending_nblocks: isize, } // TODO Currently, Deref is used to allow easy access to read methods from this trait. 
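The logical-size tracking introduced in this patch reduces to a simple pattern: in the hunks that follow, each write batch accumulates a signed block delta locally (pending_nblocks) and applies it to the shared atomic counter once, at commit. A condensed sketch under assumed names (TimelineSize, WriteBatch, rel_extend/rel_truncate are illustrative, not the real API):

use std::sync::atomic::{AtomicIsize, Ordering};

const BLCKSZ: usize = 8192; // PostgreSQL block size

struct TimelineSize {
    current_logical_size: AtomicIsize, // logical size in bytes, shared
}

struct WriteBatch<'a> {
    size: &'a TimelineSize,
    pending_nblocks: isize, // blocks added (+) or removed (-) by this batch
}

impl<'a> WriteBatch<'a> {
    fn rel_extend(&mut self, old_blocks: u32, new_blocks: u32) {
        self.pending_nblocks += new_blocks as isize - old_blocks as isize;
    }
    fn rel_truncate(&mut self, old_blocks: u32, new_blocks: u32) {
        self.pending_nblocks -= old_blocks as isize - new_blocks as isize;
    }
    // Only commit() touches the shared counter: one atomic op per batch.
    fn commit(self) {
        if self.pending_nblocks != 0 {
            self.size
                .current_logical_size
                .fetch_add(self.pending_nblocks * BLCKSZ as isize, Ordering::SeqCst);
        }
    }
}

fn main() {
    let tl = TimelineSize { current_logical_size: AtomicIsize::new(0) };
    let mut batch = WriteBatch { size: &tl, pending_nblocks: 0 };
    batch.rel_extend(0, 10); // a relation grows to 10 blocks
    batch.rel_truncate(10, 4); // and is truncated back to 4
    batch.commit();
    assert_eq!(
        tl.current_logical_size.load(Ordering::SeqCst),
        4 * BLCKSZ as isize
    );
}

Keeping the delta signed makes truncations and drops as cheap as extensions; only init_logical_size() ever has to walk the whole keyspace.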
@@ -534,6 +572,8 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { ); } + // FIXME: update pending_nblocks + // Delete all relations and metadata files for the spcnode/dnode self.delete(dbdir_key_range(spcnode, dbnode)); Ok(()) @@ -568,6 +608,8 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { let buf = nblocks.to_le_bytes(); self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + self.pending_nblocks += nblocks as isize; + // even if nblocks > 0, we don't insert any actual blocks here Ok(()) @@ -577,8 +619,13 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { // Put size let size_key = rel_size_to_key(rel); + + let old_size = self.get(size_key)?.get_u32_le(); + let buf = nblocks.to_le_bytes(); self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + + self.pending_nblocks -= old_size as isize - nblocks as isize; Ok(()) } @@ -629,8 +676,13 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { // Put size let size_key = rel_size_to_key(rel); + + let old_size = self.get(size_key)?.get_u32_le(); + let buf = nblocks.to_le_bytes(); self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + + self.pending_nblocks += nblocks as isize - old_size as isize; Ok(()) } @@ -647,6 +699,11 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { warn!("dropped rel {} did not exist in rel directory", rel); } + // update logical size + let size_key = rel_size_to_key(rel); + let old_size = self.get(size_key)?.get_u32_le(); + self.pending_nblocks -= old_size as isize; + // Delete size entry, as well as all blocks self.delete(rel_key_range(rel)); @@ -704,23 +761,33 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { let writer = self.tline.tline.writer(); let last_partitioning = self.last_partitioning.load(); + let pending_nblocks = self.pending_nblocks; for (key, value) in self.pending_updates { writer.put(key, self.lsn, value)?; } for key_range in self.pending_deletions { - writer.delete(key_range, self.lsn)?; + writer.delete(key_range.clone(), self.lsn)?; } writer.advance_last_record_lsn(self.lsn); - if last_partitioning == Lsn(0) || self.lsn.0 - last_partitioning.0 > TARGET_FILE_SIZE_BYTES / 8 { + if last_partitioning == Lsn(0) + || self.lsn.0 - last_partitioning.0 > TARGET_FILE_SIZE_BYTES / 8 + { let mut partitioning = self.tline.collect_keyspace(self.lsn)?; partitioning.repartition(TARGET_FILE_SIZE_BYTES); self.tline.tline.hint_partitioning(partitioning, self.lsn)?; self.tline.last_partitioning.store(self.lsn); } + if pending_nblocks != 0 { + self.tline.current_logical_size.fetch_add( + pending_nblocks * pg_constants::BLCKSZ as isize, + Ordering::SeqCst, + ); + } + Ok(()) } diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index 7adea39b6a..dc53ffebbe 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -284,6 +284,7 @@ pub fn get_timeline_for_tenant( .with_context(|| format!("cannot fetch timeline {}", timelineid))?; let page_tline = Arc::new(DatadirTimelineImpl::new(tline)); + page_tline.init_logical_size()?; tenant.timelines.insert(timelineid, Arc::clone(&page_tline)); Ok(page_tline) } From e7bd74d5583a311cf271b5487149b8dbd7119a4c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 9 Mar 2022 20:35:16 +0200 Subject: [PATCH 12/55] Tidy up --- pageserver/src/import_datadir.rs | 14 ++---- pageserver/src/keyspace.rs | 6 +-- 
 pageserver/src/layered_repository.rs          |  5 +--
 .../src/layered_repository/delta_layer.rs     | 43 ++++---------
 pageserver/src/layered_repository/filename.rs |  5 +--
 .../src/layered_repository/image_layer.rs     | 21 +--------
 .../src/layered_repository/inmemory_layer.rs  |  9 +---
 .../src/layered_repository/storage_layer.rs   | 42 +++++++++---------
 pageserver/src/pgdatadir_mapping.rs           |  3 +-
 pageserver/src/walreceiver.rs                 |  2 +-
 10 files changed, 43 insertions(+), 107 deletions(-)

diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index e2c646ae2a..bd3de96035 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -201,7 +201,7 @@ fn import_relfile(
             Err(err) => match err.kind() {
                 std::io::ErrorKind::UnexpectedEof => {
                     // reached EOF. That's expected.
-                    // FIXME: maybe check that we read the full length of the file?
+                    ensure!(blknum == nblocks as u32, "unexpected EOF");
                     break;
                 }
                 _ => {
@@ -211,17 +211,11 @@ fn import_relfile(
         };
         blknum += 1;
     }
-    ensure!(blknum == nblocks as u32);
 
     Ok(())
 }
 
-/// FIXME
-/// Import a "non-blocky" file into the repository
-///
-/// This is used for small files like the control file, twophase files etc. that
-/// are just slurped into the repository as one blob.
-///
+/// Import a relmapper (pg_filenode.map) file into the repository
 fn import_relmap_file(
     timeline: &mut DatadirTimelineWriter,
     spcnode: Oid,
@@ -239,6 +233,7 @@ fn import_relmap_file(
     Ok(())
 }
 
+/// Import a twophase state file (pg_twophase/) into the repository
 fn import_twophase_file(
     timeline: &mut DatadirTimelineWriter,
     xid: TransactionId,
@@ -316,7 +311,7 @@ fn import_slru_file(
             Err(err) => match err.kind() {
                 std::io::ErrorKind::UnexpectedEof => {
                     // reached EOF. That's expected.
-                    // FIXME: maybe check that we read the full length of the file?
+                    ensure!(rpageno == nblocks as u32, "unexpected EOF");
                     break;
                 }
                 _ => {
@@ -326,7 +321,6 @@ fn import_slru_file(
         };
         rpageno += 1;
     }
-    ensure!(rpageno == nblocks as u32);
 
     Ok(())
 }
diff --git a/pageserver/src/keyspace.rs b/pageserver/src/keyspace.rs
index bef2fe62a2..d2633b573e 100644
--- a/pageserver/src/keyspace.rs
+++ b/pageserver/src/keyspace.rs
@@ -4,10 +4,8 @@ use crate::repository::{key_range_size, singleton_range, Key};
 
 use postgres_ffi::pg_constants;
 
-// in # of key-value pairs
-// FIXME Size of one segment in pages (128 MB)
-pub const TARGET_FILE_SIZE_BYTES: u64 = 128 * 1024 * 1024;
-pub const TARGET_FILE_SIZE: usize = (TARGET_FILE_SIZE_BYTES / 8192) as usize;
+// Target file size, when creating image and delta layers
+pub const TARGET_FILE_SIZE_BYTES: u64 = 128 * 1024 * 1024; // 128 MB
 
 ///
 /// Represents a set of Keys, in a compact form.
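The ensure!() changes in import_relfile() and import_slru_file() above tighten EOF handling: instead of deferring the length check to a FIXME, the importer verifies that EOF lands exactly on the expected block count, so a truncated data file fails loudly. A simplified sketch of that read loop (import_blocks() is a hypothetical, stripped-down signature; the real functions also write each block into the repository):

use std::io::Read;

const BLCKSZ: usize = 8192;

// Read fixed-size blocks until EOF, then insist that EOF arrived exactly at
// the expected block count instead of silently accepting a short file.
fn import_blocks<R: Read>(mut reader: R, expected_blocks: u32) -> anyhow::Result<u32> {
    let mut buf = [0u8; BLCKSZ];
    let mut blknum: u32 = 0;
    loop {
        match reader.read_exact(&mut buf) {
            Ok(()) => blknum += 1, // a real importer would store the block here
            Err(err) if err.kind() == std::io::ErrorKind::UnexpectedEof => {
                anyhow::ensure!(blknum == expected_blocks, "unexpected EOF");
                return Ok(blknum);
            }
            Err(err) => return Err(err.into()),
        }
    }
}

fn main() -> anyhow::Result<()> {
    // 3 full blocks of zeros import cleanly; a truncated file would error out.
    let data = vec![0u8; 3 * BLCKSZ];
    assert_eq!(import_blocks(&data[..], 3)?, 3);
    Ok(())
}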
diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs
index c3d910549b..4440ca4b58 100644
--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
@@ -1066,7 +1066,7 @@ impl LayeredTimeline {
 
         // Recurse into ancestor if needed
         if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn {
-            info!(
+            trace!(
                 "going into ancestor {}, cont_lsn is {}",
                 timeline.ancestor_lsn,
                 cont_lsn
@@ -1693,7 +1693,6 @@ impl LayeredTimeline {
                     "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}",
                     l.filename().display(),
                     retain_lsn,
-                    //is_dropped, // FIXME
                     l.is_incremental(),
                 );
                 result.layers_needed_by_branches += 1;
@@ -1800,7 +1799,7 @@ impl LayeredTimeline {
             self.walredo_mgr
                 .request_redo(key, request_lsn, base_img, data.records)?;
 
-            // FIXME
+            // FIXME: page caching
             /*
             if let RelishTag::Relation(rel_tag) = &rel {
                 let cache = page_cache::get();
diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs
index d8ecf7d75e..641db7930b 100644
--- a/pageserver/src/layered_repository/delta_layer.rs
+++ b/pageserver/src/layered_repository/delta_layer.rs
@@ -43,7 +43,7 @@ use crate::{ZTenantId, ZTimelineId};
 use anyhow::{bail, Result};
 use log::*;
 use serde::{Deserialize, Serialize};
-use std::collections::{HashMap, HashSet};
+use std::collections::HashMap;
 use zenith_utils::vec_map::VecMap;
 // avoid binding to Write (conflicts with std::io::Write)
 // while being able to use std::fmt::Write's methods
@@ -158,16 +158,6 @@ impl Layer for DeltaLayer {
 
         assert!(self.key_range.contains(&key));
 
-        /* FIXME
-        match &reconstruct_state.img {
-            Some((cached_lsn, _)) if &self.lsn_range.end <= cached_lsn => {
-                reconstruct_state.lsn = *cached_lsn;
-                return Ok(ValueReconstructResult::Complete);
-            }
-            _ => {}
-        }
-        */
-
         {
             // Open the file and lock the metadata in memory
             let inner = self.load()?;
@@ -181,15 +171,6 @@ impl Layer for DeltaLayer {
             if let Some(vec_map) = inner.index.get(&key) {
                 let slice = vec_map.slice_range(lsn_range);
                 for (entry_lsn, pos) in slice.iter().rev() {
-                    /* FIXME
-                    match &reconstruct_state.img {
-                        Some((cached_lsn, _)) if entry_lsn <= cached_lsn => {
-                            return Ok(ValueReconstructResult::Complete);
-                        }
-                        _ => {}
-                    }
-                    */
-
                     let val = Value::des(&utils::read_blob_from_chapter(&values_reader, *pos)?)?;
                     match val {
                         Value::Image(img) => {
@@ -221,14 +202,6 @@ impl Layer for DeltaLayer {
         }
     }
 
-    // Return a set of all distinct Keys present in this layer
-    fn collect_keys(&self, key_range: &Range, keys: &mut HashSet) -> Result<()> {
-        let inner = self.load()?;
-
-        keys.extend(inner.index.keys().filter(|x| key_range.contains(x)));
-        Ok(())
-    }
-
     fn iter(&self) -> Box> + '_> {
         let inner = self.load().unwrap();
 
@@ -647,6 +620,13 @@ impl DeltaLayerWriter {
     }
 }
 
+///
+/// Iterator over all key-value pairs stored in a delta layer
+///
+/// FIXME: This creates a Vector to hold the offsets of all key value pairs.
+/// That takes up quite a lot of memory. Should do this in a more streaming
+/// fashion.
-/// impl<'a> DeltaValueIter<'a> { fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result { let mut index: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); diff --git a/pageserver/src/layered_repository/filename.rs b/pageserver/src/layered_repository/filename.rs index e9d9c9dbbd..cd63f014c4 100644 --- a/pageserver/src/layered_repository/filename.rs +++ b/pageserver/src/layered_repository/filename.rs @@ -47,7 +47,7 @@ impl Ord for DeltaFileName { /// Represents the filename of a DeltaLayer /// -/// --- +/// -__- /// impl DeltaFileName { /// @@ -136,8 +136,7 @@ impl Ord for ImageFileName { /// /// Represents the filename of an ImageLayer /// -/// _____ -/// FIXME +/// -__ impl ImageFileName { /// /// Parse a string as an image file name. Returns None if the filename does not diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index e0cb2d8d02..2ce57e9940 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -28,7 +28,7 @@ use anyhow::{bail, Context, Result}; use bytes::Bytes; use log::*; use serde::{Deserialize, Serialize}; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::fs; use std::io::{BufWriter, Write}; use std::ops::Range; @@ -137,16 +137,6 @@ impl Layer for ImageLayer { assert!(self.key_range.contains(&key)); assert!(lsn_range.end >= self.lsn); - /* FIXME - match reconstruct_state.img { - Some((cached_lsn, _)) if self.lsn <= cached_lsn => { - reconstruct_state.lsn = cached_lsn; - return Ok(ValueReconstructResult::Complete); - } - _ => {} - } - */ - let inner = self.load()?; if let Some(offset) = inner.index.get(&key) { @@ -172,15 +162,6 @@ impl Layer for ImageLayer { } } - fn collect_keys(&self, key_range: &Range, keys: &mut HashSet) -> Result<()> { - let inner = self.load()?; - - let index = &inner.index; - - keys.extend(index.keys().filter(|x| key_range.contains(x))); - Ok(()) - } - fn iter(&self) -> Box>> { todo!(); } diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index b6f06d143b..c623630851 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -15,7 +15,7 @@ use crate::repository::{Key, Value}; use crate::{ZTenantId, ZTimelineId}; use anyhow::Result; use log::*; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::ops::Range; use std::path::PathBuf; use std::sync::RwLock; @@ -171,13 +171,6 @@ impl Layer for InMemoryLayer { } } - fn collect_keys(&self, key_range: &Range, keys: &mut HashSet) -> Result<()> { - let inner = self.inner.read().unwrap(); - - keys.extend(inner.index.keys().filter(|x| key_range.contains(x))); - Ok(()) - } - fn iter(&self) -> Box>> { todo!(); } diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index d508322a81..c5314350c8 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -7,7 +7,6 @@ use crate::walrecord::ZenithWalRecord; use crate::{ZTenantId, ZTimelineId}; use anyhow::Result; use bytes::Bytes; -use std::collections::HashSet; use std::ops::Range; use std::path::PathBuf; @@ -31,22 +30,20 @@ where a.start == b.start && a.end == b.end } -/// FIXME -/// Struct used to communicate across calls to 'get_page_reconstruct_data'. 
+/// Struct used to communicate across calls to 'get_value_reconstruct_data'. /// -/// Before first call to get_page_reconstruct_data, you can fill in 'page_img' -/// if you have an older cached version of the page available. That can save -/// work in 'get_page_reconstruct_data', as it can stop searching for page -/// versions when all the WAL records going back to the cached image have been -/// collected. +/// Before first call, you can fill in 'page_img' if you have an older cached +/// version of the page available. That can save work in +/// 'get_value_reconstruct_data', as it can stop searching for page versions +/// when all the WAL records going back to the cached image have been collected. /// -/// When get_page_reconstruct_data returns Complete, 'page_img' is set to an -/// image of the page, or the oldest WAL record in 'records' is a will_init-type +/// When get_value_reconstruct_data returns Complete, 'img' is set to an image +/// of the page, or the oldest WAL record in 'records' is a will_init-type /// record that initializes the page without requiring a previous image. /// /// If 'get_page_reconstruct_data' returns Continue, some 'records' may have /// been collected, but there are more records outside the current layer. Pass -/// the same PageReconstructData struct in the next 'get_page_reconstruct_data' +/// the same ValueReconstructState struct in the next 'get_value_reconstruct_data' /// call, to collect more records. /// #[derive(Debug)] @@ -70,13 +67,19 @@ pub enum ValueReconstructResult { Missing, } -/// FIXME -/// A Layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs. +/// A Layer contains all data in a "rectangle" consisting of a range of keys and +/// range of LSNs. +/// /// There are two kinds of layers, in-memory and on-disk layers. In-memory -/// layers are used to ingest incoming WAL, and provide fast access -/// to the recent page versions. On-disk layers are stored as files on disk, and -/// are immutable. This trait presents the common functionality of -/// in-memory and on-disk layers. +/// layers are used to ingest incoming WAL, and provide fast access to the +/// recent page versions. On-disk layers are stored as files on disk, and are +/// immutable. This trait presents the common functionality of in-memory and +/// on-disk layers. +/// +/// Furthermore, there are two kinds of on-disk layers: delta and image layers. +/// A delta layer contains all modifications within a range of LSNs and keys. +/// An image layer is a snapshot of all the data in a key-range, at a single +/// LSN /// pub trait Layer: Send + Sync { fn get_tenant_id(&self) -> ZTenantId; @@ -87,7 +90,6 @@ pub trait Layer: Send + Sync { /// Range of segments that this layer covers fn get_key_range(&self) -> Range; - /// FIXME /// Inclusive start bound of the LSN range that this layer holds /// Exclusive end bound of the LSN range that this layer holds. /// @@ -129,11 +131,9 @@ pub trait Layer: Send + Sync { /// Returns true for layers that are represented in memory. fn is_in_memory(&self) -> bool; + /// Iterate through all keys and values stored in the layer fn iter(&self) -> Box> + '_>; - /// Return a set of all distinct Keys present in this layer - fn collect_keys(&self, key_range: &Range, keys: &mut HashSet) -> Result<()>; - /// Release memory used by this layer. There is no corresponding 'load' /// function, that's done implicitly when you call one of the get-functions. 
fn unload(&self) -> Result<()>; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index fb68490d44..2eb8720ae5 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1,6 +1,6 @@ //! //! This provides an abstraction to store PostgreSQL relations and other files -//! in the key-value store +//! in the key-value store that implements the Repository interface. //! //! (TODO: The line between PUT-functions here and walingest.rs is a bit blurry, as //! walingest.rs handles a few things like implicit relation creation and extension. @@ -592,7 +592,6 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { // - update relish header with size pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { - info!("CREAT: {}", rel); // Add it to the directory entry let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); let buf = self.get(dir_key)?; diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index 17771d0151..fd318b9cb7 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -242,7 +242,7 @@ fn walreceiver_main( let startlsn = Lsn::from(xlog_data.wal_start()); let endlsn = startlsn + data.len() as u64; - info!("received XLogData between {} and {}", startlsn, endlsn); + trace!("received XLogData between {} and {}", startlsn, endlsn); waldecoder.feed_bytes(data); From fb79c7f1f0e4237a4af0b47f6d17104111294466 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 10 Mar 2022 00:33:25 +0200 Subject: [PATCH 13/55] Make compaction more concurrent --- pageserver/src/layered_repository.rs | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 4440ca4b58..61d1474b2a 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1434,15 +1434,22 @@ impl LayeredTimeline { // 1. The partitioning was already done by the code in // pgdatadir_mapping.rs. We just use it here. - let partitioning = self.partitioning.read().unwrap(); - if let Some((partitioning, lsn)) = partitioning.as_ref() { + let partitioning_guard = self.partitioning.read().unwrap(); + if let Some((partitioning, lsn)) = partitioning_guard.as_ref() { + // Make a copy of the partitioning, so that we can release + // the lock. Otherwise we could block the WAL receiver. + let lsn = *lsn; + let partitions = partitioning.partitions.clone(); + drop(partitioning_guard); + // 2. Create new image layers for partitions that have been modified // "enough". - for partition in &partitioning.partitions { - if self.time_for_new_image_layer(partition, *lsn, 3)? { - self.create_image_layer(partition, *lsn)?; + for partition in partitions.iter() { + if self.time_for_new_image_layer(partition, lsn, 3)? { + self.create_image_layer(partition, lsn)?; } } + // 3. Compact self.compact_level0(target_file_size)?; } else { @@ -1508,7 +1515,7 @@ impl LayeredTimeline { } fn compact_level0(&self, target_file_size: usize) -> Result<()> { - let mut layers = self.layers.lock().unwrap(); + let layers = self.layers.lock().unwrap(); // We compact or "shuffle" the level-0 delta layers when 10 have // accumulated. @@ -1519,6 +1526,7 @@ impl LayeredTimeline { if level0_deltas.len() < COMPACT_THRESHOLD { return Ok(()); } + drop(layers); // FIXME: this function probably won't work correctly if there's overlap // in the deltas. 
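The locking change in compact_level0() — snapshot the level-0 deltas, drop the layer-map lock for the heavy merge, then re-acquire it to publish results (continued in the next hunk) — is classic lock-scope narrowing. A rough sketch with illustrative types (String stands in for a layer object):

use std::sync::{Arc, Mutex};

struct LayerMap {
    level0_deltas: Vec<Arc<String>>, // stand-ins for Arc<dyn Layer>
    historic: Vec<Arc<String>>,
}

fn compact(layers: &Mutex<LayerMap>) {
    // 1. Short critical section: check the threshold and snapshot the input.
    let level0 = {
        let guard = layers.lock().unwrap();
        if guard.level0_deltas.len() < 10 {
            return;
        }
        guard.level0_deltas.clone() // cheap: clones Arcs, not layer data
    }; // lock released here, so WAL ingestion can keep adding layers

    // 2. The expensive part (k-merge, writing new files) runs unlocked.
    let merged: Vec<Arc<String>> = level0
        .iter()
        .map(|l| Arc::new(format!("compacted({})", l)))
        .collect();

    // 3. Re-acquire the lock briefly to publish the results.
    let mut guard = layers.lock().unwrap();
    guard.historic.extend(merged);
    guard.level0_deltas.clear();
}

fn main() {
    let layers = Mutex::new(LayerMap {
        level0_deltas: (0..10).map(|i| Arc::new(format!("delta-{}", i))).collect(),
        historic: Vec::new(),
    });
    compact(&layers);
    assert_eq!(layers.lock().unwrap().historic.len(), 10);
}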
@@ -1585,6 +1593,7 @@ impl LayeredTimeline { new_layers.push(writer.finish(prev_key.unwrap().next())?); } + let mut layers = self.layers.lock().unwrap(); for l in new_layers { layers.insert_historic(Arc::new(l)); } @@ -1595,6 +1604,7 @@ impl LayeredTimeline { l.delete()?; layers.remove_historic(l.clone()); } + drop(layers); Ok(()) } From dac73328ba0136b5ae8ccc28a41712bddc5e6e7b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 10 Mar 2022 13:18:11 +0200 Subject: [PATCH 14/55] Fix bug where reldir was not written to image layer. --- pageserver/src/pgdatadir_mapping.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 2eb8720ae5..c44e13b11f 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -351,6 +351,8 @@ impl DatadirTimeline { dbs.sort_unstable(); for (spcnode, dbnode) in dbs { result.add_key(relmap_file_key(spcnode, dbnode)); + result.add_key(rel_dir_to_key(spcnode, dbnode)); + let mut rels: Vec = self .list_rels(spcnode, dbnode, lsn)? .iter() From be4aebd7e932732ee1c014fa2175028e3594850e Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 10 Mar 2022 13:36:28 +0200 Subject: [PATCH 15/55] silence clippy --- pageserver/src/layered_repository.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 61d1474b2a..b9c0192c14 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -2057,6 +2057,7 @@ mod tests { let mut updated = [Lsn(0); NUM_KEYS]; let mut lsn = Lsn(0); + #[allow(clippy::needless_range_loop)] for blknum in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); test_key.field6 = blknum as u32; From d19a293e7e81869b6cc1b0b0ae9ce48aec46d9ab Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 10 Mar 2022 14:56:13 +0200 Subject: [PATCH 16/55] Add a test for branching --- pageserver/src/layered_repository.rs | 87 ++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index b9c0192c14..b4bdd7eff8 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1038,6 +1038,9 @@ impl LayeredTimeline { if prev_lsn <= cont_lsn { // Didn't make any progress in last iteration. Error out to avoid // getting stuck in the loop. + + // For debugging purposes, print the path of layers that we traversed + // through. for (r, c, l) in path { error!( "PATH: result {:?}, cont_lsn {}, layer: {}", @@ -2114,4 +2117,88 @@ mod tests { Ok(()) } + + #[test] + fn test_traverse_branches() -> Result<()> { + let repo = RepoHarness::create("test_traverse_branches")?.load(); + let mut tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?; + + const NUM_KEYS: usize = 1000; + + let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap(); + + let mut parts = KeyPartitioning::new(); + + // Track when each page was last modified. Used to assert that + // a read sees the latest page version. 
+ let mut updated = [Lsn(0); NUM_KEYS]; + + let mut lsn = Lsn(0); + #[allow(clippy::needless_range_loop)] + for blknum in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + test_key.field6 = blknum as u32; + let writer = tline.writer(); + writer.put( + test_key, + lsn, + Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + )?; + writer.advance_last_record_lsn(lsn); + updated[blknum] = lsn; + drop(writer); + + parts.add_key(test_key); + } + + parts.repartition(TEST_FILE_SIZE as u64); + tline.hint_partitioning(parts, lsn)?; + + let mut tline_id = TIMELINE_ID; + for _ in 0..50 { + let new_tline_id = ZTimelineId::generate(); + repo.branch_timeline(tline_id, new_tline_id, lsn)?; + tline = if let RepositoryTimeline::Local(local) = repo.get_timeline(new_tline_id)? { + local + } else { + panic!("unexpected timeline state"); + }; + tline_id = new_tline_id; + + for _ in 0..NUM_KEYS { + lsn = Lsn(lsn.0 + 0x10); + let blknum = thread_rng().gen_range(0..NUM_KEYS); + test_key.field6 = blknum as u32; + let writer = tline.writer(); + writer.put( + test_key, + lsn, + Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), + )?; + println!("updating {} at {}", blknum, lsn); + writer.advance_last_record_lsn(lsn); + drop(writer); + updated[blknum] = lsn; + } + + // Read all the blocks + for (blknum, last_lsn) in updated.iter().enumerate() { + test_key.field6 = blknum as u32; + assert_eq!( + tline.get(test_key, lsn)?, + TEST_IMG(&format!("{} at {}", blknum, last_lsn)) + ); + } + + // Perform a cycle of checkpoint, compaction, and GC + println!("checkpointing {}", lsn); + let cutoff = tline.get_last_record_lsn(); + tline.update_gc_info(Vec::new(), cutoff); + tline.checkpoint(CheckpointConfig::Forced)?; + tline.compact(TEST_FILE_SIZE)?; + tline.gc()?; + } + + Ok(()) + } } From dd56eeefbf1ee3cdccd1e5d4bf1ae766621b5da9 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 10 Mar 2022 15:45:50 +0200 Subject: [PATCH 17/55] Crank up logging --- pageserver/src/layered_repository.rs | 5 ++--- pageserver/src/layered_repository/delta_layer.rs | 6 +++--- pageserver/src/layered_repository/image_layer.rs | 2 +- pageserver/src/pgdatadir_mapping.rs | 1 + 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index b4bdd7eff8..1de6d2b023 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1069,10 +1069,9 @@ impl LayeredTimeline { // Recurse into ancestor if needed if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { - trace!( + info!( "going into ancestor {}, cont_lsn is {}", - timeline.ancestor_lsn, - cont_lsn + timeline.ancestor_lsn, cont_lsn ); let ancestor = timeline.get_ancestor_timeline()?; timeline_owned = ancestor; diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 641db7930b..1fbbb9a451 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -583,8 +583,6 @@ impl DeltaLayerWriter { }), }; - trace!("created delta layer {}", &layer.path().display()); - // Rename the file to its final name // // Note: This overwrites any existing file. There shouldn't be any. 
@@ -598,7 +596,9 @@ impl DeltaLayerWriter { lsn_range: self.lsn_range, }, ); - std::fs::rename(self.path, final_path)?; + std::fs::rename(self.path, &final_path)?; + + info!("created delta layer {}", final_path.display()); Ok(layer) } diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index 2ce57e9940..23eec2e34d 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -481,7 +481,7 @@ impl ImageLayerWriter { index: HashMap::new(), }), }; - trace!("created image layer {}", layer.path().display()); + info!("created image layer {}", layer.path().display()); self.finished = true; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index c44e13b11f..6b3f4d1497 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -594,6 +594,7 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { // - update relish header with size pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { + info!("CREATE REL: {}, {} blocks at {}", rel, nblocks, self.lsn); // Add it to the directory entry let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); let buf = self.get(dir_key)?; From 6fb566b46f4ce179f701c3a77baae078a8d63b50 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 10 Mar 2022 16:05:21 +0200 Subject: [PATCH 18/55] Bump vendor/postgres to fix a bug with smgrnblocks() on newly created rel --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 31dc24ab29..3116517411 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 31dc24ab29e6bdd5cfb85920a9c728f759c01b29 +Subproject commit 311651741191a2db2850f0bf6afa7d6101150be7 From 0e3512aad00362159b66771ad19fb761e6d822ff Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 10 Mar 2022 18:50:12 +0200 Subject: [PATCH 19/55] Crank down logging again --- pageserver/src/layered_repository.rs | 2 +- pageserver/src/layered_repository/delta_layer.rs | 2 +- pageserver/src/layered_repository/image_layer.rs | 2 +- pageserver/src/pgdatadir_mapping.rs | 1 - 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 1de6d2b023..a007a4116d 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1069,7 +1069,7 @@ impl LayeredTimeline { // Recurse into ancestor if needed if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { - info!( + trace!( "going into ancestor {}, cont_lsn is {}", timeline.ancestor_lsn, cont_lsn ); diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 1fbbb9a451..8a9c6dc34d 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -598,7 +598,7 @@ impl DeltaLayerWriter { ); std::fs::rename(self.path, &final_path)?; - info!("created delta layer {}", final_path.display()); + trace!("created delta layer {}", final_path.display()); Ok(layer) } diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index 23eec2e34d..2ce57e9940 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -481,7 +481,7 @@ impl ImageLayerWriter { index: HashMap::new(), }), }; - 
info!("created image layer {}", layer.path().display()); + trace!("created image layer {}", layer.path().display()); self.finished = true; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 6b3f4d1497..c44e13b11f 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -594,7 +594,6 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { // - update relish header with size pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { - info!("CREATE REL: {}, {} blocks at {}", rel, nblocks, self.lsn); // Add it to the directory entry let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); let buf = self.get(dir_key)?; From a726b555fbbd337724b3ed898696b1fcadb3ae05 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 10 Mar 2022 22:56:37 +0200 Subject: [PATCH 20/55] Handle tablespaces gracefully. We don't really support tablespaces. But this makes the 'tablespace' Postgres regression test pass, like it did previously. --- pageserver/src/basebackup.rs | 55 ++++++++++++----- pageserver/src/import_datadir.rs | 3 +- pageserver/src/layered_repository.rs | 3 +- pageserver/src/pgdatadir_mapping.rs | 89 +++++++++++++++++----------- pageserver/src/walingest.rs | 22 +++++-- 5 files changed, 114 insertions(+), 58 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 8303804213..64e9fe567e 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -125,8 +125,10 @@ impl<'a> Basebackup<'a> { self.add_slru_segment(kind, segno)?; } } - for (spcnode, dbnode) in self.timeline.list_relmap_files(self.lsn)? { - self.add_relmap_file(spcnode, dbnode)?; + + // Create tablespace directories + for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? { + self.add_dbdir(spcnode, dbnode, has_relmap_file)?; } for xid in self.timeline.list_twophase_files(self.lsn)? { self.add_twophase_file(xid)?; @@ -165,12 +167,26 @@ impl<'a> Basebackup<'a> { } // - // Extract pg_filenode.map files from repository - // Along with them also send PG_VERSION for each database. + // Include database/tablespace directories. // - fn add_relmap_file(&mut self, spcnode: u32, dbnode: u32) -> anyhow::Result<()> { - let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?; - let path = if spcnode == pg_constants::GLOBALTABLESPACE_OID { + // Each directory contains a PG_VERSION file, and the default database + // directories also contain pg_filenode.map files. 
+ // + fn add_dbdir( + &mut self, + spcnode: u32, + dbnode: u32, + has_relmap_file: bool, + ) -> anyhow::Result<()> { + let relmap_img = if has_relmap_file { + let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?; + assert!(img.len() == 512); + Some(img) + } else { + None + }; + + if spcnode == pg_constants::GLOBALTABLESPACE_OID { let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes(); let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?; self.ar.append(&header, version_bytes)?; @@ -178,7 +194,13 @@ impl<'a> Basebackup<'a> { let header = new_tar_header("global/PG_VERSION", version_bytes.len() as u64)?; self.ar.append(&header, version_bytes)?; - String::from("global/pg_filenode.map") // filenode map for global tablespace + if let Some(img) = relmap_img { + // filenode map for global tablespace + let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?; + self.ar.append(&header, &img[..])?; + } else { + warn!("global/pg_filenode.map is missing"); + } } else { // User defined tablespaces are not supported assert!(spcnode == pg_constants::DEFAULTTABLESPACE_OID); @@ -188,16 +210,17 @@ impl<'a> Basebackup<'a> { let header = new_tar_header_dir(&path)?; self.ar.append(&header, &mut io::empty())?; - let dst_path = format!("base/{}/PG_VERSION", dbnode); - let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes(); - let header = new_tar_header(&dst_path, version_bytes.len() as u64)?; - self.ar.append(&header, version_bytes)?; + if let Some(img) = relmap_img { + let dst_path = format!("base/{}/PG_VERSION", dbnode); + let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes(); + let header = new_tar_header(&dst_path, version_bytes.len() as u64)?; + self.ar.append(&header, version_bytes)?; - format!("base/{}/pg_filenode.map", dbnode) + let relmap_path = format!("base/{}/pg_filenode.map", dbnode); + let header = new_tar_header(&relmap_path, img.len() as u64)?; + self.ar.append(&header, &img[..])?; + } }; - assert!(img.len() == 512); - let header = new_tar_header(&path, img.len() as u64)?; - self.ar.append(&header, &img[..])?; Ok(()) } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index bd3de96035..f31fea02f8 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -40,7 +40,6 @@ pub fn import_timeline_from_postgres_datadir( // Scan 'global' let mut relfiles: Vec = Vec::new(); - writer.put_dbdir_creation(pg_constants::GLOBALTABLESPACE_OID, 0)?; for direntry in fs::read_dir(path.join("global"))? 
{ let direntry = direntry?; match direntry.file_name().to_str() { @@ -85,7 +84,7 @@ pub fn import_timeline_from_postgres_datadir( None => continue, Some("PG_VERSION") => { - writer.put_dbdir_creation(pg_constants::DEFAULTTABLESPACE_OID, dboid)?; + //writer.put_dbdir_creation(pg_constants::DEFAULTTABLESPACE_OID, dboid)?; } Some("pg_filenode.map") => import_relmap_file( &mut writer, diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index a007a4116d..b4bdd7eff8 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1071,7 +1071,8 @@ impl LayeredTimeline { if Lsn(cont_lsn.0 - 1) <= timeline.ancestor_lsn { trace!( "going into ancestor {}, cont_lsn is {}", - timeline.ancestor_lsn, cont_lsn + timeline.ancestor_lsn, + cont_lsn ); let ancestor = timeline.get_ancestor_timeline()?; timeline_owned = ancestor; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index c44e13b11f..8fd3e44991 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -39,8 +39,8 @@ where #[derive(Debug, Serialize, Deserialize)] pub struct DbDirectory { - // (spcnode, dbnode) - dbs: HashSet<(Oid, Oid)>, + // (spcnode, dbnode) -> (do relmapper and PG_VERSION files exist) + dbdirs: HashMap<(Oid, Oid), bool>, } #[derive(Debug, Serialize, Deserialize)] @@ -210,12 +210,12 @@ impl DatadirTimeline { Ok(buf) } - pub fn list_relmap_files(&self, lsn: Lsn) -> Result> { + pub fn list_dbdirs(&self, lsn: Lsn) -> Result> { // fetch directory entry let buf = self.tline.get(DBDIR_KEY, lsn)?; let dir = DbDirectory::des(&buf)?; - Ok(dir.dbs) + Ok(dir.dbdirs) } pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result { @@ -324,8 +324,8 @@ impl DatadirTimeline { let dbdir = DbDirectory::des(&buf)?; let mut total_size: usize = 0; - for (spcnode, dbnode) in dbdir.dbs { - for rel in self.list_rels(spcnode, dbnode, lsn)? { + for (spcnode, dbnode) in dbdir.dbdirs.keys() { + for rel in self.list_rels(*spcnode, *dbnode, lsn)? { let relsize_key = rel_size_to_key(rel); let mut buf = self.tline.get(relsize_key, lsn)?; let relsize = buf.get_u32_le(); @@ -347,7 +347,7 @@ impl DatadirTimeline { let buf = self.tline.get(DBDIR_KEY, lsn)?; let dbdir = DbDirectory::des(&buf)?; - let mut dbs: Vec<(Oid, Oid)> = dbdir.dbs.iter().cloned().collect(); + let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); dbs.sort_unstable(); for (spcnode, dbnode) in dbs { result.add_key(relmap_file_key(spcnode, dbnode)); @@ -432,7 +432,7 @@ impl<'a, R: Repository> std::ops::Deref for DatadirTimelineWriter<'a, R> { impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { pub fn init_empty(&mut self) -> Result<()> { let buf = DbDirectory::ser(&DbDirectory { - dbs: HashSet::new(), + dbdirs: HashMap::new(), })?; self.put(DBDIR_KEY, Value::Image(buf.into())); @@ -512,11 +512,26 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { pub fn put_relmap_file(&mut self, spcnode: Oid, dbnode: Oid, img: Bytes) -> Result<()> { // Add it to the directory (if it doesn't exist already) let buf = self.get(DBDIR_KEY)?; - let mut dir = DbDirectory::des(&buf)?; - if dir.dbs.insert((spcnode, dbnode)) { - let buf = DbDirectory::ser(&dir)?; + let mut dbdir = DbDirectory::des(&buf)?; + + let r = dbdir.dbdirs.insert((spcnode, dbnode), true); + if r == None || r == Some(false) { + // The dbdir entry didn't exist, or it contained a + // 'false'. 
The 'insert' call already updated it with + // 'true', now write the updated 'dbdirs' map back. + let buf = DbDirectory::ser(&dbdir)?; self.put(DBDIR_KEY, Value::Image(buf.into())); } + if r == None { + // Create RelDirectory + let buf = RelDirectory::ser(&RelDirectory { + rels: HashSet::new(), + })?; + self.put( + rel_dir_to_key(spcnode, dbnode), + Value::Image(Bytes::from(buf)), + ); + } self.put(relmap_file_key(spcnode, dbnode), Value::Image(img)); Ok(()) @@ -548,23 +563,11 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { Ok(()) } - pub fn put_dbdir_creation(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> { - // Create RelDirectory - let dir_key = rel_dir_to_key(spcnode, dbnode); - - let dir = RelDirectory { - rels: HashSet::new(), - }; - let buf: Bytes = RelDirectory::ser(&dir)?.into(); - self.put(dir_key, Value::Image(buf)); - Ok(()) - } - pub fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> Result<()> { // Remove entry from dbdir let buf = self.get(DBDIR_KEY)?; let mut dir = DbDirectory::des(&buf)?; - if dir.dbs.remove(&(spcnode, dbnode)) { + if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; self.put(DBDIR_KEY, Value::Image(buf.into())); } else { @@ -594,15 +597,36 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { // - update relish header with size pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { - // Add it to the directory entry - let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let buf = self.get(dir_key)?; - let mut dir = RelDirectory::des(&buf)?; + // It's possible that this is the first rel for this db in this tablespace. + // Create the reldir entry for it if so. + let buf = self.get(DBDIR_KEY)?; + let mut dbdir = DbDirectory::des(&buf)?; - if !dir.rels.insert((rel.relnode, rel.forknum)) { + let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); + let mut rel_dir; + if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { + // update dbdir + dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false); + let buf = DbDirectory::ser(&dbdir)?; + self.put(DBDIR_KEY, Value::Image(buf.into())); + + // Create RelDirectory + rel_dir = RelDirectory { + rels: HashSet::new(), + }; + } else { + let buf = self.get(rel_dir_key)?; + rel_dir = RelDirectory::des(&buf)?; + } + + // Add it to the directory entry + if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { bail!("rel {} already exists", rel); } - self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?))); + self.put( + rel_dir_key, + Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)), + ); // Put size let size_key = rel_size_to_key(rel); @@ -611,7 +635,8 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { self.pending_nblocks += nblocks as isize; - // even if nblocks > 0, we don't insert any actual blocks here + // Even if nblocks > 0, we don't insert any actual blocks here. That's up to the + // caller. 
Ok(()) } @@ -1155,8 +1180,6 @@ pub fn create_test_timeline( let mut writer = tline.begin_record(Lsn(8)); writer.init_empty()?; - writer.put_dbdir_creation(0, 111)?; - writer.finish()?; Ok(Arc::new(tline)) } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 0051bf5361..430272c316 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -461,7 +461,9 @@ impl<'a, R: Repository> WalIngest<'a, R> { debug!("ingest_xlog_dbase_create: {} rels", rels.len()); - timeline.put_dbdir_creation(tablespace_id, db_id)?; + // Copy relfilemap + let filemap = timeline.get_relmap_file(src_tablespace_id, src_db_id, req_lsn)?; + timeline.put_relmap_file(tablespace_id, db_id, filemap)?; let mut num_rels_copied = 0; let mut num_blocks_copied = 0; @@ -492,10 +494,6 @@ impl<'a, R: Repository> WalIngest<'a, R> { num_rels_copied += 1; } - // Copy relfilemap - let filemap = timeline.get_relmap_file(src_tablespace_id, src_db_id, req_lsn)?; - timeline.put_relmap_file(tablespace_id, db_id, filemap)?; - info!( "Created database {}/{}, copied {} blocks in {} rels", tablespace_id, db_id, num_blocks_copied, num_rels_copied @@ -514,7 +512,7 @@ impl<'a, R: Repository> WalIngest<'a, R> { relnode: rec.rnode.relnode, forknum: rec.forknum, }; - writer.put_rel_creation(rel, 0)?; + self.put_rel_creation(writer, rel)?; Ok(()) } @@ -833,6 +831,16 @@ impl<'a, R: Repository> WalIngest<'a, R> { Ok(()) } + fn put_rel_creation( + &mut self, + writer: &mut DatadirTimelineWriter, + rel: RelTag, + ) -> Result<()> { + self.relsize_cache.insert(rel, 0); + writer.put_rel_creation(rel, 0)?; + Ok(()) + } + fn put_rel_page_image( &mut self, writer: &mut DatadirTimelineWriter, @@ -999,6 +1007,7 @@ mod tests { fn init_walingest_test(tline: &DatadirTimeline) -> Result> { let mut writer = tline.begin_record(Lsn(0x10)); writer.put_checkpoint(ZERO_CHECKPOINT.clone())?; + writer.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file writer.finish()?; let walingest = WalIngest::new(tline, Lsn(0x10))?; @@ -1012,6 +1021,7 @@ mod tests { let mut walingest = init_walingest_test(&tline)?; let mut writer = tline.begin_record(Lsn(0x20)); + walingest.put_rel_creation(&mut writer, TESTREL_A)?; walingest.put_rel_page_image(&mut writer, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"))?; writer.finish()?; let mut writer = tline.begin_record(Lsn(0x30)); From 3948956e872d3aa7d9e950199a74d431afb1a115 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 10 Mar 2022 23:35:24 +0200 Subject: [PATCH 21/55] Fix pg_table_size() on a view --- pageserver/src/walingest.rs | 2 +- vendor/postgres | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 430272c316..3d32410f41 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1158,7 +1158,7 @@ mod tests { assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true); assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20))?, 1); - // Drop relish + // Drop rel let mut writer = tline.begin_record(Lsn(0x30)); walingest.put_rel_drop(&mut writer, TESTREL_A)?; writer.finish()?; diff --git a/vendor/postgres b/vendor/postgres index 3116517411..5e9bc37322 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 311651741191a2db2850f0bf6afa7d6101150be7 +Subproject commit 5e9bc3732266c072151df20d6772b47ca51e233f From f67d010d1bf3678eb8a287d47bac3fe1eed3e8cc Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Mon, 21 Feb 2022 13:40:25 -0800 Subject: [PATCH 
22/55] Add ps smgr/storage metrics tenant tags Signed-off-by: Dhammika Pathirana Add tenant_id,timeline_id in smgr/storage metrics (#1234) --- pageserver/src/layered_repository.rs | 42 ++++++++++++++++++---------- pageserver/src/page_service.rs | 10 ++++--- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index c3d42d1829..63ade9bb37 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -47,10 +47,8 @@ use crate::walredo::WalRedoManager; use crate::CheckpointConfig; use crate::{ZTenantId, ZTimelineId}; -use zenith_metrics::{ - register_histogram, register_int_gauge_vec, Histogram, IntGauge, IntGaugeVec, -}; use zenith_metrics::{register_histogram_vec, HistogramVec}; +use zenith_metrics::{register_int_gauge_vec, IntGauge, IntGaugeVec}; use zenith_utils::crashsafe_dir; use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn}; use zenith_utils::seqwait::SeqWait; @@ -87,16 +85,17 @@ lazy_static! { static ref STORAGE_TIME: HistogramVec = register_histogram_vec!( "pageserver_storage_time", "Time spent on storage operations", - &["operation"] + &["operation", "tenant_id", "timeline_id"] ) .expect("failed to define a metric"); } // Metrics collected on operations on the storage repository. lazy_static! { - static ref RECONSTRUCT_TIME: Histogram = register_histogram!( + static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!( "pageserver_getpage_reconstruct_time", - "FIXME Time spent on storage operations" + "FIXME Time spent on storage operations", + &["tenant_id", "timeline_id"] ) .expect("failed to define a metric"); } @@ -248,11 +247,19 @@ impl Repository for LayeredRepository { horizon: u64, checkpoint_before_gc: bool, ) -> Result { - STORAGE_TIME - .with_label_values(&["gc"]) - .observe_closure_duration(|| { - self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc) - }) + if let Some(timeline_id) = target_timelineid { + STORAGE_TIME + .with_label_values(&["gc", &self.tenantid.to_string(), &timeline_id.to_string()]) + .observe_closure_duration(|| { + self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc) + }) + } else { + STORAGE_TIME + .with_label_values(&["gc", &self.tenantid.to_string(), "-"]) + .observe_closure_duration(|| { + self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc) + }) + } } fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()> { @@ -859,7 +866,11 @@ impl Timeline for LayeredTimeline { let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { + let tenant_id = self.tenantid.to_string(); + let timeline_id = self.timelineid.to_string(); + RECONSTRUCT_TIME + .with_label_values(&[&tenant_id, &timeline_id]) .observe_closure_duration(|| self.materialize_page(seg, seg_blknum, lsn, &*layer)) } else { // FIXME: This can happen if PostgreSQL extends a relation but never writes @@ -1009,15 +1020,18 @@ impl Timeline for LayeredTimeline { /// checkpoint_internal function, this public facade just wraps it for /// metrics collection. 
fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()> { + let tenant_id = self.tenantid.to_string(); + let timeline_id = self.timelineid.to_string(); + match cconf { CheckpointConfig::Flush => STORAGE_TIME - .with_label_values(&["flush checkpoint"]) + .with_label_values(&["flush checkpoint", &tenant_id, &timeline_id]) .observe_closure_duration(|| self.checkpoint_internal(0, false)), CheckpointConfig::Forced => STORAGE_TIME - .with_label_values(&["forced checkpoint"]) + .with_label_values(&["forced checkpoint", &tenant_id, &timeline_id]) .observe_closure_duration(|| self.checkpoint_internal(0, true)), CheckpointConfig::Distance(distance) => STORAGE_TIME - .with_label_values(&["checkpoint"]) + .with_label_values(&["checkpoint", &tenant_id, &timeline_id]) .observe_closure_duration(|| self.checkpoint_internal(distance, true)), } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 7dc3c8c752..42a099cca5 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -298,7 +298,7 @@ lazy_static! { static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!( "pageserver_smgr_query_time", "Time spent on smgr query handling", - &["smgr_query_type"], + &["smgr_query_type", "tenant_id", "timeline_id"], TIME_BUCKETS.into() ) .expect("failed to define a metric"); @@ -340,20 +340,22 @@ impl PageServerHandler { }; let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?; + let tenant_id = tenantid.to_string(); + let timeline_id = timelineid.to_string(); let response = match zenith_fe_msg { PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME - .with_label_values(&["get_rel_exists"]) + .with_label_values(&["get_rel_exists", &tenant_id, &timeline_id]) .observe_closure_duration(|| { self.handle_get_rel_exists_request(timeline.as_ref(), &req) }), PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME - .with_label_values(&["get_rel_size"]) + .with_label_values(&["get_rel_size", &tenant_id, &timeline_id]) .observe_closure_duration(|| { self.handle_get_nblocks_request(timeline.as_ref(), &req) }), PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME - .with_label_values(&["get_page_at_lsn"]) + .with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id]) .observe_closure_duration(|| { self.handle_get_page_at_lsn_request(timeline.as_ref(), &req) }), From 27dadba52c7543b9bd49b8c506fa74a1587df543 Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Fri, 25 Feb 2022 14:22:48 -0800 Subject: [PATCH 23/55] Fix retain references to layer histograms Signed-off-by: Dhammika Pathirana --- pageserver/src/layered_repository.rs | 80 ++++++++++++++++++---------- 1 file changed, 52 insertions(+), 28 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 63ade9bb37..a6e61cb9e0 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -47,7 +47,7 @@ use crate::walredo::WalRedoManager; use crate::CheckpointConfig; use crate::{ZTenantId, ZTimelineId}; -use zenith_metrics::{register_histogram_vec, HistogramVec}; +use zenith_metrics::{register_histogram_vec, Histogram, HistogramVec}; use zenith_metrics::{register_int_gauge_vec, IntGauge, IntGaugeVec}; use zenith_utils::crashsafe_dir; use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn}; @@ -247,19 +247,15 @@ impl Repository for LayeredRepository { horizon: u64, checkpoint_before_gc: bool, ) -> Result { - if let Some(timeline_id) = target_timelineid { - STORAGE_TIME - .with_label_values(&["gc", 
&self.tenantid.to_string(), &timeline_id.to_string()]) - .observe_closure_duration(|| { - self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc) - }) - } else { - STORAGE_TIME - .with_label_values(&["gc", &self.tenantid.to_string(), "-"]) - .observe_closure_duration(|| { - self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc) - }) - } + let timeline_str = target_timelineid + .map(|x| x.to_string()) + .unwrap_or_else(|| "-".to_string()); + + STORAGE_TIME + .with_label_values(&["gc", &self.tenantid.to_string(), &timeline_str]) + .observe_closure_duration(|| { + self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc) + }) } fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()> { @@ -788,6 +784,12 @@ pub struct LayeredTimeline { // ordering for its operations, but involves private modules, and macro trickery current_logical_size_gauge: IntGauge, + // Metrics histograms + reconstruct_time_histo: Histogram, + checkpoint_time_histo: Histogram, + flush_checkpoint_time_histo: Histogram, + forced_checkpoint_time_histo: Histogram, + /// If `true`, will backup its files that appear after each checkpointing to the remote storage. upload_relishes: AtomicBool, @@ -866,11 +868,7 @@ impl Timeline for LayeredTimeline { let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { - let tenant_id = self.tenantid.to_string(); - let timeline_id = self.timelineid.to_string(); - - RECONSTRUCT_TIME - .with_label_values(&[&tenant_id, &timeline_id]) + self.reconstruct_time_histo .observe_closure_duration(|| self.materialize_page(seg, seg_blknum, lsn, &*layer)) } else { // FIXME: This can happen if PostgreSQL extends a relation but never writes @@ -1020,18 +1018,15 @@ impl Timeline for LayeredTimeline { /// checkpoint_internal function, this public facade just wraps it for /// metrics collection. 
fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()> { - let tenant_id = self.tenantid.to_string(); - let timeline_id = self.timelineid.to_string(); - match cconf { - CheckpointConfig::Flush => STORAGE_TIME - .with_label_values(&["flush checkpoint", &tenant_id, &timeline_id]) + CheckpointConfig::Flush => self + .flush_checkpoint_time_histo .observe_closure_duration(|| self.checkpoint_internal(0, false)), - CheckpointConfig::Forced => STORAGE_TIME - .with_label_values(&["forced checkpoint", &tenant_id, &timeline_id]) + CheckpointConfig::Forced => self + .forced_checkpoint_time_histo .observe_closure_duration(|| self.checkpoint_internal(0, true)), - CheckpointConfig::Distance(distance) => STORAGE_TIME - .with_label_values(&["checkpoint", &tenant_id, &timeline_id]) + CheckpointConfig::Distance(distance) => self + .checkpoint_time_histo .observe_closure_duration(|| self.checkpoint_internal(distance, true)), } } @@ -1130,6 +1125,31 @@ impl LayeredTimeline { let current_logical_size_gauge = LOGICAL_TIMELINE_SIZE .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) .unwrap(); + let reconstruct_time_histo = RECONSTRUCT_TIME + .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) + .unwrap(); + let checkpoint_time_histo = STORAGE_TIME + .get_metric_with_label_values(&[ + "checkpoint", + &tenantid.to_string(), + &timelineid.to_string(), + ]) + .unwrap(); + let flush_checkpoint_time_histo = STORAGE_TIME + .get_metric_with_label_values(&[ + "flush checkpoint", + &tenantid.to_string(), + &timelineid.to_string(), + ]) + .unwrap(); + let forced_checkpoint_time_histo = STORAGE_TIME + .get_metric_with_label_values(&[ + "forced checkpoint", + &tenantid.to_string(), + &timelineid.to_string(), + ]) + .unwrap(); + LayeredTimeline { conf, timelineid, @@ -1149,6 +1169,10 @@ impl LayeredTimeline { ancestor_lsn: metadata.ancestor_lsn(), current_logical_size: AtomicUsize::new(current_logical_size), current_logical_size_gauge, + reconstruct_time_histo, + checkpoint_time_histo, + flush_checkpoint_time_histo, + forced_checkpoint_time_histo, upload_relishes: AtomicBool::new(upload_relishes), write_lock: Mutex::new(()), From b2ad8342d21521226160416efe8e330cf1655852 Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Mon, 28 Feb 2022 16:37:09 -0800 Subject: [PATCH 24/55] Add zid stringify bench test Signed-off-by: Dhammika Pathirana --- zenith_utils/Cargo.toml | 5 +++++ zenith_utils/benches/benchmarks.rs | 22 ++++++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 zenith_utils/benches/benchmarks.rs diff --git a/zenith_utils/Cargo.toml b/zenith_utils/Cargo.toml index b22fcbf748..daaf345f8f 100644 --- a/zenith_utils/Cargo.toml +++ b/zenith_utils/Cargo.toml @@ -37,3 +37,8 @@ bytes = "1.0.1" hex-literal = "0.3" tempfile = "3.2" webpki = "0.21" +criterion = "0.3" + +[[bench]] +name = "benchmarks" +harness = false diff --git a/zenith_utils/benches/benchmarks.rs b/zenith_utils/benches/benchmarks.rs new file mode 100644 index 0000000000..c945d5021c --- /dev/null +++ b/zenith_utils/benches/benchmarks.rs @@ -0,0 +1,22 @@ +#![allow(unused)] + +use criterion::{criterion_group, criterion_main, Criterion}; +use zenith_utils::zid; + +pub fn bench_zid_stringify(c: &mut Criterion) { + // Can only use public methods. + let ztl = zid::ZTenantTimelineId::generate(); + + c.bench_function("zid.to_string", |b| { + b.iter(|| { + // FIXME measurement overhead? 
+ //for _ in 0..1000 { + // ztl.tenant_id.to_string(); + //} + ztl.tenant_id.to_string(); + }) + }); +} + +criterion_group!(benches, bench_zid_stringify); +criterion_main!(benches); From a8a7dc9ca65352ad738e55a3a26a7171a89db17b Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Tue, 1 Mar 2022 14:28:25 -0800 Subject: [PATCH 25/55] Fix zid encoding Signed-off-by: Dhammika Pathirana --- zenith_utils/src/zid.rs | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/zenith_utils/src/zid.rs b/zenith_utils/src/zid.rs index a740d4fb48..e047e38da7 100644 --- a/zenith_utils/src/zid.rs +++ b/zenith_utils/src/zid.rs @@ -112,6 +112,17 @@ impl ZId { rand::thread_rng().fill(&mut tli_buf); ZId::from(tli_buf) } + + fn hex_encode(&self) -> String { + static HEX: &[u8] = b"0123456789abcdef"; + + let mut buf = vec![0u8; self.0.len() * 2]; + for (&b, chunk) in self.0.as_ref().iter().zip(buf.chunks_exact_mut(2)) { + chunk[0] = HEX[((b >> 4) & 0xf) as usize]; + chunk[1] = HEX[(b & 0xf) as usize]; + } + unsafe { String::from_utf8_unchecked(buf) } + } } impl FromStr for ZId { @@ -147,13 +158,13 @@ impl From<[u8; 16]> for ZId { impl fmt::Display for ZId { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(&hex::encode(self.0)) + f.write_str(&self.hex_encode()) } } impl fmt::Debug for ZId { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(&hex::encode(self.0)) + f.write_str(&self.hex_encode()) } } From 5d7bd8643ade07d0e8a1f2ee8c9b535336b65e90 Mon Sep 17 00:00:00 2001 From: Dhammika Pathirana Date: Wed, 2 Mar 2022 14:50:22 -0800 Subject: [PATCH 26/55] Fix page reconstruct time histo Signed-off-by: Dhammika Pathirana --- pageserver/src/layered_repository.rs | 29 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index a6e61cb9e0..9e0df5dab2 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -94,7 +94,7 @@ lazy_static! { lazy_static! { static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!( "pageserver_getpage_reconstruct_time", - "FIXME Time spent on storage operations", + "Time spent on storage operations", &["tenant_id", "timeline_id"] ) .expect("failed to define a metric"); @@ -868,8 +868,7 @@ impl Timeline for LayeredTimeline { let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum); if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? { - self.reconstruct_time_histo - .observe_closure_duration(|| self.materialize_page(seg, seg_blknum, lsn, &*layer)) + self.materialize_page(seg, seg_blknum, lsn, &*layer) } else { // FIXME: This can happen if PostgreSQL extends a relation but never writes // the page. 
See https://github.com/zenithdb/zenith/issues/841 @@ -2022,17 +2021,19 @@ impl LayeredTimeline { let mut layer_ref = layer; let mut curr_lsn = lsn; loop { - let result = layer_ref - .get_page_reconstruct_data(seg_blknum, curr_lsn, &mut data) - .with_context(|| { - format!( - "Failed to get reconstruct data {} {:?} {} {}", - layer_ref.get_seg_tag(), - layer_ref.filename(), - seg_blknum, - curr_lsn, - ) - })?; + let result = self.reconstruct_time_histo.observe_closure_duration(|| { + layer_ref + .get_page_reconstruct_data(seg_blknum, curr_lsn, &mut data) + .with_context(|| { + format!( + "Failed to get reconstruct data {} {:?} {} {}", + layer_ref.get_seg_tag(), + layer_ref.filename(), + seg_blknum, + curr_lsn, + ) + }) + })?; match result { PageReconstructResult::Complete => break, PageReconstructResult::Continue(cont_lsn) => { From bce2da4e5542b2d42b84cb7e6f45740aac7bdade Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 11 Mar 2022 00:53:46 +0200 Subject: [PATCH 27/55] Another 'tablespace' test fix. --- pageserver/src/basebackup.rs | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 64e9fe567e..7882e7b2b2 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -202,7 +202,24 @@ impl<'a> Basebackup<'a> { warn!("global/pg_filenode.map is missing"); } } else { - // User defined tablespaces are not supported + // User defined tablespaces are not supported. However, as + // a special case, if a tablespace/db directory is + // completely empty, we can leave it out altogether. This + // makes taking a base backup after the 'tablespace' + // regression test pass, because the test drops the + // created tablespaces after the tests. + // + // FIXME: this wouldn't be necessary, if we handled + // XLOG_TBLSPC_DROP records. But we probably should just + // throw an error on CREATE TABLESPACE in the first place. + if !has_relmap_file + && self + .timeline + .list_rels(spcnode, dbnode, self.lsn)? + .is_empty() + { + return Ok(()); + } assert!(spcnode == pg_constants::DEFAULTTABLESPACE_OID); // Append dir path for each database From d5b8380dae07ea187999e2b9eb2b6e8d929eadaa Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 11 Mar 2022 09:47:09 +0200 Subject: [PATCH 28/55] Improve comments on image layer. Make it more explicit that if a key doesn't exist in an image layer, it doesn't exist. --- pageserver/src/layered_repository/image_layer.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index 2ce57e9940..550bcda8f7 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -1,8 +1,11 @@ -//! An ImageLayer represents an image or a snapshot of a segment at one particular LSN. -//! It is stored in a file on disk. +//! An ImageLayer represents an image or a snapshot of a key-range at +//! one particular LSN. It contains an image of all key-value pairs +//! in its key-range. Any key that falls into the image layer's range +//! but does not exist in the layer, does not exist. //! -//! On disk, the image files are stored in timelines/ directory. -//! Currently, there are no subdirectories, and each image layer file is named like this: +//! An image layer is stored in a file on disk. The file is stored in +//! timelines/ directory. Currently, there are no +//! 
subdirectories, and each image layer file is named like this: //! //! -__ //! From ee40297758a581faadfe83de19dc807bc4ffb2d7 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 11 Mar 2022 16:24:13 +0200 Subject: [PATCH 29/55] Refactor keyspace code Have separate classes for the KeySpace, a partitioning of the KeySpace (KeyPartitioning), and a builder object used to construct the KeySpace. Previously, KeyPartitioning did all those things, and it was a bit confusing. --- pageserver/src/keyspace.rs | 129 +++++++++++++++++---------- pageserver/src/layered_repository.rs | 22 +++-- pageserver/src/pgdatadir_mapping.rs | 12 +-- 3 files changed, 103 insertions(+), 60 deletions(-) diff --git a/pageserver/src/keyspace.rs b/pageserver/src/keyspace.rs index d2633b573e..274c858338 100644 --- a/pageserver/src/keyspace.rs +++ b/pageserver/src/keyspace.rs @@ -1,30 +1,101 @@ +use crate::repository::{key_range_size, singleton_range, Key}; +use postgres_ffi::pg_constants; use std::ops::Range; -use crate::repository::{key_range_size, singleton_range, Key}; - -use postgres_ffi::pg_constants; - -// Target file size, when creating iage and delta layers +// Target file size, when creating image and delta layers pub const TARGET_FILE_SIZE_BYTES: u64 = 128 * 1024 * 1024; // 128 MB /// /// Represents a set of Keys, in a compact form. /// -#[derive(Debug, Clone)] -pub struct KeyPartitioning { - accum: Option>, - +pub struct KeySpace { + // Contiguous ranges of keys that belong to the key space. In key order, and + // with no overlap. ranges: Vec>, +} +impl KeySpace { + /// + /// Partition a key space into roughly chunks of roughly 'target_size' bytes in + /// each patition. + /// + pub fn partition(&self, target_size: u64) -> KeyPartitioning { + // Assume that each value is 8k in size. + let target_nblocks = (target_size / pg_constants::BLCKSZ as u64) as usize; + + let mut partitions = Vec::new(); + let mut current_part = Vec::new(); + let mut current_part_size: usize = 0; + for range in &self.ranges { + // If appending the next contiguous range in the keyspace to the current + // partition would cause it to be too large, start a new partition. + let this_size = key_range_size(range) as usize; + if current_part_size + this_size > target_nblocks && !current_part.is_empty() { + partitions.push(current_part); + current_part = Vec::new(); + current_part_size = 0; + } + + // If the next range is larger than 'target_size', split it into + // 'target_size' chunks. + let mut remain_size = this_size; + let mut start = range.start; + while remain_size > target_nblocks { + let next = start.add(target_nblocks as u32); + partitions.push(vec![start..next]); + start = next; + remain_size -= target_nblocks + } + current_part.push(start..range.end); + current_part_size += remain_size; + } + + // add last partition that wasn't full yet. + if !current_part.is_empty() { + partitions.push(current_part); + } + + KeyPartitioning { partitions } + } +} + +/// +/// Represents a partitioning of the key space. +/// +/// The only kind of partitioning we do is to partition the key space into +/// partitions that are roughly equal in physical size (see KeySpace::partition). +/// But this data structure could represent any partitioning. +/// +#[derive(Clone, Debug, Default)] +pub struct KeyPartitioning { pub partitions: Vec>>, } impl KeyPartitioning { pub fn new() -> Self { KeyPartitioning { + partitions: Vec::new(), + } + } +} + +/// +/// A helper object, to collect a set of keys and key ranges into a KeySpace +/// object. 
This takes care of merging adjacent keys and key ranges into +/// contiguous ranges. +/// +#[derive(Clone, Debug, Default)] +pub struct KeySpaceAccum { + accum: Option>, + + ranges: Vec>, +} + +impl KeySpaceAccum { + pub fn new() -> Self { + Self { accum: None, ranges: Vec::new(), - partitions: Vec::new(), } } @@ -47,44 +118,12 @@ impl KeyPartitioning { } } - pub fn repartition(&mut self, target_size: u64) { - let target_nblocks = (target_size / pg_constants::BLCKSZ as u64) as usize; + pub fn to_keyspace(mut self) -> KeySpace { if let Some(accum) = self.accum.take() { self.ranges.push(accum); } - - self.partitions = Vec::new(); - - let mut current_part = Vec::new(); - let mut current_part_size: usize = 0; - for range in &self.ranges { - let this_size = key_range_size(range) as usize; - - if current_part_size + this_size > target_nblocks && !current_part.is_empty() { - self.partitions.push(current_part); - current_part = Vec::new(); - current_part_size = 0; - } - - let mut remain_size = this_size; - let mut start = range.start; - while remain_size > target_nblocks { - let next = start.add(target_nblocks as u32); - self.partitions.push(vec![start..next]); - start = next; - remain_size -= target_nblocks - } - current_part.push(start..range.end); - current_part_size += remain_size; - } - if !current_part.is_empty() { - self.partitions.push(current_part); + KeySpace { + ranges: self.ranges, } } } - -impl Default for KeyPartitioning { - fn default() -> Self { - Self::new() - } -} diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index b4bdd7eff8..e64bcde6a7 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1910,6 +1910,7 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { #[cfg(test)] mod tests { use super::*; + use crate::keyspace::KeySpaceAccum; use crate::repository::repo_harness::*; use rand::thread_rng; use rand::Rng; @@ -2009,7 +2010,7 @@ mod tests { let mut lsn = Lsn(0x10); - let mut parts = KeyPartitioning::new(); + let mut keyspace = KeySpaceAccum::new(); let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap(); let mut blknum = 0; @@ -2025,14 +2026,17 @@ mod tests { writer.advance_last_record_lsn(lsn); drop(writer); - parts.add_key(test_key); + keyspace.add_key(test_key); lsn = Lsn(lsn.0 + 0x10); blknum += 1; } let cutoff = tline.get_last_record_lsn(); - parts.repartition(TEST_FILE_SIZE as u64); + let parts = keyspace + .clone() + .to_keyspace() + .partition(TEST_FILE_SIZE as u64); tline.hint_partitioning(parts.clone(), lsn)?; tline.update_gc_info(Vec::new(), cutoff); @@ -2053,7 +2057,7 @@ mod tests { let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap(); - let mut parts = KeyPartitioning::new(); + let mut keyspace = KeySpaceAccum::new(); // Track when each page was last modified. Used to assert that // a read sees the latest page version. @@ -2074,10 +2078,10 @@ mod tests { updated[blknum] = lsn; drop(writer); - parts.add_key(test_key); + keyspace.add_key(test_key); } - parts.repartition(TEST_FILE_SIZE as u64); + let parts = keyspace.to_keyspace().partition(TEST_FILE_SIZE as u64); tline.hint_partitioning(parts, lsn)?; for _ in 0..50 { @@ -2127,7 +2131,7 @@ mod tests { let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap(); - let mut parts = KeyPartitioning::new(); + let mut keyspace = KeySpaceAccum::new(); // Track when each page was last modified. Used to assert that // a read sees the latest page version. 
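To see the refactored pieces in isolation, here is a self-contained miniature of the accumulate/partition flow, with `u32` keys standing in for the pageserver's `Key` type. It keeps only the greedy packing; the real `KeySpace::partition` additionally converts the byte target into a block count:

```rust
use std::ops::Range;

// Miniature KeySpaceAccum: keys are added in ascending order and adjacent
// keys are merged into contiguous ranges.
#[derive(Default)]
struct KeySpaceAccum {
    accum: Option<Range<u32>>,
    ranges: Vec<Range<u32>>,
}

impl KeySpaceAccum {
    fn add_key(&mut self, key: u32) {
        match &mut self.accum {
            Some(r) if r.end == key => r.end = key + 1,
            Some(r) => {
                self.ranges.push(r.clone());
                self.accum = Some(key..key + 1);
            }
            None => self.accum = Some(key..key + 1),
        }
    }

    fn to_keyspace(mut self) -> Vec<Range<u32>> {
        if let Some(r) = self.accum.take() {
            self.ranges.push(r);
        }
        self.ranges
    }
}

// Greedily pack ranges into partitions of at most `target` keys, splitting
// oversized ranges into `target`-sized pieces, as KeySpace::partition does.
fn partition(ranges: &[Range<u32>], target: u32) -> Vec<Vec<Range<u32>>> {
    let (mut parts, mut cur, mut cur_size) = (Vec::new(), Vec::new(), 0);
    for r in ranges {
        let this_size = r.end - r.start;
        if cur_size + this_size > target && !cur.is_empty() {
            parts.push(std::mem::take(&mut cur));
            cur_size = 0;
        }
        let (mut start, mut remain) = (r.start, this_size);
        while remain > target {
            parts.push(vec![start..start + target]);
            start += target;
            remain -= target;
        }
        cur.push(start..r.end);
        cur_size += remain;
    }
    if !cur.is_empty() {
        parts.push(cur);
    }
    parts
}

fn main() {
    let mut accum = KeySpaceAccum::default();
    for key in [1, 2, 3, 10, 11, 12, 13, 14, 15, 16] {
        accum.add_key(key);
    }
    let ranges = accum.to_keyspace(); // [1..4, 10..17]
    println!("{:?}", partition(&ranges, 4)); // [[1..4], [10..14], [14..17]]
}
```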
@@ -2148,10 +2152,10 @@ mod tests { updated[blknum] = lsn; drop(writer); - parts.add_key(test_key); + keyspace.add_key(test_key); } - parts.repartition(TEST_FILE_SIZE as u64); + let parts = keyspace.to_keyspace().partition(TEST_FILE_SIZE as u64); tline.hint_partitioning(parts, lsn)?; let mut tline_id = TIMELINE_ID; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 8fd3e44991..43876760a3 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -7,7 +7,7 @@ //! Clarify that) //! -use crate::keyspace::{KeyPartitioning, TARGET_FILE_SIZE_BYTES}; +use crate::keyspace::{KeySpace, KeySpaceAccum, TARGET_FILE_SIZE_BYTES}; use crate::relish::*; use crate::repository::*; use crate::repository::{Repository, Timeline}; @@ -336,9 +336,9 @@ impl DatadirTimeline { Ok(total_size * pg_constants::BLCKSZ as usize) } - fn collect_keyspace(&self, lsn: Lsn) -> Result { + fn collect_keyspace(&self, lsn: Lsn) -> Result { // Iterate through key ranges, greedily packing them into partitions - let mut result = KeyPartitioning::new(); + let mut result = KeySpaceAccum::new(); // Add dbdir result.add_key(DBDIR_KEY); @@ -404,7 +404,7 @@ impl DatadirTimeline { result.add_key(CONTROLFILE_KEY); result.add_key(CHECKPOINT_KEY); - Ok(result) + Ok(result.to_keyspace()) } } @@ -801,8 +801,8 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { if last_partitioning == Lsn(0) || self.lsn.0 - last_partitioning.0 > TARGET_FILE_SIZE_BYTES / 8 { - let mut partitioning = self.tline.collect_keyspace(self.lsn)?; - partitioning.repartition(TARGET_FILE_SIZE_BYTES); + let keyspace = self.tline.collect_keyspace(self.lsn)?; + let partitioning = keyspace.partition(TARGET_FILE_SIZE_BYTES); self.tline.tline.hint_partitioning(partitioning, self.lsn)?; self.tline.last_partitioning.store(self.lsn); } From d93fc371f348919ed728fd8539f34e4d0a270f9d Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 11 Mar 2022 18:49:36 +0200 Subject: [PATCH 30/55] Import all existing RFCs documents from the separate 'rfcs' repository. 
---
 docs/rfcs/002-storage.md                      | 186 ++++++++++++
 docs/rfcs/003-laptop-cli.md                   | 267 ++++++++++++++++++
 docs/rfcs/004-durability.md                   | 218 ++++++++++++++
 docs/rfcs/005-zenith_local.md                 | 103 +++++++
 docs/rfcs/006-laptop-cli-v2-CLI.md            |  64 +++++
 .../006-laptop-cli-v2-repository-structure.md | 140 +++++++++
 docs/rfcs/007-serverless-on-laptop.md         |  93 ++++++
 docs/rfcs/008-push-pull.md                    |  66 +++++
 docs/rfcs/009-snapshot-first-storage-cli.md   |  56 ++++
 docs/rfcs/009-snapshot-first-storage-pitr.md  | 227 +++++++++++++++
 docs/rfcs/009-snapshot-first-storage.md       | 148 ++++++++++
 docs/rfcs/010-storage_details.md              | 144 ++++++++++
 docs/rfcs/011-retention-policy.md             |  91 ++++++
 docs/rfcs/012-background-tasks.md             |  38 +++
 docs/rfcs/013-term-history.md                 | 147 ++++++++++
 docs/rfcs/README.md                           |  95 +++++++
 docs/rfcs/images/storage.jpeg                 | Bin 0 -> 431075 bytes
 17 files changed, 2083 insertions(+)
 create mode 100644 docs/rfcs/002-storage.md
 create mode 100644 docs/rfcs/003-laptop-cli.md
 create mode 100644 docs/rfcs/004-durability.md
 create mode 100644 docs/rfcs/005-zenith_local.md
 create mode 100644 docs/rfcs/006-laptop-cli-v2-CLI.md
 create mode 100644 docs/rfcs/006-laptop-cli-v2-repository-structure.md
 create mode 100644 docs/rfcs/007-serverless-on-laptop.md
 create mode 100644 docs/rfcs/008-push-pull.md
 create mode 100644 docs/rfcs/009-snapshot-first-storage-cli.md
 create mode 100644 docs/rfcs/009-snapshot-first-storage-pitr.md
 create mode 100644 docs/rfcs/009-snapshot-first-storage.md
 create mode 100644 docs/rfcs/010-storage_details.md
 create mode 100644 docs/rfcs/011-retention-policy.md
 create mode 100644 docs/rfcs/012-background-tasks.md
 create mode 100644 docs/rfcs/013-term-history.md
 create mode 100644 docs/rfcs/README.md
 create mode 100644 docs/rfcs/images/storage.jpeg

diff --git a/docs/rfcs/002-storage.md b/docs/rfcs/002-storage.md
new file mode 100644
index 0000000000..5cac377272
--- /dev/null
+++ b/docs/rfcs/002-storage.md
@@ -0,0 +1,186 @@
+# Zenith storage node — alternative
+
+## **Design considerations**
+
+Simplify storage operations for people => Gain adoption/installs on laptops and small private installations => Attract customers to DBaaS by seamless integration between our tooling and cloud.
+
+Proposed architecture addresses:
+
+- High availability -- tolerates n/2 - 1 failures
+- Multi-tenancy -- one storage for all databases
+- Elasticity -- increase storage size on the go by adding nodes
+- Snapshots / backups / PITR with S3 offload
+- Compression
+
+Minuses are:
+
+- Quite a lot of work
+- Single page access may touch a few disk pages
+- Some bloat in data -- may slow down sequential scans
+
+## **Summary**
+
+The storage cluster is a sharded key-value store with ordered keys. The key (**page_key**) is a tuple of `(pg_id, db_id, timeline_id, rel_id, forkno, segno, pageno, lsn)`. The value is either a page or a page diff/wal record. Each chunk (chunk == shard) stores approx 50-100GB ~~and automatically splits in half when it grows bigger than a soft 100GB limit~~ by having a fixed range of pageno's it is responsible for. Chunk placement on storage nodes is stored in a separate metadata service, so a chunk can be freely moved around the cluster if needed. A chunk itself is a filesystem directory with the following subdirectories:
+
+```
+
+|-chunk_42/
+  |-store/ -- contains lsm with pages/pagediffs ranging from
+  |          page_key_lo to page_key_hi
+  |-wal/
+  | |- db_1234/ db-specific wal files with pages from page_key_lo
+  |             to page_key_hi
+  |
+  |-chunk.meta -- small file with snapshot references
+                  (page_key_prefix+lsn+name)
+                  and PITR regions (page_key_start, page_key_end)
+```
+
+## **Chunk**
+
+A chunk is responsible for storing pages, potentially from different databases and relations. Each page is addressed by a lexicographically ordered tuple (**page_key**) with the following fields:
+
+- `pg_id` -- unique id of a given postgres instance (or postgres cluster, as it is called in the postgres docs)
+- `db_id` -- database that was created by 'CREATE DATABASE' in a given postgres instance
+- `db_timeline` -- used to create Copy-on-Write instances from snapshots, described later
+- `rel_id` -- tuple of (relation_id, 0) for tables and (indexed_relation_id, rel_id) for indices. Done this way so that a table's indices are close to the table itself in our global key space.
+- `(forkno, segno, pageno)` -- page coordinates in postgres data files
+- `lsn_timeline` -- postgres feature, increments when PITR was done.
+- `lsn` -- lsn of the current page version.
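To make the ordering concrete, the key can be modeled as a plain struct with the fields above. This is only a sketch (the field types are invented), but deriving `Ord` yields exactly the lexicographic comparison the design relies on:

```rust
// Sketch of page_key as a Rust struct. #[derive(Ord)] compares fields top to
// bottom, so all versions of one page sort next to each other, ordered by lsn.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
struct PageKey {
    pg_id: u64,
    db_id: u32,
    db_timeline: u32,
    rel_id: (u32, u32), // (relation_id, 0) for tables
    forkno: u8,
    segno: u32,
    pageno: u32,
    lsn_timeline: u32,
    lsn: u64,
}

fn main() {
    let v1 = PageKey {
        pg_id: 1, db_id: 1, db_timeline: 0, rel_id: (0, 16384),
        forkno: 0, segno: 0, pageno: 42, lsn_timeline: 1, lsn: 100,
    };
    let v2 = PageKey { lsn: 200, ..v1 };
    // Two versions of the same page are adjacent in key order, older first.
    assert!(v1 < v2);
}
```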
+
+A chunk stores pages and page diffs ranging from page_key_lo to page_key_hi. The processing node looks at the page referenced by a WAL record and sends the record to the chunk responsible for that page range. When a WAL record arrives at a chunk, it is initially stored in `chunk_id/wal/db_id/wal_segno.wal`. Then a background process moves records from those wal files to the lsm tree in `chunk_id/store`. Or, more precisely, WAL records are materialized into the lsm memtable, and when that memtable is flushed to an SSTable on disk we may trim the wal. That way, some pages that are not durably committed (in the distributed sense) may enter the tree -- here we rely on processing node behavior: a page request from the processing node should contain proper lsm horizons, so that the storage node can respond with the proper page version.
+
+The lsm here is a usual lsm for variable-length values: at first, data is stored in memory in some balanced tree (we hold on to the incoming wal records to be able to regenerate it after restart). When this tree grows big enough, we dump it into a disk file (SSTable), sorting records by key. Then SSTables are merge-sorted in the background into new files. All file operations are sequential and do not require WAL for durability.
+
+The content of an SSTable can look like the following:
+
+```jsx
+(pg_id, db_id, ... , pageno=42, lsn=100) (full 8k page data)
+(pg_id, db_id, ... , pageno=42, lsn=150) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=180) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=200) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=220) (full 8k page data)
+(pg_id, db_id, ... , pageno=42, lsn=250) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=270) (per-page diff)
+(pg_id, db_id, ... , pageno=5000, lsn=100) (full 8k page data)
+```
+
+So a query for `pageno=42 up to lsn=260` would need to find the closest entry less than this key, iterate back to the latest full page, and iterate forward applying diffs. How often a page is materialized in the lsn-version sequence is up to us -- say, every 5th version should be a full page.
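To make the lookup concrete, here is a sketch of that read path over such a sorted listing. It is illustrative only: the key is shortened to `(pageno, lsn)`, `String` stands in for page images and diffs, and string concatenation stands in for WAL redo:

```rust
// Find the newest entry with key <= (pageno, lsn), walk back to the latest
// full image of the page, then re-apply diffs forward.
enum Val {
    Image(String),
    Diff(String),
}

fn read_page(entries: &[((u32, u64), Val)], pageno: u32, lsn: u64) -> Option<String> {
    // `entries` is sorted by key, so keys <= the target form a prefix.
    let pos = entries.partition_point(|(key, _)| *key <= (pageno, lsn));
    // Walk back to the latest full page image.
    let base = (0..pos).rev().find(|&i| {
        let ((p, _), val) = &entries[i];
        *p == pageno && matches!(val, Val::Image(_))
    })?;
    let mut page = match &entries[base].1 {
        Val::Image(img) => img.clone(),
        Val::Diff(_) => unreachable!(),
    };
    // Apply the diffs that follow, up to the requested lsn.
    for ((p, _), val) in &entries[base + 1..pos] {
        if *p != pageno {
            break;
        }
        if let Val::Diff(d) = val {
            page = format!("{page}+{d}"); // stand-in for real WAL redo
        }
    }
    Some(page)
}

fn main() {
    let entries = vec![
        ((42, 100), Val::Image("img@100".into())),
        ((42, 150), Val::Diff("d150".into())),
        ((42, 200), Val::Image("img@200".into())),
        ((5000, 100), Val::Image("other".into())),
    ];
    // A read at lsn=180 starts from the image at lsn=100 and applies the diff at 150.
    assert_eq!(read_page(&entries, 42, 180).unwrap(), "img@100+d150");
}
```

The "every 5th version is a full page" knob bounds how many diffs this forward pass can ever have to apply.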
+
+### **Page deletion**
+
+To delete old pages, we insert a blind deletion marker `(pg_id, db_id, #trim_lsn < 150)` into the lsm tree. During merges, such a marker indicates that all pages with a smaller lsn should be discarded. The delete marker travels down the tree level hierarchy until it reaches the last level. In a non-PITR scenario, where old page versions are not needed at all, such a deletion marker would (on average) stop old page versions from propagating down the tree -- so all bloat would concentrate in the higher tree layers without affecting the bigger bottom layers.
+
+### **Recovery**
+
+Upon storage node restart, recent WAL files are applied to the appropriate pages and the resulting pages are stored in the lsm memtable. This should be fast, since we are not writing anything to disk.
+
+### **Checkpointing**
+
+No such mechanism is needed. Or, we may look at the storage node as a kind of continuous checkpointer.
+
+### **Full page writes (torn page protection)**
+
+The storage node never updates individual pages, only merges SSTables, so torn pages are not an issue.
+
+### **Snapshot**
+
+That is the part that I like about this design -- snapshot creation is an instant and cheap operation that can have a flexible granularity level: whole instance, database, or table. Snapshot creation inserts a record in the `chunk.meta` file with the lsn of this snapshot and a key prefix `(pg_id, db_id, db_timeline, rel_id, *)` that prohibits page deletion within this range. The storage node may not know anything about page internals, but by changing the number of fields in our prefix we may change snapshot granularity.
+
+It is again useful to remap `rel_id` to `(indexed_relation_id, rel_id)`, so that a snapshot of a relation would include its indices. Also, a table snapshot interacts with the catalog in tricky ways. Probably all table snapshots should also hold a catalog snapshot. And when a node is started with such a snapshot, it should check that only tables from the snapshot are queried. I assume here that for snapshot reading one needs to start a new postgres instance.
+
+Storage consumed by a snapshot is proportional to the amount of data changed. We may have some heuristic (calculated based on the cost of different storages) about when to offload an old snapshot to s3. For example, if the current database has more than 40% changed pages with respect to the previous snapshot, then we may offload that snapshot to s3 and release this space.
+
+**Starting db from snapshot**
+
+Starting a database from a snapshot can be done in two ways. First, we may create a new db_id, move all the data from the snapshot to the new db, and start a database. The second option is to create a Copy-on-Write (CoW) instance out of the snapshot, reading old pages from the old snapshot and storing new pages separately. That is why there is a `db_timeline` key field near `db_id` -- a CoW (🐮) database should create a new `db_timeline` and remember the old one. Such a database can keep a hashmap of the pages it has changed, to query pages from the proper snapshot on the first try. `db_timeline` is located near `db_id` so that new page versions generated by the new instance would not bloat the data of the initial snapshot. It is not clear whether it is possible to support "stacked" CoW snapshots efficiently, so we may disallow them. (Well, one way to support them is to move `db_timeline` close to `lsn` -- then we may scan neighboring pages and find the right one. But again, that way we bloat the snapshot with unrelated data and may slow down full scans happening in a different database.)
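A sketch of that CoW read path, with hypothetical names; a real implementation would track changed pages per relation or key range rather than in a flat set:

```rust
// A CoW instance serves a page from its own timeline if it has rewritten it,
// and falls back to the parent snapshot's timeline otherwise.
use std::collections::{HashMap, HashSet};

struct CowDb {
    db_timeline: u32,
    parent_timeline: u32,
    changed: HashSet<u32>,              // pagenos rewritten by this instance
    store: HashMap<(u32, u32), String>, // (db_timeline, pageno) -> page
}

impl CowDb {
    fn get_page(&self, pageno: u32) -> Option<&String> {
        let tl = if self.changed.contains(&pageno) {
            self.db_timeline
        } else {
            self.parent_timeline
        };
        self.store.get(&(tl, pageno))
    }

    fn put_page(&mut self, pageno: u32, page: String) {
        self.changed.insert(pageno);
        self.store.insert((self.db_timeline, pageno), page);
    }
}

fn main() {
    let mut db = CowDb {
        db_timeline: 2,
        parent_timeline: 1,
        changed: HashSet::new(),
        store: HashMap::from([((1, 42), "from snapshot".to_string())]),
    };
    assert_eq!(db.get_page(42).unwrap(), "from snapshot");
    db.put_page(42, "rewritten".to_string());
    assert_eq!(db.get_page(42).unwrap(), "rewritten");
}
```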
### **PITR area** + +In the described scheme PITR is just a prohibition to delete any versions within some key prefix, whether it is a database or a table key prefix. So PITR may have different settings for different tables, databases, etc. + +PITR is quite bloaty, so we may aggressively offload it to s3 -- we may push the same (or bigger) SSTables to s3 and maintain the lsm structure there. + +### **Compression** + +Since we are storing page diffs of variable sizes, there is no structural dependency on the page size and we may compress them. Again, that could be enabled only for pages with certain key prefixes, so we may have this with db/table granularity. + +### **Chunk metadata** + +The chunk metadata is a file that lies in the chunk directory and stores info about current snapshots and PITR regions. The chunk should always consult this data when merging SSTables and applying delete markers. + +### **Chunk splitting** + +*(NB: the following paragraph is about how to avoid page splitting)* + +When a chunk hits some soft storage limit (let's say 100Gb) it should be split in half, and the global metadata about chunk boundaries should be updated. Here I assume that a chunk split is a local operation happening on a single node. The process of chunk splitting looks like the following: + +1. Find a separation key and spawn two new chunks with [lo, mid) and [mid, hi) boundaries. + +2. Prohibit WAL deletion and old-SSTable deletion on the original chunk. + +3. On each lsm layer we would need to split only one SSTable; all others would fit entirely within the left or right range. Symlink/split those files into the new chunks (see the sketch after this section). + +4. Start WAL replay on the new chunks. + +5. Update the global metadata about the new chunk boundaries. + +6. Eventually (the metadata update should be pushed to the processing node by the metadata service) the storage node will start sending WAL and page requests to the new nodes. + +7. A new chunk may start serving read queries when the following conditions are met: + +a) it has received at least one WAL record from the processing node + +b) it has replayed all WAL up to the newly received one + +c) it has checked via downlinks that there were no WAL gaps. + +A chunk split as described here is quite a fast operation when it happens on the local disk -- the vast majority of files will just be moved, without copying anything. I suggest keeping splits always local and not mixing them with moving chunks around the cluster. So if we want to split some chunk but there is only a small amount of free space left on the device, we should first move some chunks away from the node and then proceed with splitting.
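+A rough sketch of step 3 above, the only step that has to look inside SSTables (hypothetical types; WAL handling and the physical rewrite of the straddling table are omitted):
+
+```rust
+use std::path::PathBuf;
+
+struct SsTable {
+    min_key: Vec<u8>,
+    max_key: Vec<u8>,
+    path: PathBuf,
+}
+
+/// On each lsm level, tables entirely below or above the separation key
+/// can simply be symlinked/moved into the left or right child chunk;
+/// only a table that straddles `mid` must be rewritten.
+fn plan_level_split<'a>(
+    level: &'a [SsTable],
+    mid: &[u8],
+) -> (Vec<&'a SsTable>, Vec<&'a SsTable>, Vec<&'a SsTable>) {
+    let (mut left, mut right, mut straddling) = (vec![], vec![], vec![]);
+    for t in level {
+        if t.max_key.as_slice() < mid {
+            left.push(t); // belongs to [lo, mid)
+        } else if t.min_key.as_slice() >= mid {
+            right.push(t); // belongs to [mid, hi)
+        } else {
+            straddling.push(t); // at most one per level; split physically
+        }
+    }
+    (left, right, straddling)
+}
+```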
### Fixed chunks + +An alternative strategy is not to split at all and to have pageno-fixed chunk boundaries. When a table is created we first materialize its chunk by storing only the first new pages, so the chunk is small. Then the chunk grows while the table is filled, but it can't grow substantially bigger than the allowed pageno range, so at most it would be 1GB (or whatever limit we want) plus some bloat due to snapshots and old page versions. + +### **Chunk lsm internals** + +So how to implement the chunk's lsm? + +- Write it from scratch, and use RocksDB to prototype/benchmark, then switch to our own lsm implementation. RocksDB can provide a sanity check for the performance of the home-brewed implementation, and it would be easier to prototype. +- Use postgres as a lego constructor. We may model the memtable with a postgres B-tree referencing some in-memory log of incoming records. SSTable merging may reuse the postgres external merge algorithm, etc. One thing that would definitely not fit (or I didn't come up with an idea of how to fit it) is multi-tenancy. If we are storing pages from different databases we can't use the postgres buffer pool, since there is no db_id in the page header. We could add a new field there, but IMO that would be a no-go for committing to vanilla. + +Another possibility is not to try to fit several databases into one storage node. But then it is a no-go for a multi-tenant cloud installation: we would need to run a lot of storage node instances on one physical storage node, each with its own local page cache. That would be much closer to an ordinary managed RDS. + +Multi-tenant storage makes sense even on a laptop, when you work with different databases, run tests with a temp database, etc. And as an installation grows bigger it makes more and more sense, so it seems important. + +# **Storage fleet** + +- When a database is smaller than the chunk size we can naturally store it in one chunk (since its page_keys would fit in some chunk's [lo, hi) range). + +*(figure)* Few databases are stored in one chunk, replicated three times. + +- When a database can't fit into one storage node it can occupy lots of chunks that were split while the database was growing. Chunk placement on nodes is controlled by us with some automation, but we may always manually move chunks around the cluster. + +*(figure)* Here one big database occupies two sets of nodes. Also, some chunks were moved around to restore the replication factor after a disk failure. In this case we also have "sharded" storage for a big database and issue wal writes to different chunks in parallel. + +## **Chunk placement strategies** + +There are a few scenarios where we may want to move chunks around the cluster: + +- disk usage on some node is high +- some disk experienced a failure +- some node experienced a failure or needs maintenance + +## **Chunk replication** + +Chunk replication may be done by cloning page ranges with respect to some lsn from peer nodes, updating the global metadata, waiting for WAL to arrive, replaying the previous WAL, and becoming online -- more or less like during a chunk split. + diff --git a/docs/rfcs/003-laptop-cli.md b/docs/rfcs/003-laptop-cli.md new file mode 100644 index 0000000000..4d1f0a68f0 --- /dev/null +++ b/docs/rfcs/003-laptop-cli.md @@ -0,0 +1,267 @@ +# Command line interface (end-user) + +The Zenith CLI as described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside the zenith distribution, at least at the start. + +This proposal is focused on managing local installations. For cluster operations, different tooling would be needed. The point of integration between the two is the storage URL: no matter how complex the cluster setup is, it may provide an endpoint where the user may push snapshots. + +The most important concept here is a snapshot, which can be created/pushed/pulled/exported. Also, we may start a temporary read-only postgres instance over any local snapshot. A more complex scenario would consist of several basic operations over snapshots.
+ +# Possible usage scenarios + +## Install zenith, run a postgres + +``` +> brew install pg-zenith +> zenith pg create # creates pgdata with default pattern pgdata$i +> zenith pg list +ID PGDATA USED STORAGE ENDPOINT +primary1 pgdata1 0G zenith-local localhost:5432 +``` + +## Import standalone postgres to zenith + +``` +> zenith snapshot import --from=basebackup://replication@localhost:5432/ oldpg +[====================------------] 60% | 20MB/s +> zenith snapshot list +ID SIZE PARENT +oldpg 5G - + +> zenith pg create --snapshot oldpg +Started postgres on localhost:5432 + +> zenith pg list +ID PGDATA USED STORAGE ENDPOINT +primary1 pgdata1 5G zenith-local localhost:5432 + +> zenith snapshot destroy oldpg +Ok +``` + +Also, we may start the snapshot import implicitly by looking at the snapshot URL scheme: + +``` +> zenith pg create --snapshot basebackup://replication@localhost:5432/ +Downloading snapshot... Done. +Started postgres on localhost:5432 +Destroying snapshot... Done. +``` + +## Pull snapshot with some publicly shared database + +Since we may export the whole snapshot as one big file (tar of basebackup, maybe with some manifest) it may be shared over conventional means: http, ssh, [git+lfs](https://docs.github.com/en/github/managing-large-files/about-git-large-file-storage). + +``` +> zenith pg create --snapshot http://learn-postgres.com/movies_db.zenith movies +``` + +## Create snapshot and push it to the cloud + +``` +> zenith snapshot create pgdata1@snap1 +> zenith snapshot push --to ssh://stas@zenith.tech pgdata1@snap1 +``` + +## Rollback database to the snapshot + +One way to roll back the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot requires a copy of that snapshot, which is a time-consuming operation. Another option that would be cool to support is the ability to create a copy-on-write database from the snapshot without copying data, storing updated pages in a separate location; however, that approach would have performance implications. So to properly roll back the database to an older state we have `zenith pg checkout`. + +``` +> zenith pg list +ID PGDATA USED STORAGE ENDPOINT +primary1 pgdata1 5G zenith-local localhost:5432 + +> zenith snapshot create pgdata1@snap1 + +> zenith snapshot list +ID SIZE PARENT +oldpg 5G - +pgdata1@snap1 6G - +pgdata1@CURRENT 6G - + +> zenith pg checkout pgdata1@snap1 +Stopping postgres on pgdata1. +Rolling back pgdata1@CURRENT to pgdata1@snap1. +Starting postgres on pgdata1. + +> zenith snapshot list +ID SIZE PARENT +oldpg 5G - +pgdata1@snap1 6G - +pgdata1@HEAD{0} 6G - +pgdata1@CURRENT 6G - +``` + +Some notes: pgdata1@CURRENT is an implicit snapshot representing the current state of the database in the data directory. When we check out some snapshot, CURRENT will be set to that snapshot, and the old CURRENT state will be named HEAD{0} (0 is the postgres timeline number; it is incremented after each such checkout). + +## Configure PITR area (Point In Time Recovery) + +A PITR area acts like a continuous snapshot where you can reset the database to any point in time within the area (by area I mean some TTL period or some size limit, both possibly infinite). + +``` +> zenith pitr create --storage s3tank --ttl 30d --name pitr_last_month +``` + +Resetting the database to some state in the past would require creating a snapshot at some lsn / time in this pitr area. + +# Manual + +## storage + +Storage is either a zenith pagestore or s3.
Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestore and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default. + +**zenith storage attach** -t [native|s3] -c key=value -n name + +Attaches/initializes storage. For --type=s3, user credentials and a path should be provided. For --type=native we may support --path=/local/path and --url=zenith.tech/stas/mystore. Another possible term for native is 'zstore'. + + +**zenith storage list** + +Shows the currently attached storages. For example: + +``` +> zenith storage list +NAME USED TYPE OPTIONS PATH +local 5.1G zenith-local /opt/zenith/store/local +local.compr 20.4G zenith-local compression=on /opt/zenith/store/local.compr +zcloud 60G zenith-remote zenith.tech/stas/mystore +s3tank 80G S3 +``` + +**zenith storage detach** + +**zenith storage show** + + + +## pg + +Manages postgres data directories and can start postgres instances with the proper configuration. An experienced user may avoid using these commands (except pg create) and configure/run postgres themselves. + +Pg is the term for a single postgres running on some data. I'm trying to avoid separating datadir management from postgres instance management here -- both concepts are bundled together. + +**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata + +Creates (initializes) a new data directory in the given storage and starts postgres. I imagine that the storage for this operation may only be local; data movement to a remote location happens through snapshots/pitr. + +--no-start: just init the datadir without starting postgres + +--snapshot snap: init from the snapshot. Snap is a name or URL (zenith.tech/stas/mystore/snap1) + +--cow: initialize a Copy-on-Write data directory on top of some snapshot (makes sense if it is a snapshot of a currently running database) + +**zenith pg destroy** + +**zenith pg start** [--replica] pgdata + +Starts postgres with the proper extensions preloaded/installed. + +**zenith pg checkout** + +Rolls the data directory back to some previous snapshot. + +**zenith pg stop** pg_id + +**zenith pg list** + +``` +ROLE PGDATA USED STORAGE ENDPOINT +primary my_pg 5.1G local localhost:5432 +replica-1 localhost:5433 +replica-2 localhost:5434 +primary my_pg2 3.2G local.compr localhost:5435 +- my_pg3 9.2G local.compr - +``` + +**zenith pg show** + +``` +my_pg: + storage: local + space used on local: 5.1G + space used on all storages: 15.1G + snapshots: + on local: + snap1: 1G + snap2: 1G + on zcloud: + snap2: 1G + on s3tank: + snap5: 2G + pitr: + on s3tank: + pitr_one_month: 45G + +``` + +**zenith pg start-rest/graphql** pgdata + +Starts a REST/GraphQL proxy on top of the postgres master. Not sure we should do that, just an idea. + + +## snapshot + +Snapshot creation is cheap -- no actual data is copied; we just start retaining old pages. Snapshot size means the amount of retained data, not all data. A snapshot name looks like pgdata_name@tag_name, where tag_name is set by the user during snapshot creation. There are some reserved tag names: CURRENT represents the current state of the data directory; HEAD{i} represents the data directory state that resided in the database before the i-th checkout. + +**zenith snapshot create** pgdata_name@snap_name + +Creates a new snapshot in the same storage where pgdata_name exists. + +**zenith snapshot push** --to url pgdata_name@snap_name + +Produces a binary stream of the given snapshot.
Under the hood it starts a temporary read-only postgres over this snapshot and sends a basebackup stream. The receiving side should start `zenith snapshot recv` before the push happens. If the url has some special scheme like zenith://, the receiving side may require auth and start `zenith snapshot recv` on the fly. + +**zenith snapshot recv** + +Starts listening on a port for a basebackup stream, prints connection info to stdout (so that the user may use it in the push command), and expects data on that socket. + +**zenith snapshot pull** --from url or path + +Connects to a remote zenith/s3/file and pulls a snapshot. The remote side should be a zenith service or files in our format. + +**zenith snapshot import** --from basebackup://<...> or path + +Creates a new snapshot out of a running postgres via the basebackup protocol, or from basebackup files. + +**zenith snapshot export** + +Starts a read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be zenith's own format, which is handy for us (but I think just a tar of a basebackup would be okay). + +**zenith snapshot diff** snap1 snap2 + +Shows the size of data changed between two snapshots. We may also provide options to diff schema/data in tables; to do that, temporary read-only postgres instances are started. + +**zenith snapshot destroy** + +## pitr + +A pitr represents a wal stream and a ttl policy for that stream. + +XXX: any suggestions on a better name? + +**zenith pitr create** name + +--ttl = inf | period + +--size-limit = inf | limit + +--storage = storage_name + +**zenith pitr extract-snapshot** pitr_name --lsn xxx + +Creates a snapshot out of some lsn in the PITR area. The obtained snapshot may be managed with the snapshot routines (move/send/export). + +**zenith pitr gc** pitr_name + +Forces garbage collection on some PITR area. + +**zenith pitr list** + +**zenith pitr destroy** + + +## console + +**zenith console** + +Opens a browser targeting the web console, with more or less the same functionality as described here. diff --git a/docs/rfcs/004-durability.md b/docs/rfcs/004-durability.md new file mode 100644 index 0000000000..4543be3dae --- /dev/null +++ b/docs/rfcs/004-durability.md @@ -0,0 +1,218 @@ +Durability & Consensus +====================== + +When a transaction commits, a commit record is generated in the WAL. When do we consider the WAL record durable, so that we can acknowledge the commit to the client and be reasonably certain that we will not lose the transaction? + +Zenith uses a group of WAL safekeeper nodes to hold the generated WAL. A WAL record is considered durable when it has been written to a majority of WAL safekeeper nodes. In this document, I use 5 safekeepers, because I have five fingers. A WAL record is durable when at least 3 safekeepers have written it to disk. + +First, assume that only one primary node can be running at a time. This can be achieved by Kubernetes or etcd or some cloud-provider specific facility, or we can implement it ourselves. These options are discussed in later chapters. For now, assume that there is a Magic STONITH Fairy that ensures that. + +In addition to the WAL safekeeper nodes, the WAL is archived in S3. WAL that has been archived to S3 can be removed from the safekeepers, so the safekeepers don't need a lot of disk space.
+ + +----------------+ + +-----> | WAL safekeeper | + | +----------------+ + | +----------------+ + +-----> | WAL safekeeper | ++------------+ | +----------------+ +| Primary | | +----------------+ +| Processing | ---------+-----> | WAL safekeeper | +| Node | | +----------------+ ++------------+ | +----------------+ + \ +-----> | WAL safekeeper | + \ | +----------------+ + \ | +----------------+ + \ +-----> | WAL safekeeper | + \ +----------------+ + \ + \ + \ + \ + \ +--------+ + \ | | + +--> | S3 | + | | + +--------+ + + +Every WAL safekeeper holds a section of WAL, and a VCL value. The WAL can be divided into three portions: + + + VCL LSN + | | + V V +.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX +Archived WAL Completed WAL In-flight WAL + + +Note that all the WAL kept in a safekeeper is one contiguous section. This is different from Aurora: in Aurora, there can be holes in the WAL, and there is a Gossip protocol to fill the holes. That could be implemented in the future, but let's keep it simple for now. WAL needs to be written to a safekeeper in order. However, during crash recovery, In-flight WAL that has already been stored in a safekeeper can be truncated or overwritten. + +The Archived WAL has already been stored in S3, and can be removed from the safekeeper. + +The Completed WAL has been written to at least three safekeepers. The algorithm ensures that it is not lost when at most two nodes fail at the same time. + +The In-flight WAL has been persisted in the safekeeper, but if a crash happens, it may still be overwritten or truncated. + + +The VCL point is determined in the Primary. It is not strictly necessary to store it in the safekeepers, but it allows some optimizations and sanity checks and is probably generally useful for the system as a whole. The VCL values stored in the safekeepers can lag behind the VCL computed by the primary. + + +Primary node Normal operation +----------------------------- + +1. Generate some WAL. + +2. Send the WAL to all the safekeepers that you can reach. + +3. As soon as a quorum of safekeepers have acknowledged that they have received and durably stored the WAL up to that LSN, update the local VCL value in memory, and acknowledge commits to the clients (a small sketch of this computation follows the crash-recovery steps below). + +4. Send the new VCL to all the safekeepers that were part of the quorum. (Optional) + + +Primary Crash recovery +---------------------- + +When a new Primary node starts up, before it can generate any new WAL it needs to contact a majority of the WAL safekeepers to compute the VCL. Remember that there is a Magic STONITH fairy that ensures that only one node process can be doing this at a time. + +1. Contact all WAL safekeepers. Find the Max((Epoch, LSN)) tuple among the ones you can reach. This is the Winner safekeeper, and its LSN becomes the new VCL. + +2. Update the other safekeepers you can reach, by copying all the WAL from the Winner, starting from each safekeeper's old VCL point. Any old In-Flight WAL from a previous Epoch is truncated away. + +3. Increment the Epoch, and send the new Epoch to the quorum of safekeepers. (This ensures that if any of the safekeepers that we could not reach later come back online, they will be considered as older than this in any future recovery.) + +You can now start generating new WAL, starting from the newly-computed VCL.
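+A minimal sketch of that quorum computation (hypothetical helper; the safekeeper acknowledgments are reduced to their flushed LSNs):
+
+```rust
+/// With the durably-flushed LSN reported by each safekeeper, the VCL is
+/// the highest LSN that a majority has written: sort the acks in
+/// descending order and take the quorum-th one.
+fn compute_vcl(mut flushed: Vec<u64>) -> Option<u64> {
+    let quorum = flushed.len() / 2 + 1; // 3 out of 5
+    flushed.sort_unstable_by(|a, b| b.cmp(a)); // descending
+    flushed.get(quorum - 1).copied()
+}
+
+fn main() {
+    // Three of five safekeepers have flushed at least up to 200,
+    // so commits up to LSN 200 may be acknowledged.
+    assert_eq!(compute_vcl(vec![100, 250, 200, 200, 120]), Some(200));
+}
+```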
Optimizations +------------- + +As described, the Primary node sends all the WAL to all the WAL safekeepers. That can be a lot of network traffic. Instead of sending the WAL directly from the Primary, some safekeepers can be daisy-chained off other safekeepers, or there can be a broadcast mechanism among them. There should still be a direct connection from each safekeeper to the Primary for the acknowledgments, though. + +Similarly, the responsibility for archiving WAL to S3 can be delegated to one of the safekeepers, to reduce the load on the primary. + + +Magic STONITH fairy +------------------- + +Now that we have a system that works as long as only one primary node is running at a time, how do we ensure that? + +1. Use etcd to grant a lease on a key. The primary node is only allowed to operate as primary when it's holding a valid lease. If the primary node dies, the lease expires after a timeout period, and a new node is allowed to become the primary. + +2. Use S3 to store the lease. S3's consistency guarantees are more lenient, so in theory you cannot do this safely. In practice, it would probably be OK if you make the lease times and timeouts long enough. This has the advantage that we don't need to introduce a new component to the architecture. + +3. Use Raft or Paxos, with the WAL safekeepers acting as the Acceptors to form the quorum. The next chapter describes this option. + + +Built-in Paxos +-------------- + +The WAL safekeepers act as PAXOS Acceptors, and the Processing nodes as both Proposers and Learners. + +Each WAL safekeeper holds an Epoch value in addition to the VCL and the WAL. Each request by the primary to safekeep WAL is accompanied by an Epoch value. If a safekeeper receives a request with an Epoch that doesn't match its current Accepted Epoch, it must ignore (NACK) it. (In different Paxos papers, Epochs are called "terms" or "round numbers".) + +When a node wants to become the primary, it generates a new Epoch value that is higher than any previously observed Epoch value, and globally unique. + + +Accepted Epoch: 555 VCL LSN + | | + V V +.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX +Archived WAL Completed WAL In-flight WAL + + +Primary node startup (a sketch of the safekeeper side follows this list): + +1. Contact all WAL safekeepers that you can reach (if you cannot connect to a quorum of them, you can give up immediately). Find the latest Epoch among them. + +2. Generate a new globally unique Epoch, greater than the latest Epoch found in the previous step. + +3. Send the new Epoch in a Prepare message to a quorum of safekeepers. (PAXOS Prepare message) + +4. Each safekeeper responds with a Promise. If a safekeeper has already made a promise with a higher Epoch, it doesn't respond (or responds with a NACK). After making a promise, the safekeeper stops responding to any write requests with an earlier Epoch. + +5. Once you have received a majority of promises, you know that the VCL cannot advance on the old Epoch anymore. This effectively kills any old primary server. + +6. Find the highest written LSN among the quorum of safekeepers (these can be included in the Promise messages already). This is the new VCL. If a new node starts the election process after this point, it will compute the same or a higher VCL. + +7. Copy the WAL from the safekeeper with the highest LSN to the other safekeepers in the quorum, using the new Epoch. (PAXOS Accept phase) + +8. You can now start generating new WAL starting from the VCL. If another process starts the election process after this point and gains control of a majority of the safekeepers, we will no longer be able to advance the VCL.
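+A hedged sketch of the safekeeper (Acceptor) side of this protocol (hypothetical types; the durable state handling and the WAL itself are elided):
+
+```rust
+/// Durable per-safekeeper state for the built-in Paxos above.
+struct Safekeeper {
+    promised_epoch: u64,
+    flushed_lsn: u64,
+}
+
+enum Reply {
+    Promise { flushed_lsn: u64 },
+    Ack,
+    Nack,
+}
+
+impl Safekeeper {
+    /// Prepare phase: promise the new epoch only if it is newer than any
+    /// epoch promised before; report our highest LSN so the proposer can
+    /// pick the winner and compute the new VCL.
+    fn handle_prepare(&mut self, epoch: u64) -> Reply {
+        if epoch > self.promised_epoch {
+            self.promised_epoch = epoch;
+            Reply::Promise { flushed_lsn: self.flushed_lsn }
+        } else {
+            Reply::Nack
+        }
+    }
+
+    /// WAL append: requests carrying an epoch older than the promise are
+    /// ignored, which is what fences off a deposed primary.
+    fn handle_append(&mut self, epoch: u64, end_lsn: u64) -> Reply {
+        if epoch < self.promised_epoch {
+            return Reply::Nack;
+        }
+        self.flushed_lsn = end_lsn; // (the WAL itself is fsync'd here)
+        Reply::Ack
+    }
+}
+```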
+ + diff --git a/docs/rfcs/005-zenith_local.md b/docs/rfcs/005-zenith_local.md new file mode 100644 index 0000000000..7b078e9ec0 --- /dev/null +++ b/docs/rfcs/005-zenith_local.md @@ -0,0 +1,103 @@ +# Zenith local + +Here I list some objectives to keep in mind when discussing the zenith-local design, and a proposal that brings all components together. Your comments on both parts are very welcome. + +#### Why do we need it? +- For distribution - this easy-to-use binary will help us to build adoption among developers. +- For internal use - to test all components together. + +In my understanding, we consider it to be just a mock-up version of zenith-cloud. +> Question: How much should we care about durability and security issues for a local setup? + + +#### Why is it better than a simple local postgres? + +- Easy one-line setup. As simple as `cargo install zenith && zenith start` + +- Quick and cheap creation of compute nodes over the same storage. +> Question: How can we describe a use-case for this feature? + +- Zenith-local can work with S3 directly. + +- Push and pull images (snapshots) to remote S3 to exchange data with other users. + +- Quick and cheap snapshot checkouts to switch back and forth in the database history. +> Question: Do we want it in the very first release? This feature seems quite complicated. + +#### Distribution: + +Ideally, just one binary that incorporates all the elements we need. +> Question: Let's discuss the pros and cons of having a separate package with modified PostgreSQL. + +#### Components: + +- **zenith-CLI** - interface for end-users. Turns commands into REST requests and handles responses to show them in a user-friendly way. +The CLI proposal is here: https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md +WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli + +- **zenith-console** - Web UI with the same functionality as the CLI. +>Note: not for the first release. + +- **zenith-local** - entrypoint. Service that starts all other components and handles REST API requests. See the REST API proposal below. + > Idea: spawn all other components as child processes, so that we could shut everything down by stopping zenith-local. + +- **zenith-pageserver** - consists of a storage and a WAL-replaying service (modified PG in the current implementation). +> Question: Probably, for a local setup we should be able to bypass the page storage and interact directly with S3, to avoid double caching in shared buffers and the page server? + +WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src + +- **zenith-S3** - stores base images of the database and WAL in S3 object storage. Imports and exports images from/to zenith. +> Question: How should it operate in a local setup? Will we manage it ourselves or ask the user to provide credentials for an existing S3 object storage (i.e. minio)? +> Question: Do we use it together with the local page store, or are they interchangeable? + +WIP code is ??? + +- **zenith-safekeeper** - receives WAL from postgres, stores it durably, and answers to Postgres that the "sync" succeeded. +> Question: How should it operate in a local setup? In my understanding it should push WAL directly to S3 (if we use it) or store all data locally (if we use local page storage). The latter option seems meaningless (extra overhead and no gain), but it is still good for testing the system.
+ +WIP code is here: https://github.com/libzenith/postgres/tree/main/src/bin/safekeeper + +- **zenith-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. Users can quickly create and destroy compute nodes and work with them as regular postgres databases. + + WIP code is in the main branch and here: https://github.com/libzenith/postgres/commits/compute_node + +#### REST API: + +Service endpoint: `http://localhost:3000` + +Resources: +- /storages - Where data lives: zenith-pageserver or zenith-s3 +- /pgs - Postgres - zenith-computenode +- /snapshots - snapshots **TODO** + +>Question: Do we want to extend this API to manage zenith components? I.e. start the page server, manage safekeepers, and so on? Or will they be hardcoded to just start once and for all? + +Methods and their mapping to CLI: + +- /storages - zenith-pageserver or zenith-s3 + +CLI | REST API +------------- | ------------- +storage attach -n name --type [native\|s3] --path=[datadir\|URL] | PUT -d { "name": "name", "type": "native", "path": "/tmp" } /storages +storage detach -n name | DELETE /storages/:storage_name +storage list | GET /storages +storage show -n name | GET /storages/:storage_name + + +- /pgs - zenith-computenode + +CLI | REST API +------------- | ------------- +pg create -n name -s storage_name | PUT -d { "name": "name", "storage_name": "storage_name" } /pgs +pg destroy -n name | DELETE /pgs/:pg_name +pg start -n name --replica | POST -d {"action": "start", "is_replica":"replica"} /pgs/:pg_name /actions +pg stop -n name | POST -d {"action": "stop"} /pgs/:pg_name /actions +pg promote -n name | POST -d {"action": "promote"} /pgs/:pg_name /actions +pg list | GET /pgs +pg show -n name | GET /pgs/:pg_name + +- /snapshots **TODO** + +CLI | REST API +------------- | ------------- + diff --git a/docs/rfcs/006-laptop-cli-v2-CLI.md b/docs/rfcs/006-laptop-cli-v2-CLI.md new file mode 100644 index 0000000000..a04536922a --- /dev/null +++ b/docs/rfcs/006-laptop-cli-v2-CLI.md @@ -0,0 +1,64 @@ +The Zenith CLI allows you to operate database clusters (catalog clusters) and their commit history, locally and in the cloud. Since ANSI calls them catalog clusters, and "cluster" is a loaded term in modern infrastructure, we will call it a "catalog". + +# CLI v2 (after chatting with Carl) + +Zenith introduces the notion of a repository.
+ +```bash +zenith init +zenith clone zenith://zenith.tech/piedpiper/northwind -- clones a repo to the northwind directory +``` + +Once you have a cluster catalog you can explore it + +```bash +zenith log -- returns a list of commits +zenith status -- reports whether there are changes in the catalog that can be committed +zenith commit -- commits the changes and generates a new commit hash +zenith branch experimental -- creates a branch called experimental based on a given commit hash +``` + +To make changes in the catalog you need to run compute nodes + +```bash +-- here is how you start a compute node +zenith start /home/pipedpiper/northwind:main -- starts a compute instance +zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud +-- you can start a compute node against any hash or branch +zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on a different port) +-- you can start a compute node against any hash or branch +zenith start /home/pipedpiper/northwind: --port 8009 -- start another compute instance (on a different port) + +-- After running some DML you can run +-- zenith status and see how there are two WAL streams, one on top of +-- the main branch +zenith status +-- and another on top of the experimental branch +zenith status -b experimental + +-- you can commit each branch separately +zenith commit main +-- or +zenith commit -c /home/pipedpiper/northwind:experimental +``` + +Starting compute instances against cloud environments + +```bash +-- you can start a compute instance against the cloud environment +-- in this case all of the changes will be streamed into the cloud +zenith start https://zenith.tech/pipedpiper/northwind:main +zenith start https://zenith.tech/pipedpiper/northwind:main +zenith status -c https://zenith.tech/pipedpiper/northwind:main +zenith commit -c https://zenith.tech/pipedpiper/northwind:main +zenith branch -c https://zenith.tech/pipedpiper/northwind: experimental +``` + +Pushing data into the cloud + +```bash +-- pull all the commits from the cloud +zenith pull +-- push all the commits to the cloud +zenith push +``` diff --git a/docs/rfcs/006-laptop-cli-v2-repository-structure.md b/docs/rfcs/006-laptop-cli-v2-repository-structure.md new file mode 100644 index 0000000000..ee4e432182 --- /dev/null +++ b/docs/rfcs/006-laptop-cli-v2-repository-structure.md @@ -0,0 +1,140 @@ +# Repository format + +A Zenith repository is similar to a traditional PostgreSQL backup archive, like a WAL-G bucket or a pgbarman backup catalogue. It holds multiple versions of a PostgreSQL database cluster. + +The distinguishing feature is that you can launch a Zenith Postgres server directly against a branch in the repository, without having to "restore" it first. Also, Zenith manages the storage automatically; there is no separation between full and incremental backups, nor a WAL archive. Zenith relies heavily on the WAL, and uses concepts similar to incremental backups and WAL archiving internally, but this is hidden from the user. + +## Directory structure, version 1 + +This first version is pretty straightforward but not very efficient. Just something to get us started.
+ +The repository directory looks like this: + + .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/ + .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots// + .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history + + .zenith/refs/branches/mybranch + .zenith/refs/tags/foo + .zenith/refs/tags/bar + + .zenith/datadirs/ + +### Timelines + +A timeline is similar to PostgreSQL's timeline, but is identified by a UUID instead of a 32-bit timeline Id. For user convenience, it can be given a name that refers to the UUID (called a branch). + +All WAL is generated on a timeline. You can launch a read-only node against a tag or an arbitrary LSN on a timeline, but in order to write, you need to create a timeline. + +Each timeline is stored in a directory under .zenith/timelines. It consists of a WAL archive, containing all the WAL in the standard PostgreSQL format, under the wal/ subdirectory. + +The 'snapshots/' subdirectory contains "base backups" of the data directory at different LSNs. Each snapshot is simply a copy of the Postgres data directory. + +When a new timeline is forked from a previous timeline, the ancestor timeline's UUID is stored in the 'history' file. + +### Refs + +There are two kinds of named objects in the repository: branches and tags. A branch is a human-friendly name for a timeline UUID, and a tag is a human-friendly name for a specific LSN on a timeline (timeline UUID + LSN). Like in git, these are just for user convenience; you can also use timeline UUIDs and LSNs directly. + +Refs do have one additional purpose though: naming a timeline or LSN prevents it from being automatically garbage collected. + +The refs directory contains a small text file for each tag/branch. It contains the UUID of the timeline (and the LSN, for tags). + +### Datadirs + +.zenith/datadirs contains PostgreSQL data directories. You can launch a Postgres instance on one of them with: + +``` + postgres -D .zenith/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c +``` + +All the actual data is kept in the timeline directories, under .zenith/timelines. The data directories are only needed for active PostgreSQL instances. After an instance is stopped, the data directory can be safely removed. "zenith start" will recreate it quickly from the data in .zenith/timelines if it's missing. + +## Version 2 + +The format described above isn't very different from a traditional daily base backup + WAL archive configuration. The main difference is the nicer naming of branches and tags. + +That's not very efficient. For performance, we need something like incremental backups that don't require making a full copy of all the data, i.e. only store modified files or pages. And instead of having to replay all WAL from the last snapshot, "slice" the WAL into per-relation WAL files and only recover what's needed when a table is accessed. + +In version 2, the file format in the "snapshots" subdirectory gets more advanced. The exact format is TODO. But it should support: +- storing WAL records of individual relations/pages +- storing a delta from an older snapshot +- compression + + +## Operations + +### Garbage collection + +When you run "zenith gc", old timelines that are no longer needed are removed. That involves collecting the list of "unreachable" objects, starting from the named branches and tags. + +Also, if enough WAL has been generated on a timeline since the last snapshot, a new snapshot or delta is created.
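+A minimal sketch of the reachability computation this gc would start from (hypothetical in-memory form of the refs and history files):
+
+```rust
+use std::collections::{HashMap, HashSet};
+
+/// Mark phase: a timeline is reachable if some branch or tag points at
+/// it, or if it is an ancestor (via the 'history' file) of a reachable
+/// timeline. Everything else is garbage and can be removed.
+fn reachable_timelines(
+    refs: &HashMap<String, String>,      // branch/tag name -> timeline UUID
+    ancestors: &HashMap<String, String>, // timeline UUID -> parent UUID
+) -> HashSet<String> {
+    let mut live = HashSet::new();
+    for timeline in refs.values() {
+        let mut cur = Some(timeline.clone());
+        while let Some(id) = cur {
+            if !live.insert(id.clone()) {
+                break; // chain already visited
+            }
+            cur = ancestors.get(&id).cloned();
+        }
+    }
+    live
+}
+```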
+ +### zenith push/pull + +Compare the tags and branches on both servers, and copy the missing ones. For each branch, compare the timeline it points to on both servers. If one is behind the other, copy the missing parts. + +FIXME: how do you prevent confusion if you have two clones of the same repository, launch an instance on the same branch in both clones, and later try to push/pull between them? Perhaps create a new timeline every time you start up an instance? Then you would detect that the timelines have diverged. That would match the "epoch" concept that we have in the WAL safekeeper. + +### zenith checkout/commit + +In this format, there is no concept of a "working tree", and hence no concept of checking out or committing. All modifications are done on a branch or a timeline. As soon as you launch a server, the changes are appended to the timeline. + +You can easily fork off a temporary timeline to emulate a "working tree". You can later remove it and have it garbage collected, or, to "commit", re-point the branch to the new timeline. + +If we want to have a worktree and a "zenith checkout/commit" concept, we can emulate that with a temporary timeline: create the temporary timeline at "zenith checkout", and have "zenith commit" modify the branch to point to the new timeline. diff --git a/docs/rfcs/007-serverless-on-laptop.md b/docs/rfcs/007-serverless-on-laptop.md new file mode 100644 index 0000000000..e6355f4a03 --- /dev/null +++ b/docs/rfcs/007-serverless-on-laptop.md @@ -0,0 +1,93 @@ +How it works now +---------------- + +1. Create a repository, start the page server on it + +``` +$ zenith init +... +created main branch +new zenith repository was created in .zenith + +$ zenith pageserver start +Starting pageserver at '127.0.0.1:64000' in .zenith +Page server started +``` + +2. Create a branch, and start a Postgres instance on it + +``` +$ zenith branch heikki main +branching at end of WAL: 0/15ECF68 + +$ zenith pg create heikki +Initializing Postgres on timeline 76cf9279915be7797095241638e64644... +Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/pg1 port=55432 + +$ zenith pg start pg1 +Starting postgres node at 'host=127.0.0.1 port=55432 user=heikki' +waiting for server to start.... done +server started +``` + + +3. Connect to it and run queries + +``` +$ psql "dbname=postgres port=55432" +psql (14devel) +Type "help" for help. + +postgres=# +``` + + +Proposal: Serverless on your Laptop +----------------------------------- + +We've been talking about doing the "pg create" step automatically at "pg start", to eliminate that step. What if we go further and go serverless on your laptop, so that the workflow becomes just: + +1. Create a repository, start the page server on it (same as before) + +``` +$ zenith init +... +created main branch +new zenith repository was created in .zenith + +$ zenith pageserver start +Starting pageserver at '127.0.0.1:64000' in .zenith +Page server started +``` + +2. Create a branch + +``` +$ zenith branch heikki main +branching at end of WAL: 0/15ECF68 +``` + +3. Connect to it: + +``` +$ psql "dbname=postgres port=5432 branch=heikki" +psql (14devel) +Type "help" for help. + +postgres=# +``` + + +The trick behind the scenes is that when you launch the page server, it starts to listen on port 5432. When you connect to it with psql, it looks at the 'branch' parameter that you passed in the connection string.
It automatically performs the "pg create" and "pg start" steps for that branch, and then forwards the connection to the Postgres instance that it launched. After you disconnect, if there are no more active connections to the server running on the branch, it can automatically shut it down again. + +This is how serverless would work in the cloud. We can do it on your laptop, too.
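+To illustrate, a hedged sketch of the routing logic the page server would need (all names hypothetical; the actual connection forwarding and process management are omitted):
+
+```rust
+use std::collections::HashMap;
+
+/// Map the `branch` startup parameter to a running Postgres, creating
+/// and starting one on demand.
+struct Router {
+    running: HashMap<String, u16>, // branch name -> port of its postgres
+    next_port: u16,
+}
+
+impl Router {
+    /// Returns the port to forward the client connection to.
+    fn route(&mut self, branch: &str) -> u16 {
+        if let Some(&port) = self.running.get(branch) {
+            return port; // instance already up
+        }
+        let port = self.next_port;
+        self.next_port += 1;
+        // here: the equivalent of "zenith pg create" + "zenith pg start"
+        // for `branch`, listening on `port`
+        self.running.insert(branch.to_string(), port);
+        port
+    }
+
+    /// Called when the last client disconnects: the instance can be
+    /// stopped and forgotten until the next connection arrives.
+    fn idle(&mut self, branch: &str) {
+        self.running.remove(branch);
+    }
+}
+```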
diff --git a/docs/rfcs/008-push-pull.md b/docs/rfcs/008-push-pull.md new file mode 100644 index 0000000000..272628e1ce --- /dev/null +++ b/docs/rfcs/008-push-pull.md @@ -0,0 +1,66 @@ +# Push and pull between pageservers + +Here is a proposal for implementing push/pull mechanics between pageservers. We also want to be able to push/pull to S3, but that would depend on the exact storage format, so we don't touch it in this proposal. + +## Origin management + +An origin represents connection info for some remote pageserver. Let's use the same commands as git, except with an explicit list subcommand (git uses `origin -v` for that). + +``` +zenith origin add +zenith origin list +zenith origin remove +``` + +The connection URI is a string of the form `postgresql://user:pass@hostname:port` (https://www.postgresql.org/docs/13/libpq-connect.html#id-1.7.3.8.3.6). We can start with libpq password auth and later add support for client certs, require ssh as a transport, or invent some other kind of transport. + +Behind the scenes, these commands may update a toml file inside the .zenith directory. + +## Push + +### Pushing branch + +``` +zenith push mybranch cloudserver # push to eponymous branch in cloudserver +zenith push mybranch cloudserver:otherbranch # push to a different branch in cloudserver +``` + +The exact mechanics would be slightly different in the following situations (a sketch of the decision logic is given at the end): + +1) Destination branch does not exist. + + That is the simplest scenario. We can just create an empty branch (or timeline, in internal terminology) and transfer all the pages/records that we have in our timeline. Right now each timeline is quite independent of other timelines, so I suggest skipping any checks that there is a common ancestor and just filling it with data. Later, when CoW timelines land in the pageserver, we may add that check and decide whether a timeline belongs to this pageserver repository or not [*]. + + The exact mechanics may be the following: + + * The CLI asks the local pageserver to perform a push and hands over the connection uri: `perform_push `. + * The local pageserver connects to the remote pageserver and runs `branch_push `. The handler for `branch_push` would create the destination timeline and switch the connection to copyboth mode. + * The sending pageserver may start an iterator on that timeline and send all the records as copy messages. + +2) Destination branch exists and its latest_valid_lsn is less than ours. + + In this case, we need to send the missing records. To do that we need to find all pages that were changed since that remote LSN. Right now we don't have any tracking mechanism for that, so let's just iterate over all records and send the ones that are newer than the remote LSN. Later we should probably add a sparse bitmap tracking changed pages, to avoid a full scan. + +3) Destination branch exists and its latest_valid_lsn is bigger than ours. + + In this case, we can't push to that branch. We can only pull. + +### Pulling branch + +Here we need to handle the same three cases, but also keep in mind that the local pageserver can be behind NAT, so we can't trivially reuse pushing by asking the remote to 'perform_push' to our address. So we would need a new set of commands: + +* The CLI calls `perform_pull ` on the local pageserver. +* The local pageserver calls `branch_pull ` on the remote pageserver. +* The remote pageserver sends records in our direction. + +But despite the different set of commands, the code that iterates over the records and the receiving code that inserts those records can be the same for both pull and push. + + + +[*] It looks to me that there are two different possible approaches to handling unrelated timelines: + +1) Allow storing unrelated timelines in one repo. Some timelines may have parents and some may not. +2) Transparently create and manage several repositories in one pageserver. + +But that is the topic for a separate RFC/discussion.
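+A minimal sketch of that three-way decision (hypothetical helper; `latest_valid_lsn` values reduced to plain integers):
+
+```rust
+enum PushPlan {
+    CreateAndSendAll,   // case 1: destination branch does not exist
+    SendNewerThan(u64), // case 2: destination is behind; send missing records
+    Refuse,             // case 3: destination is ahead (or equal); pull instead
+}
+
+/// Decide how to push a branch, following the three cases above.
+fn plan_push(local_lsn: u64, remote_lsn: Option<u64>) -> PushPlan {
+    match remote_lsn {
+        None => PushPlan::CreateAndSendAll,
+        Some(r) if r < local_lsn => PushPlan::SendNewerThan(r),
+        Some(_) => PushPlan::Refuse,
+    }
+}
+```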
diff --git a/docs/rfcs/009-snapshot-first-storage-cli.md b/docs/rfcs/009-snapshot-first-storage-cli.md new file mode 100644 index 0000000000..3f5386c165 --- /dev/null +++ b/docs/rfcs/009-snapshot-first-storage-cli.md @@ -0,0 +1,56 @@ +While working on the export/import commands, I understood that they fit really well into the "snapshot-first design". + +We may think about backups as snapshots in a different format (i.e. plain pgdata format, basebackup tar format, WAL-G format (if they want to support it), and so on). They use the same storage API; the only difference is the code that packs/unpacks the files. + +Even if zenith aims to maintain durability using its own snapshots, backups will be useful for uploading data from postgres to zenith. + +So here is an attempt to design a consistent CLI for the different usage scenarios: + +#### 1. Start empty pageserver. +That is what we have now. +Init an empty pageserver using `initdb` in a temporary directory. + +The `--storage_dest=FILE_PREFIX | S3_PREFIX |...` option defines the object storage type; all other parameters are passed via env variables. Inspired by WAL-G style naming: https://wal-g.readthedocs.io/STORAGES/. + +Save `storage_dest` and other parameters in the config. +Push snapshots to `storage_dest` in the background. + +``` +zenith init --storage_dest=S3_PREFIX +zenith start +``` + +#### 2. Restart pageserver (manually or crash-recovery). +Take `storage_dest` from the pageserver config, start the pageserver from the latest snapshot in `storage_dest`. +Push snapshots to `storage_dest` in the background. + +``` +zenith start +``` + +#### 3. Import. +Start the pageserver from an existing snapshot. +The path to the snapshot is provided via `--snapshot_path=FILE_PREFIX | S3_PREFIX | ...` +Do not save `snapshot_path` and `snapshot_format` in the config, as this is a one-time operation. +Save the `storage_dest` parameters in the config. +Push snapshots to `storage_dest` in the background. +``` +//I.e. we want to start zenith on top of existing $PGDATA and use s3 as a persistent storage. +zenith init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX +zenith start +``` +How do we pass the credentials needed for `snapshot_path`? + +#### 4. Export. +Manually push a snapshot to a `snapshot_path` which differs from `storage_dest`. +Optionally set `snapshot_format`, which can be plain pgdata format or zenith format. +``` +zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata +``` + +#### Notes and questions +- walkeeper s3_offload should use the same (or similar) syntax for storage. How do we set it in the UI? +- Why do we need `zenith init` as a separate command? Can't we init everything at first start? +- We can think of better names for all options. +- Export to plain postgres format will be useless if we are not 100% compatible at the page level. I can recall at least one such difference - the PD_WAL_LOGGED flag in pages. \ No newline at end of file diff --git a/docs/rfcs/009-snapshot-first-storage-pitr.md b/docs/rfcs/009-snapshot-first-storage-pitr.md new file mode 100644 index 0000000000..801613e2c9 --- /dev/null +++ b/docs/rfcs/009-snapshot-first-storage-pitr.md @@ -0,0 +1,227 @@ +# Preface + +GetPage@LSN can be called with older LSNs, and the page server needs to be able to reconstruct older page versions. That's needed for having read-only replicas that lag behind the primary, or that are "anchored" at an older LSN, and internally in the page server when you branch at an older point in time. How do you do that? + +For now, I'm not considering incremental snapshots at all. I don't think that changes things. So whenever you create a snapshot or a snapshot file, it contains an image of all the pages; there is no need to look at an older snapshot file. + +Also, I'm imagining that this works on a per-relation basis, so that each snapshot file contains data for one relation. A "relation" is a fuzzy concept - it could actually be one 1 GB relation segment. Or it could include all the different "forks" of a relation, or you could treat each fork as a separate relation for storage purposes. And once the "non-relational" work is finished, a "relation" could actually mean some other versioned object kept in the PostgreSQL data directory. Let's ignore that for now. + +# Eric's RFC: + +Every now and then, you create a "snapshot". It means that you create a new snapshot file for each relation that was modified after the last snapshot, and write out the contents of the relation as it is/was at the snapshot LSN. The write-ahead log is stored separately in S3 by the WAL safekeeping service, in the original PostgreSQL WAL file format. + + SNAPSHOT @100 WAL + . | + . | + . | + . | + SNAPSHOT @200 | + . | + . | + . | + . | + SNAPSHOT @300 | + . | + . V + IN-MEMORY @400 + +If a GetPage@LSN request comes from the primary, you return the latest page from the in-memory layer. If there is no trace of the page in memory, it means that it hasn't been modified since the last snapshot, so you return the page from the latest snapshot, at LSN 300 in the above example. + +PITR is implemented using the original WAL files: + +If a GetPage@LSN request comes from a read replica with LSN 250, you read the image of the page from the snapshot at LSN 200, and you also scan the WAL between 200 and 250 and apply all WAL records for the requested page, to reconstruct it at LSN 250. + +Scanning the WAL naively for every GetPage@LSN request would be expensive, so in practice you'd construct, once, an in-memory data structure of all the WAL between 200 and 250 that allows quickly looking up the records for a given page. + +## Problems/questions + +I think you'll need to store the list of snapshot LSNs on each timeline somewhere. + +If the latest snapshot of a relation is at LSN 100, and you request a page at LSN 1000000, how do you know if there are some modifications to it between 100 and 1000000 that you need to replay? You can scan all the WAL between 100 and 1000000, but that would be expensive. + +You can skip that if you know that a snapshot was taken e.g. at LSN 999900. Then the fact that there is no snapshot file at 999900 means that the relation hasn't been modified between 100-999900, and you only need to scan the WAL between 999900 and 1000000.
However, there is no trace of a snapshot happening at LSN 999900 in the snapshot file for this relation, so you need to get that information from somewhere else. + +Where do you get that information from? Perhaps you can scan all the other relations, and if you see a snapshot file for *any* relation at LSN 999900, you know that if there were modifications to this relation, there would be a newer snapshot file for it, too. In other words, the list of snapshots that have been taken can be constructed by scanning all relations and computing the union of all snapshot LSNs that you see for any relation. But that's expensive, so at least you should keep that in memory after computing it once. Also, if you rely on that, it's not possible to have snapshots at different intervals for different files. That seems limiting. + +Another option is to explicitly store a list of snapshot LSNs in a separate metadata file. + + +# Current implementation in the 'layered_repo' branch: + +We store snapshot files like in the RFC, but each snapshot file also contains all the WAL in the range of LSNs, so that you don't need to fetch the WAL separately from S3. So you have "layers" like this: + + SNAPSHOT+WAL 100-200 + | + | + | + | + SNAPSHOT+WAL 200-300 + | + | + | + | + IN-MEMORY 300- + +Each "snapshot+WAL" is a file that contains a snapshot - i.e. a full copy of each page in the relation at the *start* LSN. In addition to that, it contains all the WAL applicable to the relation from the start LSN to the end LSN. With that, you can reconstruct any page version in the range that the file covers. + + +## Problems/questions + +I can see one potential performance issue here, compared to the RFC. Let's focus on a single relation for now. Imagine that you start from an empty relation, and you receive WAL from 100 to 200, containing a bunch of inserts and updates to the relation. You now have all that WAL in memory: + + memory: WAL from 100-200 + +We decide that it's time to materialize that to a snapshot file on disk. We materialize a full image of the relation as it was at LSN 100 to the snapshot file, and include all of the WAL. Since the relation was initially empty, the "image" at the beginning of the range is empty too. + +So now you have one file on disk: + + SNAPSHOT+WAL 100-200 + +It contains a full image of the relation at LSN 100 and all WAL between 100-200. (It's actually stored as a serialized BTreeMap of page versions, with the page images and WAL records all stored together in the same BTreeMap. But for this story, that's not important.) + +We now receive more WAL updating the relation, up to LSN 300. We decide it's time to materialize a new snapshot file, and we now have two files: + + SNAPSHOT+WAL 100-200 + SNAPSHOT+WAL 200-300 + +Note that the latest "full snapshot" that we store on disk always lags behind by one snapshot cycle. The first file contains a full image of the relation at LSN 100, the second at LSN 200. When we have received WAL up to LSN 300, we write a materialized image at LSN 200. That seems a bit silly. In the design per your RFC, you would write snapshots at LSNs 200 and 300 instead. That seems better.
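+For reference, a hedged sketch of the read path in this "snapshot+WAL" layering (hypothetical, heavily simplified types; per the text above, the real implementation keeps a BTreeMap of page versions instead):
+
+```rust
+use std::collections::HashMap;
+
+/// A "snapshot+WAL" layer covering [start_lsn, end_lsn): full images of
+/// all pages at start_lsn, plus the WAL records in that range.
+struct Layer {
+    start_lsn: u64,
+    end_lsn: u64,
+    images: HashMap<u32, Vec<u8>>, // pageno -> page image at start_lsn
+    wal: Vec<(u64, u32, Vec<u8>)>, // (lsn, pageno, record), ordered by lsn
+}
+
+/// Find the layer covering `lsn`, start from the page image at the
+/// layer's start, and replay that page's WAL records up to `lsn`.
+fn get_page_at_lsn(layers: &[Layer], pageno: u32, lsn: u64) -> Option<Vec<u8>> {
+    let layer = layers
+        .iter()
+        .find(|l| l.start_lsn <= lsn && lsn < l.end_lsn)?;
+    let mut page = layer.images.get(&pageno)?.clone();
+    for (rec_lsn, rec_pageno, record) in &layer.wal {
+        if *rec_pageno == pageno && *rec_lsn <= lsn {
+            redo(&mut page, record); // placeholder for WAL redo
+        }
+    }
+    Some(page)
+}
+
+fn redo(_page: &mut Vec<u8>, _record: &[u8]) {}
+```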
# Third option (not implemented yet) + +Store snapshot files like in the RFC, but also store per-relation WAL files that contain the WAL in a range of LSNs for that relation. + + SNAPSHOT @100 WAL 100-200 + . | + . | + . | + . | + SNAPSHOT @200 WAL 200-300 + . | + . | + . | + . | + SNAPSHOT @300 + . + . + IN-MEMORY 300- + + +This could be the best of both worlds. The snapshot files would be independent of the PostgreSQL WAL format. When it's time to write snapshot file @300, you write a full image of the relation at LSN 300, and you write the WAL that you had accumulated between 200 and 300 to a separate file. That way, you don't "lag behind" by one snapshot cycle like in the current implementation. But you still have the WAL for a particular relation readily available alongside the snapshot files, and you don't need to track which snapshot LSNs exist separately. + +(If we wanted to minimize the number of files, we could include the snapshot @300 and the WAL between 200 and 300 in the same file, but I feel it's probably better to keep them separate.) + + + +# Further thoughts + +There's no fundamental reason why the LSNs of the snapshot files and the ranges of the WAL files would need to line up. So this would be possible too: + + SNAPSHOT @100 WAL 100-150 + . | + . | + . WAL 150-250 + . | + SNAPSHOT @200 | + . | + . WAL 250-400 + . | + . | + SNAPSHOT @300 | + . | + . | + IN-MEMORY 300- + +I'm not sure what the benefit of this would be. You could materialize additional snapshot files in the middle of a range covered by a WAL file, maybe? That might be useful to speed up access when you create a new branch in the middle of an LSN range, or if there's some other reason to believe that a particular LSN is "interesting" and there will be a lot of requests using it. diff --git a/docs/rfcs/009-snapshot-first-storage.md b/docs/rfcs/009-snapshot-first-storage.md new file mode 100644 index 0000000000..aeef54898a --- /dev/null +++ b/docs/rfcs/009-snapshot-first-storage.md @@ -0,0 +1,148 @@ +# Snapshot-first storage architecture + +Goals: +- Long-term storage of database pages. +- Easy snapshots; simple snapshot and branch management. +- Allow cloud-based snapshot/branch management. +- Allow cloud-centric branching; decouple branch state from the running pageserver. +- Allow customer ownership of data via s3 permissions. +- Provide the same or better performance for typical workloads, vs plain postgres. + +Non-goals: +- Service database reads from s3 (reads should be serviced from the pageserver cache). +- Keep every version of every page / implement point-in-time recovery (possibly a future paid feature, based on WAL replay from an existing snapshot). + +## Principle of operation + +The database “lives in s3”. This means that all of the long-term page storage is in s3, and the “live database” -- the version that lives in the pageserver -- is a set of “dirty pages” that haven’t yet been written back to s3. + +In practice, this is mostly similar to storing frequent snapshots to s3 of a database that lives primarily elsewhere. + +The main difference is that s3 is authoritative about which branches exist; pageservers consume branches, snapshots, and related metadata by reading them from s3. This allows cloud-based management of branches and snapshots, regardless of whether a pageserver is running or not. + +It’s expected that a pageserver should keep a copy of all pages, to shield users from s3 latency. A cheap/slow pageserver that falls back to s3 for some reads would be possible, but doesn’t seem very useful right now.
+
+Because s3 keeps all history, and the safekeeper(s) preserve any WAL records needed to reconstruct the most recent changes, the pageserver can store dirty pages in RAM or using non-durable local storage; this should allow very good write performance, since there is no need for fsync or journaling.
+
+Objects in s3 are immutable snapshots, never to be modified once written (only deleted).
+
+Objects in s3 are files, each containing a set of pages for some branch/relation/segment as of a specific time (LSN). A snapshot could be complete (meaning it has a copy of every page), or it could be incremental (containing only the pages that were modified since the previous snapshot). It’s expected that most snapshots are incremental, to keep storage costs low.
+
+It’s expected that the pageserver would upload new snapshot objects frequently, e.g. somewhere between every 30 seconds and every 15 minutes, depending on the cost/performance balance.
+
+No-longer-needed snapshots can be “squashed”-- meaning snapshot N and snapshot N+1 can be read by some cloud agent software, which writes out a new object containing the combined set of pages (keeping only the newest version of each page) and then deletes the original snapshots.
+
+A pageserver only needs to store the set of pages needed to satisfy operations in flight: if a snapshot is still being written, the pageserver needs to hold historical pages so that the snapshot captures a consistent moment in time (similar to what is needed to satisfy a slow replica).
+
+WAL records can be discarded once a snapshot has been stored to s3. (Unless we want to keep them longer as part of a point-in-time recovery feature.)
+
+## Pageserver operation
+
+To start a pageserver from a stored snapshot, the pageserver downloads a set of snapshots sufficient to start handling requests. We assume this includes the latest copy of every page, though it might be possible to start handling requests early, and retrieve pages for the first time only when needed.
+
+To halt a pageserver, one final snapshot should be written containing all pending WAL updates; then the pageserver and safekeepers can shut down.
+
+It’s assumed there is some cloud management service that ensures only one pageserver is active and servicing writes to a given branch.
+
+The pageserver needs to be able to track whether a given page has been modified since the last snapshot, and should be able to produce the set of dirty pages efficiently to create a new snapshot.
+
+The pageserver need only store pages that are “reachable” from a particular LSN. For example, a page may be written four times, at LSN 100, 200, 300, and 400. If no snapshot is being created when LSN 200 is written, the page at LSN 100 can be discarded. If a snapshot is triggered when the pageserver is at LSN 299, the pageserver must preserve the page from LSN 200 until that snapshot is complete. As before, the page at LSN 300 can be discarded when the LSN 400 page is written (regardless of whether the LSN 200 snapshot has completed).
+
+If the pageserver is servicing multiple branches, those branches may contain common history. While it would be possible to serve branches with zero knowledge of their common history, a pageserver could save a lot of space by using awareness of branch history to share the common set of pages. Computing the “liveness” of a historical page may be tricky in the face of multiple branches.
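+
+To make the “reachability” rule above concrete, here is a minimal sketch of when a superseded page version may be dropped. This is only an illustration of the rule, not actual pageserver code; the `Lsn` alias and `can_discard` are made-up names:
+
+```rust
+type Lsn = u64;
+
+/// A page version written at `version_lsn` and superseded by a newer version
+/// at `next_version_lsn` can be discarded only if no in-progress snapshot
+/// still needs it, i.e. no snapshot LSN `s` falls in [version_lsn, next_version_lsn).
+fn can_discard(version_lsn: Lsn, next_version_lsn: Option<Lsn>, pending_snapshots: &[Lsn]) -> bool {
+    match next_version_lsn {
+        // The newest version must always be kept.
+        None => false,
+        Some(next) => !pending_snapshots.iter().any(|&s| version_lsn <= s && s < next),
+    }
+}
+
+fn main() {
+    // The example from the text: the page is written at LSNs 100, 200, 300,
+    // and 400, and a snapshot is in progress at LSN 299.
+    let pending = [299];
+    assert!(can_discard(100, Some(200), &pending)); // never visible to the snapshot
+    assert!(!can_discard(200, Some(300), &pending)); // the snapshot @299 still needs it
+    assert!(can_discard(300, Some(400), &pending)); // superseded, no snapshot in between
+    assert!(!can_discard(400, None, &pending)); // latest version, always kept
+}
+```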
+
+The pageserver may store dirty pages in memory or on local block storage; any local block storage format is only temporary “overflow” storage, and is not expected to be readable by future software versions.
+
+The pageserver may store clean pages (those that are captured in a snapshot) any way it likes: in memory, in a local filesystem (possibly keeping a local copy of the snapshot file), or using some custom storage format. Reading pages from s3 would be functional, but is expected to be prohibitively slow.
+
+The mechanism for recovery after a pageserver failure is WAL redo. If we find that too slow in some situations (e.g. a write-heavy workload causing a long startup), we can write more frequent snapshots to keep the number of outstanding WAL records low. If that’s still not good enough, we could look at other options (e.g. a redundant pageserver or an EBS page journal).
+
+A read-only pageserver is possible; such a pageserver could be a read-only cache of a specific snapshot, or could auto-update to the latest snapshot on some branch. Either way, no safekeeper is required. Multiple read-only pageservers could exist for a single branch or snapshot.
+
+## Cloud snapshot manager operation
+
+Cloud software may wish to do the following operations (commanded by a user, or based on some pre-programmed policy or other cloud agent):
+- Create/delete/clone/rename a database
+- Create a new branch (possibly from a historical snapshot)
+- Start/stop the pageserver/safekeeper on a branch
+- List databases/branches/snapshots that are visible to this user account
+
+Some metadata operations (e.g. list branches/snapshots of a particular db) could be performed by scanning the contents of a bucket and inspecting the file headers of each snapshot object. This might not be fast enough; it might be necessary to build a metadata service that can respond more quickly to some queries.
+
+This is especially true if there are public databases: there may be many thousands of buckets that are public, and scanning all of them is not a practical strategy for answering metadata queries.
+
+## Snapshot names, deletion and concurrency
+
+There may be race conditions between operations-- in particular, a “squash” operation may replace two snapshot objects (A, B) with some combined object (C). Since C is logically equivalent to B, anything that attempts to access B should be able to seamlessly switch over to C. It’s assumed that a concurrent delete won’t disrupt a read in flight, but it may be possible for some process to read B’s header, and then discover on the next operation that B is gone.
+
+For this reason, any read that fails should attempt a fallback procedure (list objects; search the list for an equivalent object). This requires a predictable naming scheme, e.g. `XXXX_YYYY_ZZZZ_DDDD`, where `XXXX` is the branch unique id, and `YYYY` and `ZZZZ` are the starting/ending LSN values. `DDDD` is a timestamp indicating when the object was created; this is used to disambiguate a series of empty snapshots, or to help a snapshot policy engine understand which snapshots should be kept or discarded.
+
+## Branching
+
+A user may request a new branch from the cloud user interface. There is a sequence of things that need to happen:
+- If the branch is supposed to be based on the latest contents, the pageserver should perform an immediate snapshot. This is the parent snapshot for the new branch.
+- Cloud software should create the new branch by generating a new (random) unique branch identifier and creating a placeholder snapshot object.
+  - The placeholder object is an empty snapshot containing only metadata (which anchors it to the right parent history) and no pages.
+  - The placeholder can be discarded when the first snapshot (containing data) is completed. Discarding is equivalent to squashing, when the snapshot contains no data.
+- If the branch needs to be started immediately, a pageserver should be notified that it needs to start servicing the branch. This may not be the same pageserver that services the parent branch, though the common history may make it the best choice.
+
+Some of these steps could be combined into the pageserver, but that would not be possible in all cases (e.g. if no pageserver is currently running, or if the branch is based on an older snapshot, or if a different pageserver will be serving the new branch). Regardless of which software drives the process, the result should look the same.
+
+## Long-term file format
+
+Snapshot files (and any other object stored in s3) must be readable by future software versions.
+
+It should be possible to build multiple tools (in addition to the pageserver) that can read and write this file format-- for example, to allow cloud snapshot management.
+
+Files should contain the following metadata, in addition to the set of pages:
+- The version of the file format.
+- A unique identifier for this branch (should be worldwide-unique and unchanging).
+- Optionally, any human-readable names assigned to this branch (for management UI/debugging/logging).
+- For incremental snapshots, the identifier of the predecessor snapshot. For new branches, this will be the parent snapshot (the point at which history diverges).
+- The location of the predecessor branch snapshot, if different from this branch’s location.
+- The LSN range `(parent, latest]` for this snapshot. For complete snapshots, the parent LSN can be 0.
+- The UTC timestamp of the snapshot creation (which may be different from the time of its highest LSN, if the database is idle).
+- A SHA2 checksum over the entire file (excluding the checksum itself), to preserve file integrity.
+
+A file may contain no pages, and an empty LSN range (probably `(latest, latest]`?), which serves as a placeholder for either a newly-created branch, or a snapshot of an idle database.
+
+Any human-readable names stored in the file may fall out of date if database/branch renames are allowed; there may need to be a cloud metadata service to query (current name -> unique identifier). We may choose instead to not store human-readable names in the database, or treat them as debugging information only.
+
+## S3 semantics, and other kinds of storage
+
+For development and testing, it may be easier to use other kinds of storage in place of s3. For example, a directory full of files can substitute for an s3 bucket with multiple objects. This mode is expected to match the s3 semantics (e.g. don’t edit existing files or use symlinks). Unit tests may omit files entirely and use an in-memory mock bucket.
+
+Some users may want to use a local or network filesystem in place of s3. This isn’t prohibited, but it’s not a priority, either.
+
+Alternate implementations of s3 should be supported, including Google Cloud Storage.
+
+Azure Blob Storage should be supported. We assume (without evidence) that it’s semantically equivalent to s3 for this purpose.
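+
+To make that substitutability concrete, here is a minimal sketch of the storage interface this design seems to imply. The trait name and signatures are hypothetical (nothing like this exists yet); the methods simply mirror the s3 properties listed in the next paragraph, so that s3, GCS, Azure Blob Storage, a directory of files, or an in-memory mock could all back the same code:
+
+```rust
+use std::io::{Read, Write};
+use std::ops::Range;
+
+/// Hypothetical abstraction over "a bucket of immutable snapshot objects".
+trait SnapshotStore {
+    type Error;
+    /// List object names, e.g. to enumerate the snapshots of a branch.
+    fn list(&self, prefix: &str) -> Result<Vec<String>, Self::Error>;
+    /// Streaming read of an entire object.
+    fn read(&self, name: &str) -> Result<Box<dyn Read>, Self::Error>;
+    /// Read a byte range, e.g. just a snapshot file's metadata header.
+    fn read_range(&self, name: &str, range: Range<u64>) -> Result<Vec<u8>, Self::Error>;
+    /// Streaming write of a new object; existing objects are never edited.
+    fn write(&self, name: &str) -> Result<Box<dyn Write>, Self::Error>;
+    /// Delete an object; must not disrupt an already-started read.
+    fn delete(&self, name: &str) -> Result<(), Self::Error>;
+}
+```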
+
+The properties of s3 that we depend on are:
+- list objects
+- streaming read of an entire object
+- read a byte range from an object
+- streaming write of a new object (may use multipart upload for better reliability)
+- delete an object (which should not disrupt an already-started read).
+
+Uploaded files, restored backups, or s3 buckets controlled by users could contain malicious content. We should always validate that objects contain the content they’re supposed to. Incorrect, corrupt, or malicious-looking contents should cause software (cloud tools, pageserver) to fail gracefully.
+
+## Notes
+
+Possible simplifications, for a first draft implementation:
+- Assume that dirty pages fit in pageserver RAM. Can use kernel virtual memory to page out to disk if needed. Can improve this later.
+- Don’t worry about the details of the squashing process yet.
+- Don’t implement a cloud metadata service; try to make everything work using basic s3 list-objects and reads.
+- Don’t implement rename or delete at first.
+- Don’t implement public/private, just use s3 permissions.
+- Don’t worry about sharing history yet-- each user has their own bucket and a full copy of all data.
+- Don’t worry about history that spans multiple buckets.
+- Don’t worry about s3 regions.
+- Don’t support user-writeable s3 buckets; users get read-only access at most.
+
+Open questions:
+- How important is point-in-time recovery? When should we add this? How should it work?
+- Should snapshot files use compression?
+- Should we use snapshots for async replication? A spare pageserver could stay mostly warmed up by consuming snapshots as they’re created.
+- Should manual snapshots, or snapshots triggered by branch creation, be named differently from snapshots that are triggered by a snapshot policy?
+- When a new branch is created, should it always be served by the same pageserver that owns its parent branch? When should we start a new pageserver?
+- How can a pageserver software upgrade be done with minimal downtime?
diff --git a/docs/rfcs/010-storage_details.md b/docs/rfcs/010-storage_details.md
new file mode 100644
index 0000000000..8429a2d9e3
--- /dev/null
+++ b/docs/rfcs/010-storage_details.md
@@ -0,0 +1,144 @@
+# Storage details
+
+Here I tried to describe the current state of thinking about our storage subsystem as I understand it. Feel free to correct me. Also, I tried to address items from Heikki's TODO and be specific on some of the details.
+
+## Overview
+
+![storage](images/storage.jpeg)
+
+### MemStore
+
+MemStore holds the data between `latest_snapshot_lsn` and `latest_lsn`. It consists of a PageIndex that holds references to WAL records or pages, a PageStore that stores recently materialized pages, and a WalStore that stores recently received WAL.
+
+### PageIndex
+
+PageIndex is an ordered collection that maps `(BufferTag, LSN)` to one of the following references (by reference I mean whatever information is needed to access that data, e.g. file_id and offset):
+
+* PageStoreRef -- page offset in the PageStore
+* LocalStoreRef -- snapshot_id and page offset inside of that snapshot
+* WalStoreRef -- offset (and optionally size) of a WalRecord in WalStore
+
+PageIndex holds information about all the pages in all incremental snapshots and in the latest full snapshot. If we aren't using page compression inside snapshots, we can actually avoid storing references to the full snapshot and calculate page offsets based on the relation size metadata in the full snapshot (assuming that the full snapshot stores pages sorted by page number).
However, I would suggest embracing page compression from the beginning and treating all pages as variable-sized.
+
+We assume that PageIndex is a few orders of magnitude smaller than the data it addresses, hence it should fit in memory. We also don't care about crash tolerance, as we can rebuild it from the snapshot metadata and the WAL records in WalStore and/or the Safekeeper.
+
+### WalStore
+
+WalStore is a queue of recent WalRecords. I imagine that we can store recent WAL the same way as Postgres does -- as 16MB files on disk. On top of that, we can add some fixed-size cache that would keep some amount of segments in memory.
+
+For now, we may rely on the Safekeeper to safely store that recent WAL. But generally, I think we can pack all S3 operations into the page server so that it would also be responsible for the recent WAL pushdown to S3 (and the Safekeeper may just delete WAL that was confirmed as S3-durable by the page server).
+
+### PageStore
+
+PageStore is storage for recently materialized pages (in other words, a cache of getPage results). It can also be implemented as a file-based queue with some memory cache on top of it.
+
+There are a few possible options for PageStore:
+
+a) we just add all recently materialized pages there (so several versions of the same page can be stored there) -- that is more or less how it happens now with the current RocksDB implementation.
+
+b) overwrite older pages with the newer pages -- if there is no replica we probably don't need older pages. During a page overwrite, we would also need to change the PageStoreRef back to a WalStoreRef in PageIndex.
+
+I imagine that newly created pages would just be added to the back of PageStore (again in a queue-like fashion), and this way there wouldn't be any meaningful ordering inside that queue. When we are forming a new incremental snapshot, we may prohibit any updates to the current set of pages in PageStore (giving up on the single-page-version rule) and cut off that whole set when snapshot creation is complete.
+
+With option b) we can also treat PageStore as an incomplete incremental snapshot.
+
+### LocalStore
+
+LocalStore keeps the latest full snapshot and the set of incremental snapshots on top of it. We add new snapshots when the number of changed pages grows bigger than a certain threshold.
+
+## Granularity
+
+By granularity, I mean the set of pages that goes into a certain full snapshot. The following things should be taken into account:
+
+* can we shard big databases between page servers?
+* how much time will we spend applying WAL to access certain pages at older LSNs?
+* how many files do we create for a single database?
+
+I can think of the following options here:
+
+1. the whole database goes into one full snapshot.
+    * +: we never create a lot of files for one database
+    * +: the approach is quite straightforward, moving data around is simple
+    * -: can not be sharded
+    * -: long recovery -- we always need to recover the whole database
+2. the table segment is the unit of snapshotting
+    * +: straightforward for sharding
+    * +: an individual segment can be quickly recovered with sliced WAL
+    * -: a full snapshot can be really small (e.g. when the corresponding segment consists of a single page) and we can blow up the number of files. Then we would spend an eternity in directory scans, and the amount of metadata for sharding can also be quite big.
+3. range-partitioned snapshots -- a snapshot includes all pages between [BuffTagLo, BuffTagHi], mixing different relations, databases, and potentially clusters (albeit from one tenant only).
When a full snapshot outgrows a certain limit (which could be a few gigabytes), we split the snapshot in two during the next full snapshot write. That approach would also require pages sorted by BuffTag inside our snapshots.
+    * +: addresses all mentioned issues
+    * -: harder to implement
+
+I think it is okay to start with table-segment granularity, check how we will perform in the case of lots of small tables, and see whether there is any way besides option 3 to deal with it.
+
+Both PageStore and WalStore should be "sharded" by this granularity level.
+
+## Security
+
+We can generate different IAM keys for each tenant and potentially share them with users (in read-only mode?) or even allow users to provide their own S3 bucket credentials.
+
+Also, S3 backups are usually encrypted with per-tenant private keys. I'm not sure in what threat model such encryption would improve anything (taking into account the per-tenant IAM keys), but it seems that everybody is doing it (both AMZN and YNDX). Most likely that comes as a requirement on "cold backups" from some certification procedure.
+
+## Dynamics
+
+### WAL stream handling
+
+When a new WAL record is received, we need to parse the BufferTags in that record and insert them into PageIndex with a WalStoreRef as the value.
+
+### getPage queries
+
+Look up the page in PageIndex. If the value is a page reference, then just respond with that page. If the referenced value is a WAL record, then find the most recent page with the same BuffTag (that is why we need ordering in PageIndex); recover it by applying WAL records; save it in PageStore; respond with that page (a sketch of this lookup appears below).
+
+### Starting page server without local data
+
+* build the set of the latest full snapshots and the incremental snapshots on top of them
+* load all their metadata into PageIndex
+* the Safekeeper should connect soon, and we can ask for a WAL stream starting from the latest incremental snapshot
+* for databases that are connected to us through the Safekeeper, we can start loading the set of the latest snapshots, or we can do that lazily based on getPage requests (I'd rather avoid doing that lazily for now, without access stats from a previous run, and just transfer all data for an active database from S3 to LocalStore).
+
+### Starting page server with local data (aka restart or reboot)
+
+* check that local snapshot files are consistent with S3
+
+### Snapshot creation
+
+Track the size of future snapshots based on the info in MemStore, and when it exceeds some threshold (taking into account our granularity level), create a new incremental snapshot. Always emit incremental snapshots from MemStore.
+
+To create a new snapshot, we need to walk through WalStore to get the list of all changed pages, sort it, and get the latest versions of those pages from PageStore or by WAL replay. It makes sense to maintain that set in memory while we are receiving the WAL stream, to avoid parsing WAL during snapshot creation.
+
+Full snapshot creation can be done by GC (or we can call that entity differently -- e.g. merger?) by merging the previous full snapshot with several incremental snapshots.
+
+### S3 pushdown
+
+When we have several full snapshots, GC can push the old one with its increments to S3.
+
+### Branch creation
+
+Create a new timeline and replay the sliced WAL up to the requested point. When a page is not in PageIndex, ask the parent timeline for it. Relation sizes are tricky.
+
+## File formats
+
+As far as I understand, Bookfile/Aversion addresses the versioning and serialization parts.
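+
+As an aside, here is a minimal sketch of the getPage lookup described under "Dynamics" above, mainly to show why PageIndex must be ordered by `(BufferTag, LSN)`. Everything here is illustrative -- the types are stand-ins for the real ones, and WAL redo is elided:
+
+```rust
+use std::collections::BTreeMap;
+
+type Lsn = u64;
+type BufferTag = u32; // stand-in for the real (relation, fork, block number) tag
+
+enum PageRef {
+    Page(Vec<u8>),      // PageStoreRef/LocalStoreRef: a materialized page image
+    WalRecord(Vec<u8>), // WalStoreRef: a WAL record to apply on top of an older page
+}
+
+struct PageIndex {
+    entries: BTreeMap<(BufferTag, Lsn), PageRef>,
+}
+
+impl PageIndex {
+    /// Walk the entries for `tag` backwards from `lsn`, collecting WAL records
+    /// until a materialized page is found, then replay them oldest-first.
+    fn get_page(&self, tag: BufferTag, lsn: Lsn) -> Option<Vec<u8>> {
+        let mut wal = Vec::new();
+        for (_key, entry) in self.entries.range((tag, 0)..=(tag, lsn)).rev() {
+            match entry {
+                PageRef::Page(img) => {
+                    let mut page = img.clone();
+                    for rec in wal.iter().rev() {
+                        apply_wal_record(&mut page, rec);
+                    }
+                    return Some(page);
+                }
+                PageRef::WalRecord(rec) => wal.push(rec.clone()),
+            }
+        }
+        None // no page version at or before `lsn`
+    }
+}
+
+fn apply_wal_record(_page: &mut [u8], _record: &[u8]) {
+    // WAL redo would happen here.
+}
+
+fn main() {
+    let mut index = PageIndex { entries: BTreeMap::new() };
+    index.entries.insert((7, 100), PageRef::Page(vec![0u8; 8192]));
+    index.entries.insert((7, 150), PageRef::WalRecord(vec![1, 2, 3]));
+    // Reconstructing page 7 at LSN 180 finds the image @100 and replays the record @150.
+    assert!(index.get_page(7, 180).is_some());
+}
+```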
+
+As for the exact data that should go into the snapshots, I think it is the following for each snapshot:
+
+* format version number
+* a set of key/values needed to interpret the content (e.g. is page compression enabled, is this a full or an incremental snapshot, the previous snapshot id, is there WAL at the end of the file, etc.) -- it is up to the reader to decide what to do if some keys are missing or some unknown keys are present. If we add something backward compatible to the file, we can keep the version number.
+* an array of [BuffTag, corresponding offset in file] for the pages -- IIUC that is analogous to the ToC in Bookfile
+* an array of [(BuffTag, LSN), corresponding offset in file] for the WAL records
+* the pages, one by one
+* the WAL records, one by one
+
+It is also important to be able to load the metadata quickly, since it would be one of the main factors impacting the time of page server start. E.g. if we store/cache about 10TB of data per page server, the size of the uncompressed page references would be about 30GB (10TB / (8192-byte page size / (~18 bytes per ObjectTag + 8 bytes of offset in the file))).
+
+1) Since our ToC/array of entries can be sorted by ObjectTag, we can store the whole BufferTag only when relation_id changes, and store only delta-encoded offsets for a given relation. That would reduce the average per-page metadata size to something less than 4 bytes instead of 26 (assuming that pages follow the same order and the offset deltas are small).
+2) It makes sense to keep the ToC at the beginning of the file to avoid extra seeks to locate it. That doesn't matter too much with local files, but it matters on S3 -- if we are accessing a lot of ~1GB files with ~1MB of metadata each, then the time to transfer this metadata would be comparable to the access latency itself (which is about half a second). So by slurping the metadata with one read of the file header instead of N reads, we can improve the speed of page server start by this factor of N.
+
+I think both of these optimizations can be done later, but that is something to keep in mind when we are designing our storage serialization routines.
+
+Also, there were some discussions about how to embed WAL in incremental snapshots. So far the following ideas were mentioned:
+1. snapshot lsn=200, includes WAL in range 200-300
+2. snapshot lsn=200, includes WAL in range 100-200
+3. data snapshots are separated from WAL snapshots
+
+Both options 2 and 3 look good. I'm inclined towards option 3, as it would allow us to apply different S3 pushdown strategies for data and WAL files (e.g. we may keep data snapshots until the next full snapshot, but push WAL snapshots to S3 as soon as they appear if there are no replicas).
diff --git a/docs/rfcs/011-retention-policy.md b/docs/rfcs/011-retention-policy.md
new file mode 100644
index 0000000000..fde36c8108
--- /dev/null
+++ b/docs/rfcs/011-retention-policy.md
@@ -0,0 +1,91 @@
+# User-visible timeline history
+
+The user can specify a retention policy. The retention policy is
+presented to the user as a PITR period and snapshots. The PITR period
+is the amount of recent history that needs to be retained, as minutes,
+hours, or days. Within that period, you can create a branch or
+snapshot at any point in time, open a compute node, and start running
+queries. Internally, a PITR period is represented as a range of LSNs.
+
+The user can also create snapshots. A snapshot is a point in time,
+internally represented by an LSN. The user gives the snapshot a name.
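+
+As a minimal sketch of that internal representation (illustrative names and types, not anything implemented): the PITR period is an LSN range, the snapshots are named LSN points, and together they decide which page versions must remain reconstructible:
+
+```rust
+use std::collections::BTreeMap;
+use std::ops::Range;
+
+type Lsn = u64;
+
+struct RetentionPolicy {
+    /// PITR period: every page version in this LSN range must stay reconstructible.
+    pitr: Range<Lsn>,
+    /// User-visible snapshots: name -> the LSN the snapshot was taken at.
+    snapshots: BTreeMap<String, Lsn>,
+}
+
+impl RetentionPolicy {
+    /// A page version written at `lsn` and superseded at `next_lsn` is still
+    /// needed if the PITR range overlaps its lifetime, or if any snapshot LSN
+    /// falls within its lifetime.
+    fn must_retain(&self, lsn: Lsn, next_lsn: Lsn) -> bool {
+        let live = lsn..next_lsn;
+        (live.start < self.pitr.end && self.pitr.start < live.end)
+            || self.snapshots.values().any(|&s| live.contains(&s))
+    }
+}
+
+fn main() {
+    let policy = RetentionPolicy {
+        pitr: 150..300,
+        snapshots: BTreeMap::from([("monday".to_string(), 100)]),
+    };
+    assert!(policy.must_retain(100, 120));  // the "monday" snapshot needs it
+    assert!(policy.must_retain(200, 250));  // inside the PITR period
+    assert!(!policy.must_retain(120, 140)); // covered by nothing, can be GC'ed
+}
+```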
+
+The user can also specify an interval at which the system creates
+snapshots automatically. For example, create a snapshot every night at
+2 AM. After some user-specified time, old automatically created
+snapshots are removed.
+
+                 Snapshot      Snapshot
+    PITR         "Monday"      "Tuesday"         PITR
+ ----######----------+-------------+-------------######>
+
+If there are multiple branches, you can specify different policies for
+different branches.
+
+The PITR period and user-visible snapshots together define the
+retention policy.
+
+NOTE: As presented here, this is probably overly flexible. In reality,
+we want to keep the user interface simple. Only allow a PITR period at
+the tip of a branch, for example. But that doesn't make much
+difference to the internals.
+
+
+# Retention policy behind the scenes
+
+The retention policy consists of points (for snapshots) and ranges
+(for PITR periods).
+
+The system must be able to reconstruct any page within the retention
+policy. Other page versions can be garbage collected away. We have a
+lot of flexibility on when to perform the garbage collection and how
+aggressive it is.
+
+
+# Base images and WAL slices
+
+The page versions are stored in two kinds of files: base images and
+WAL slices. A base image contains a dump of all the pages of one
+relation at a specific LSN. A WAL slice contains all the WAL in an LSN
+range.
+
+
+  |
+  |
+  |
+  | --Base img @100   +
+  |        |
+  |        | WAL slice
+  |        | 100-200
+  |        |
+  | --Base img @200   +
+  |        |
+  |        | WAL slice
+  |        | 200-300
+  |        |
+  |        +
+  |
+  V
+
+
+To recover a page e.g. at LSN 150, you need the base image at LSN 100,
+and the WAL slice 100-200.
+
+All of this works on a per-relation or per-relation-segment basis. If
+a relation is updated very frequently, we create base images and WAL
+slices for it more quickly. For a relation that's updated
+infrequently, we hold the recent WAL for that relation longer, and
+only write it out when we need to release the disk space occupied by
+the original WAL. (We need a backstop like that, because until all the
+WAL/base images have been durably copied to S3, we must keep the
+original WAL for that period somewhere, in the WAL service or in S3.)
+
+
+# Branching
+
+Internally, branch points are also "retention points", in addition to
+the user-visible snapshots. If a branch has been forked off at LSN
+100, we need to be able to reconstruct any page on the parent branch
+at that LSN, because it is needed by the child branch. If a page is
+modified in the child, we don't need to keep that in the parent
+anymore, though.
diff --git a/docs/rfcs/012-background-tasks.md b/docs/rfcs/012-background-tasks.md
new file mode 100644
index 0000000000..8692b187e6
--- /dev/null
+++ b/docs/rfcs/012-background-tasks.md
@@ -0,0 +1,38 @@
+# Eviction
+
+Write out an in-memory layer to disk, into a delta layer.
+
+- To release memory
+- To make it possible to advance disk_consistent_lsn and allow the WAL
+  service to release some WAL.
+
+- Triggered if we are short on memory
+- Or if the oldest in-memory layer is so old that it's holding back
+  the WAL service from removing old WAL
+
+# Materialization
+
+Create a new image layer of a segment, by performing WAL redo.
+
+- To reduce the amount of WAL that needs to be replayed on a GetPage request.
+- To allow garbage collection of old layers
+
+- Triggered by the distance to the last full image of a page
+
+# Coalescing
+
+Replace N consecutive layers of a segment with one larger layer.
+
+- To reduce the number of small files that need to be uploaded to S3
+
+
+# Bundling
+
+Zip together multiple small files belonging to different segments.
+
+- To reduce the number of small files that need to be uploaded to S3
+
+
+# Garbage collection
+
+Remove a layer that's older than the GC horizon, and isn't needed anymore.
diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md
new file mode 100644
index 0000000000..0c359028ed
--- /dev/null
+++ b/docs/rfcs/013-term-history.md
@@ -0,0 +1,147 @@
+# What
+
+Currently, apart from the WAL, a safekeeper persistently stores only two
+logical clock counter (aka term) values, sourced from the same sequence. The
+first is bumped whenever the safekeeper gives its vote to a proposer (or
+acknowledges an already elected one) and e.g. prevents electing two proposers
+with the same term -- it is actually called `term` in the code. The second,
+called `epoch`, reflects the progress of log receipt and might lag behind
+`term`; a safekeeper switches to epoch `n` when it has received all committed
+log records from all terms `< n`. This roughly corresponds to what is proposed
+in
+
+https://github.com/zenithdb/rfcs/pull/3/files
+
+
+This is our biggest difference from Raft. In Raft, every log record is
+stamped with the term in which it was generated, while we essentially store in
+`epoch` only the term of the highest record on this safekeeper -- when we know
+it -- because during recovery we generally don't, and `epoch` is bumped directly
+to the term of the proposer who performs the recovery when it is finished. It is
+not immediately obvious that this simplification is safe. I thought and I still
+think it is; model checking confirmed that. However, some details now make me
+believe it is better to keep the full term switching history (which is
+equivalent to knowing the term of each record).
+
+# Why
+
+Without knowing the full history of terms (a list of (term, LSN) pairs) it is
+hard to determine the exact divergence point, and if we don't perform
+truncation at that point, safety becomes questionable. Consider the following
+history, with safekeepers A, B, C, D, E. n.m means a record created by the
+proposer in term n with LSN m; (t=x, e=y) means the safekeeper currently has
+term x and epoch y.
+
+1) P1 in term 1 writes 1.1 everywhere, which is committed, and some more only
+on A.
+
+<pre>
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=1, e=1) 1.1
+D(t=1, e=1) 1.1
+E(t=1, e=1) 1.1
+
+</pre>
+
+2) P2 is elected by CDE in term 2, epochStartLsn is 2, and writes 2.2, 2.3 on CD:
+
+<pre>
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=2, e=2) 1.1 2.2 2.3
+D(t=2, e=2) 1.1 2.2 2.3
+E(t=2, e=1) 1.1
+
+</pre>
+
+
+3) P3 is elected by CDE in term 3, epochStartLsn is 4, and writes 3.4 on D:
+
+<pre>
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=3, e=2) 1.1 2.2 2.3
+D(t=3, e=3) 1.1 2.2 2.3 3.4
+E(t=3, e=1) 1.1
+
+</pre>
+
+
+Now, A gets back and P3 starts recovering it. How should it proceed? There are
+two options.
+
+## Don't try to find divergence point at all
+
+...start sending WAL conservatively from the horizon (1.1), and truncate the
+obsolete part of the WAL only when recovery is finished, i.e. when
+epochStartLsn (4) is reached, i.e. 2.3 is transferred -- that's what
+https://github.com/zenithdb/zenith/pull/505 proposes.
+
+Then the following is possible:
+
+4) P3 moves one record 2.2 to A.
+
+<pre>
+A(t=1, e=1) 1.1 2.2 1.3 1.4
+B(t=1, e=1) 1.1 1.2
+C(t=3, e=2) 1.1 2.2 2.3
+D(t=3, e=3) 1.1 2.2 2.3 3.4
+E(t=3, e=1) 1.1
+
+</pre>
+
+Now the log of A is basically corrupted. Moreover, since ABE are all in epoch 1
+and A's log is the longest one, they can elect P4, who will commit such a log.
+
+Note that this particular history couldn't happen if we forbade *creating* new
+records in term n until a majority of safekeepers has switched to it. That
+would force CDE to switch to 2 before 2.2 is created, and A could never become
+the donor while its log is corrupted. Generally, with this additional barrier I
+believe the algorithm becomes safe, but
+ - I don't like this kind of artificial barrier;
+ - I also feel somewhat uncomfortable about even temporarily having
+   intentionally corrupted WAL;
+ - I'd still model check the idea.
+
+## Find divergence point and truncate at it
+
+Then step 4 would delete 1.3 and 1.4 on A, and we are ok. The question is, how
+do we do that? Without the term switching history we have to resort to sending
+everything again from the horizon and memcmp'ing records, which is inefficient
+and ugly. Or we can maintain the full history and determine the truncation
+point by comparing the 'wrong' and 'right' histories -- much like pg_rewind
+does -- and perform the truncation + start streaming right there.
+
+# Proposal
+
+- Add the term history, as an array of (term, LSN) pairs, to the safekeeper
+  controlfile.
+- Return it to the proposer with VoteResponse so 1) the proposer can tell it to
+  other nodes and 2) determine the personal streaming starting point. However,
+  since we don't append WAL and update the controlfile atomically, let's first
+  always update the controlfile but send only the history of what we really
+  have (up to the highest term in the history where begin_lsn >= end of wal;
+  this highest term replaces the current `epoch`). We also send the end of wal
+  as we do now, to determine the donor.
+- Create a ProposerAnnouncement message which the proposer sends before
+  starting streaming. It announces the proposer as elected and
+  1) Truncates the wrong part of the WAL on the safekeeper
+     (the divergence point is already calculated at the proposer, but can be
+     cross-verified here).
+  2) Communicates the 'right' history of its term (taken from the donor). It
+     seems better to immediately put the history in the controlfile, though the
+     safekeeper might not have the full WAL for previous terms in it -- this
+     way is simpler, and we can't update the WAL and controlfile atomically
+     anyway.
+
+  This also constitutes an analogue of the current epoch bump for those
+  safekeepers which don't need recovery, which is important for
+  sync-safekeepers (bump the epoch without waiting for records from the new
+  term).
+- After the ProposerAnnouncement, the proposer streams WAL from the calculated
+  starting point -- only what is missing.
+
+
+pros/cons:
++ (more) clear safety of WAL truncation -- we get very close to Raft
++ no unnecessary data sending (faster recovery for not-oldest safekeepers,
+  matters only for 5+ nodes)
++ adds some observability at safekeepers
+
+- complexity, but not that much
+
+
+# Misc
+
+- During model checking I did truncation on the first locally non-existent or
+  different record -- an analogue of the 'memcmp' variant described above.
diff --git a/docs/rfcs/README.md b/docs/rfcs/README.md
new file mode 100644
index 0000000000..fdf6885929
--- /dev/null
+++ b/docs/rfcs/README.md
@@ -0,0 +1,95 @@
+This directory contains Request for Comments documents, or RFCs, for
+features or concepts that have been proposed. Alternative names:
+technical design doc, ERD, one-pager.
+
+To make a new proposal, create a new text file in this directory and
+open a Pull Request with it. That gives others a chance and a forum
+to comment and discuss the design.
+
+When a feature is implemented and the code changes are committed, also
+include the corresponding RFC in this directory.
+
+Some of the RFCs in this directory have been implemented in some form
+or another, while others are on the roadmap, and still others are just
+obsolete and forgotten about. So read them with a grain of salt;
+hopefully even the ones that don't reflect reality give useful
+context information.
+
+## What
+
+We use Tech Design RFCs to summarize what we are planning to
+implement in our system. These RFCs should be created for large or
+non-obvious technical tasks, e.g. changes to the architecture, bigger
+tasks that could take over a week, or changes that touch multiple
+components or their interaction. RFCs should fit into a couple of
+pages, but could be longer on occasion.
+
+## Why
+
+We're using RFCs to enable early review and collaboration, to reduce
+uncertainty and risk, and to save time during the implementation phase
+that follows the Tech Design RFC.
+
+Tech Design RFCs also aim to avoid the bus factor and are an additional
+measure to keep more peers up to date & familiar with our design and
+architecture.
+
+This is a crucial part of ensuring collaboration across timezones and
+of setting up for success a distributed team that works on complex
+topics.
+
+## Prior art
+
+- Rust: [https://github.com/rust-lang/rfcs/blob/master/0000-template.md](https://github.com/rust-lang/rfcs/blob/master/0000-template.md)
+- React.js: [https://github.com/reactjs/rfcs/blob/main/0000-template.md](https://github.com/reactjs/rfcs/blob/main/0000-template.md)
+- Google fuchsia: [https://fuchsia.dev/fuchsia-src/contribute/governance/rfcs/TEMPLATE](https://fuchsia.dev/fuchsia-src/contribute/governance/rfcs/TEMPLATE)
+- Apache: [https://cwiki.apache.org/confluence/display/GEODE/RFC+Template](https://cwiki.apache.org/confluence/display/GEODE/RFC+Template) / [https://cwiki.apache.org/confluence/display/GEODE/Lightweight+RFC+Process](https://cwiki.apache.org/confluence/display/GEODE/Lightweight+RFC+Process)
+
+## How
+
+RFC lifecycle:
+
+- An RFC should be submitted in a pull request, with the full RFC text
+  in a committed markdown file and a copy of the Summary and Motivation
+  sections included in the PR body.
+- An RFC should be published for review before most of the actual code
+  is written. This isn't a strict rule; don't hesitate to experiment
+  and build a POC in parallel with writing the RFC.
+- Add labels to the PR in the same manner as you do for Issues. Example TBD
+- Request review from your peers. Reviewing your peers' RFCs is a
+  priority, the same as reviewing the actual code.
+- The Tech Design RFC should evolve based on the feedback received, and
+  further during the development phase if problems are discovered with
+  the chosen approach.
+- RFCs stop evolving once consensus is found or the proposal is
+  implemented and merged.
+- RFCs are not intended as documentation that's kept up to date
+  **after** the implementation is finished. Do not update the Tech
+  Design RFC when the merged functionality evolves later on. In such a
+  situation a new RFC may be appropriate.
+
+### RFC template
+
+Note: a lot of the sections are marked as 'if relevant'. They are
+included in the template as a reminder and to help inspiration.
+
+```
+# Name
+Created on ..
+Implemented on ..
+
+## Summary
+
+## Motivation
+
+## Non Goals (if relevant)
+
+## Impacted components (e.g. pageserver, safekeeper, console, etc)
+
+## Proposed implementation
+
+### Reliability, failure modes and corner cases (if relevant)
+
+### Interaction/Sequence diagram (if relevant)
+
+### Scalability (if relevant)
+
+### Security implications (if relevant)
+
+### Unresolved questions (if relevant)
+
+## Alternative implementation (if relevant)
+
+## Pros/cons of proposed approaches (if relevant)
+
+## Definition of Done (if relevant)
+
+```
diff --git a/docs/rfcs/images/storage.jpeg b/docs/rfcs/images/storage.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..1d72a018dc462a74ad01bb17561c98efd0745bca
GIT binary patch
literal 431075
[base85-encoded binary image data omitted]
zg7UF3b6no;ZjF7UZM=%t_od07ck%p2Xb0VA}mhQMod%VfWoEc>sgcwTtAN|(|Mo)$M z&cW9ZCnAvY3V(VsV7owdUJK*VkLU1pZw3dXUMM|Y9NGH(bQkzmKO2QMF;b!xGQ`#x z?4L=5ItizH|88(sCr*eO2a+J*Ld?df!cBK@9YfET%j6+Fd;b z3*=}#HKgy;T}{;$rNheHME*JU3qi$hc#<_+H=24Lmx=>Wy7c4Jg$Alk$%GE4s$oLY zs-J{Xa~hb9oEhIts*yJlzVzzSl?kTfBp@;p8d}n5QyA*2cLyF4Q#1zi0-s8SbUHTjCkrgr6u(;wXco5 z&!u1F+G)4;CZ1ugOifj$<3c7E;yymT6Riv<2hF)JHm7czU`HeR(=bYsUyjFdMk<;6 zx*a%3JgYiRd1->dm%%2A65Mz$6q92(wQrkYGm-eX< zw?dTgwQV_k$^Damvd`44eZ$UYGqU@t+$h^KY%fC&dU+&BAfxih#8ekSDk^a?QuOHS z-|q<9cPB1{YkXArah(5Rtf#_Zm9S3MsG}CW1q#qxE`+%-x=ov?IWb12&zJ2qW;^|h z?2rC%_$Z^MEVm!1T_!CaVU0q49sT93)a`hvkUY%VB_XACx+&@H+bN+rN#2upOEW*1Xr@+GmbHNqY51OJG=>N=xzW~T~+EDj3^5*1>8e&i}xyDK%Yj6|k~VA}tVFo~bA$iw91Kbc0LH+jzVDV_ACrt%)DJX+?-)oW~ldWEY3_!io3N4sB9U zWfWJwV*xsq7E*Z*Gfz&pIca3eH^4^M&M}68AnZ(Go#;+=usWw7lG=bH3!=oMe@GR6 zAOvok7QC1IGLq~?R43}MrWqSr&4_IHBI{eaIY;8*eTCHQof?NiJ3Yb-(QR(&1Q*sd zXKCP3xc+?G;BR*U{}E7$K-aPBnL{QJsV!)-$hL4YYx)Ly1<}0rbE&{pYiRq7 z{YYVIo7Mcg@D5-TJyS+1qiYd5BjIEOU4cN&z+GvY!};u_uM~t921D}Ea?3zVP=3qW zqty|y4&YD2Mh*l9UWi%wMbv9rC_}`R4yE=vUmpS#LDbZpnt`8cPhxh0W=rM^RYT`R z-|6@Hz=?r4FEl3@vvIk0C#T)Bnn?87nAM5PzW%~qW%knP0G9*&9=7X3JfQWeVc!w` zB1DRjIg}WDGZrk>@1l&JH+c;m_H*R%IQ7fdJ)bACOMNWL038RN&omw4J3Zn#y~tz{ zhJ1Z8*Opgq8P2;h1s(fE^Gz?}T6yZ%t|-(FwLi`XO(GzC7X+x)*+I{I#VuPMXJ+QA zKadkmnC{RAK0V6L@_~Os%vLbiOUcMtwthWVfp+=Da3fg?_Zd~cYXg@LRjDi3aMZK# zqzYuPQaD-rLZYD%5$RM16+ZWOrzg6Ig2@zIDjyNaxjYroqpK^fkv@{Q;BjB5H2kdK zJ%Fqg(_`( z$yET?DPGM}cCdg4{3ibG-Jc{WtCMQl=3mE5l^qZ-Pi|SW)nMP@s1`M`y8xS!mHT3i zbl8){e#eDem5=rv8~(mmf;1mh#;Ls+q;j~A4MTrpTncUNh+iPt-?sTbT*pSNs+DF zFY^|B<_2FjJYo{|@mNlS$-k7z=!iZBfbu=i{WN)zR&ua_3;ddVdFYJwa^3;o5~&z- z$>K}fcU? zq@(Q!{B^?rI^lnv@c$QWRm;e#+|>swKG{^IeyY&^_4tZ+Yx8(fK)s8Frane51XdoI_EEtn`?Ngu9rQ_d9Ltwy(3XVt*i2VN$mawl<7iZ zf)NSu1lblhdxL}+6DVy^z8V9Y`h{i#gRJx%aVcf%*x~19;xfe!C47f7PJcdbHe{6Z z2Cdt{#u#-)VpZt-!DK42OPXn?u!$)B%|&AZJ5b>_#cHK)Zl$R+cr|s>%r5+pEsi0LPO|P$z4bL4GLs9Bpi`!fvjA=b+-K6-T-=Bi7*V*B^Oa*v(2cNdk5f*c;sO-c=2 zr-rVK2EA898N9d^LNl%_JQvlwv{N|z{qvu&b@cU?AtGU^Zx#qzb*F7pYC4=%yIP2x zSeU=7z|yqN3GdrH`AgZJqCoV*#q+$EvVOxuHV=NDJ6ZZ9(*&9|M^$(+dP`TA-yBDo`f})@)BhzZS5izZ0J&#=i9Enzu z>Q?5tq&s6K{&Dv1p(-2-7{qIptgudCt&A4<8qxK~=oWq-A=}XsSnlrjm)XAH%!$)m zk*!PD!u@DSe?8X(T-yce34K-xI7x1GH+E^WnD>m zo!1%7->C^lkrldnHRQz}I$-KU?@r&y*&b^JgzxjJ5Fn3qII4%~% zM+3~Zh0yA|gq;>40;M2ZRX^#r^MN3D+WYlWPqXrG+rJO`acD-J=`O8@E*_bN>&?!6k4Kjh=V2W!nE4yt8>SN*vjQ>l;Z#!tjzaVcle0r==?; z@WsuF&a&ZWzE93?Pz}t64G(I(`4o66<;pd2Yjct+5W4jxS!YL!98$bkfn;`F_#TqS z+}eneFh%+md$ z9<&I?4n&h{5>SR<w~1@JXlhN~zx1ODAu(sX5#1|^NY zbUym=_Y8UL$n08oz7_lO^_g#W5s4Yq6%yLTNo%QaA-)pflvX~?^^QZp2tOSQR+V5U z84@7k867tD8_Dur+}*_)oJH%cG^c-l`$56P4d$qwQeh^Vd-2ZZi72O?D!e`fg)Ag) z^zR};mW^9U}JSF*y3fb1qKFY!R2Nul=VZd$wSfGVZ;P`!x%yf({IyS$ZF~7SOtvI z1Vie(o`_b**3n(yp@Pr*+J^Es3Crp4;D^E#`1dKm3&5jL0xax7r@Qi_gu`m~;@64` zMA=!&_iVbIW3A-BB7SdP)MWi)?!A&`-Yc)P0gMN@O}juQ_6NlgIooCTFxNZY$ld7r zcj^62>)4T}s+OM{^zCOv2G0!5&D?0`2`}PF-gpEyEfS(}>^cZ(x;}W}1^vWSRkcj? zQRxn|G3LzLam}8zl>~j+i@(KWn2gz+$VlPu3>olB3HZgHYEoSC-7zkn^~Fu{%q58m zr<^FWDUVv$$M*LISw7tm`~)EMce1d1bGuFAzR-&k#P7{5SCfRATL5aI`<%o5>Y|G# z{N=-vfrsC{&u;PcnPbzp0jkI&=>#F2^x5tn9IS=``yL|A@%~ri>qc68flpIwM!&#i z0nvmwU+KdYvX`&8nJ(;ufY)yEOD{T_Aq(DqiqQvQ)pXNlTHuTQ4=RG6E&|!^23spC2|bT+Om~ zmu`K6%@X^GNd~T~SBfz#jUc>w0aU*@wy?3-&zFlD2#`rK?L@iW18$imHQw1g{S?t! 
z4MRhP(l{lw!-~a+p#VBlRUN=W-1B9c{U!Tvcwc{e2m3!MvPpJ-9v-~BIxE(9_F5(E zE8d6}lwhPrCAiS~?vV0#Zaa?sg3QtJ?oquh%g;U6a)l>P#~IE4kZndiE;k4<%3 zgKke7=_AB!+4Ue22%Jp_z(2HpM6mVf@|$XYJ(UKPw+YK>tj3;O{D;lh^r)_T*qg8m zNTT-BHEHP`YB#&2k<{S9-jcAi6^o5XXJL2o`kd5r?LADMKWCyYMBj`ymZG`t-9i|4 z6pcU$gt?Mf+bD&cVB>+wb~_)BZQU$KTN6aH?x~-Xwzdo3UtMDPWX(FgYC*$M*dUYm z?lgobzSR74U}9zO?I1txd_Gl!Q?iC-o{3U3b^ve1WTyh)18;=zJn8q2D;b-=m=8g( zy?Kf|E zg@&m>?XS1yPbr$byu+{SaL-aQDC$4NZv`2VUMNUEv3(w_AqCbzrQ(>8RIzVGxcj2R znL-bIHXm=%!=97+S;oa4UaZ1zwmSidZBu$A_=0+~uaFyKNZp2Fg$6%MOMAZIh%dgf zbZ^WZ!4bW1DNykK{ZI3aj zVYpRY(`1NARiCe<@vchNNssFjz$H*=u}}bG0bU>%N`ZqNEqLy!rhK7SuSeHH!>7)B zq*jXA#2ItJl`o4jY2^XyYct+Z9&t^>QS)$J@Ak+?9GaTum*`!F^Akf;+y%n(o7Yk` z-9NPD>dCI#`I1KA>R2d?H1AX&M8Qr_*_`q_HXWT@EEbEGcCrqzv<(jYq*sO~)dpy? zEs%b7$nayQb^&R|0XmHOET)>S3L+}$R_&9)Hg&Cy<0pp#GwaNqixb|N za~L00r?@7!pCZ-7xyaC3;^+t*CBWZ&>>0k1x>JFB>hdnZAz-2iQ)h%JPGs=*15q~% z8@zkfB`{v#u@Wdl$5Ok`QCVA_HxBA1N4A?KWobSwGD{$PSc|e+Ukm$qxCfpMA|UK- z=~Yk031DE0=}`v6=te7+WrY7z|CR!uc)L`uu!(ALQnFQLi0jO<(1gclhbTlMwAk^g zVM-gBR7B?pH$?B~=47+~yN40UgKf~h30MbqfGCbvz=VPB*B7mlkgQEs zakGsEO}%#`yowrBFB$)fqV|z34Ex_M-a2#a3jjClz(I%=R@||3~jQ(+i)TGsxk{Vg)bh6Q7Xzm*E5-btv8${Pg2s02v9x0MnS;&%y@J z>qg)!E_?j^m9W{@_V8(7c}RPfmkAWRH#c(u1LCY6#XALyz=+dCdaMA<5dv+h?dem+ z>67(#zKTRmlOHGj-r4GeWJbR@a$?iO!B`j13l8=`)to5k-^Wm+TCUyIN%gl%`muhZ z{h(KEX1oUevn?AQir;I-`hu_}Vm%H5(?aQ6BSH)Gb5QM0*T^|BdFpv1yR-`wn*cI|`+|2x25b~qm?aCMq9ROt z38G*;RQ7cv`9WeAp5rymtM19S`UQutAs&A=$I3sQY_oZo{k6PjG3qP}aE0*>DS+{% z#gnGv8FEm_21SxALv0*}-z7rD!Et`?RE3_Tw2FVqe90qyOjKt{khCrh@H$dGlkbk3-(RLI*zrz*yQkrSLA=7sred z8N~wrtq_)AEIkhCAVueLQG?IkxY(%ly8BF+$eRTGR7L8Cq~&*L1>#4RcH%>bzw>v2;K#3ThZON}Zkz|-(*za?UEhoXXojSX3M$b){o>17 zXH{$Sb4zhL$6NO8KY8tGo0970ldIR0fMM|F-WXEQzE+ZSF&d(SJG2`wK<&+aIc{|2 znul&{exjU>jiQFwy0EF|zLkr|JL7IP!U)p%J$h) z5u4EkCZsYM05bB6XtJwLra#Q^^HwmSic!A{PBUSuJVU_Xw}0k z=3gVCU{W*RjZ;cbPi{NNlh2y279Vgh&OMhhMR%sUl$G{y!* zc#ooa3v%q9iBN$@+hk@<+j@mtT+W1lWl=~M&;7@3j@7|J?Kq+8Y!JD3o2x#mqI=IBv z(swnig}S-V5-sLBzR7E^)b`6~z7p1m?eIOO6jcc$`lHlnJddgM+x;O}Rzll!00w=i zMkFrFIPh7Ule=;q7fkxff8eJWHxRN3$tO_;U@kTNJQ!Trw9{L*VMD#|VxJv27wd3N z%BDrAo9DTi0-v!cj$@B)(WY0$Xl!J*Lt?nu1 z6i<}9-2LOQ<+rdg>^K@~1f)8VHtL{red*>l?VtUel3~i4>Q=8+KOk4}c2K-Ame1Tx z$b9i}$w9!NFy7@LjV+lW2IBe_W75ed1D%QD3+&-0XtpOoCC+AQEHyf(%X zrF0Bf7P)P|OoRoZ0RQF%_S&(r4TO|NhQsiII|?!a$a}{^eU4YgZkqI@cqqehe&}?f zAcJiojGss@gGx%ud9cvt_EGa+?+%M(8ILiM-JMw+baYdIfmJR{P(18-iS3;9_93Jiq zQS5n(&e^_)(=B*CE4`HmLG`wsa$39`@1OAgR29y>hWeVpG_vYX8>F~)pyOw~lTjiV zfk}pyaDP6Zfe`A`%zomBnqGVj*(x2058NlBo1+e7R*BYL zGN?PS&LXq(584t}9@9oB-u(plGD8tUTP+9h%5&eXp4Q0ManBUFgE`6zbxyuNxeKUj znVg+3%^-bIDH6gQg*1t0RonJpGU-q;M6zqCF4?7}tA(k=;cc~vIORfE3ice=n9&v6 znuW2C==9UY_QPaN6m zKzxJNc7e8&(^C@_LF^7|S1s@NEFFCQ=jEiFyRuuH*{H@babb^-PtQF3EyG>|slXMb z*eNU$WcQ?UI=!Zzt$1c09=w=9ANLQCi8Jk#Ln{Ncr>7i^d*1#-P5n3Oz;93vSBh@L z@na6{iB?vGiQurxLkDjWwVNsrR$}^idsXQ}!iPgvxfcIXRjec-Js0wgovOpWpP(LP zysGxiliVt;iN755E5Get)SKocYhCAuSO1*g$9{E>GS;`6N#2VQU5v~)N(N)H7bEDY z8pVPG+PB}!L?&$8dP1;!u3tuTJNg)N)J+`A9_oVW(3-OAMvCqg>9XVzwoB&1=@bis z-kZruZ;cAsut|v_E28>?A)zq+HwsX*moQW=*iaFxFzPzgFN%Zwi(VH|THwfPkkG6c zYWV3|Zb`srcMsPC7f;X4hpGbYqpZaLEH!PQt|c3tsKgu^H@f*eOm6$^wE8X4V25_S zgN5I8H-iHA`zyM!y{kSGuA(n_2ilXfws9IsV$ zS-LWE4ZKOK@+{|1_stuOvJj$O^l9iGLaekHjhI5`yiJ-9a;O&(X_X=e84PUlHMOYJ zzJWz~GP43HF~-uk?L*MkjxyQ+-4CjHA@CKeiPy1d7(e^mV;4xgXSw82lPm7gUl*$R z8Y<4an?t;DnEJdZc6>!3*#BL?x45OKBHS29Ws){l^L2v zA5jd?>qF$l%3=;rU|?0tYMR4Oh(Y{vPg7nf^UED^5M+hjla$uu(f~f=S)oN(LbIT( z^^C+bG$DZqR6Rj;-iCIltnLDNa+B{p!@3p0Nr?#MhoX-sRBMz}hJZrBJu8N4x{8Ff z57Js37WN&xpIVkwHRWvA+sp`>X>9iOFtJq?OT{$(LrfpoaE@4W*UT0!BGVbH7?tdP 
zS8hC?AC9jnc0z3xU4M(FYIF@xS+9j^{E9|(&dG`FC?9ysIL5BV6{Fci>tV1Hny0F@ z1t$x;gJeX`IqST<_u+vI;;{bDdSi+B(*OzWiA=@u;f*kQZM5{Bwdh$w_fcD--OF*x z*zf)KUBdX}Js+QhBG+?P>wc%t!5yk3fn#fO;;He*AHnb>Lm3j$1mx(2x5M5gZ2}@Xa_76H54o! z%t)v_t~biPpQ+pRac9YxXJjJ|%v?m?#j&-YfD{!p_ZmI4`kO}ARI3mq7wRrz1|;x& z&TsqD#(2xk_eF5k0+Zm;MJqncNys^^kqm3;6j?w_=~V3dQKPL&S%|!wI;HoqykoOl zrIs0zfr}7&JNK}A+=EsYR{x!n-f=XDoHEN(ez4MQqJ1>ziEN_kl1Z-eMOM$ZkGrL~ z0fD`!6Ly}%zv=<&ZJ{4*qs#~4P*YjDpwkR0K+

z7l(=RCuMO8-2aITdx{aMA&ZLpREMSeS2w15purz3EVkPKPU(k@+Hy z1617Xjd+*(%<{H(v167J{=SHu(1&VsR8r;H!$prcfUccPm#Yb47JW&*_xY{e4z+oRL9`TxmD>X;5@cRDLCPMh5gM{D@UqnO z#LAwnoRol}LeKF_?d6>bsU=K0sbVp2Di?0ViuLUor-Vh*o}fEOP71B13)&&wS_m1Z z&bAAkzuH*t7wl7T5UUn*Oa1uV<0KjTtK7T;GEUKCH5waTWeq$F_3f|`7;}sc2Bb*c z^&dj|ZM)9UiM2oG7!8np+ER05)mRb<5wsG>^~c5j1m`I6o%2?Z>n~h!{c@i3*4r8* zzRIgD9QA!^f(@-V2oZ`hBog37H-Kb$yFO%NOB#wREM^1RQE-%uK@JM7uQSq4fC(~ZtD_s|7oA+ z$^A(tZ)4BRp7-=PUspTA^#(SL9YYJy{i!!eORs4N@>;hA6c7G+m*ZEq_Ge@aD2Jbu zJ}z$>Vn3N&Yoo+*snX3SXgpDw82Jyu}~0{-tL-xSNH5xWO&iM-qxvmm9Gvr zQ}60Eu*e^|ybt21BT&=B8l|wm3f>b@(+jOAV%PV_$cwtR54Li1^y)}iZ}%tZ?&lGD z$n>-T!3?k-gLJe9|NO@@G@EsPd-KJSUI5t? zx+mAbeyy?3L&X9l*PP9FnY6tB%P}a=GR61+3uHdA4DwcfIM?{)c@e@4giLZij3Ic1 z;wl+t0j;{jQ`%1Khbn)m@-IZvlgG3nzkjo(AFTn(c$PC z6MW3GWH}=0o??|)md)F>RGYL9V`qBa<$JkczaN1u)<8wBk^1&IP>+O7glfa7b}5S5 zk@;n5&3LFdyqlY^Vh@>5%;$a}cm;@k1ZB~ zp28{}yq<@Qij3v2e>0Y}T4|sqQ`q{^u^M6DDRy0IR^)fJo14Nh<^=xuEWx?@UxUBtaH+}t+B7=vhvD^wZAwJVX$#4S~OTTnqWzfBz6PuZx6mxL#+ z$*qLU)*Uwb1%&n^AZmsZ#asrxsq38tpou8<6|0T6aCUdCU&lO_61Oq&o)M6{XX2aw z@GD0#kmg`)kUncg$w|d?(PgHvGW4TR=?(4zPUCq(@u%t*Jc7f8T+Yjz-l%(V5HkCr z7M07eF^uk>Z{bZH>0Z#Gi9`p{b*lWf&WLPAu#CG=-jWx&rdBQ;64jVnXF)G;+yL0} zumfmmyfQtK`k<`_x|af#^uzeEChywjhKUEBdgoZUU!Qz$GIRX224`dKk~RWhIMJvd`Sq+uMiCF>nC|t2h%-2l z+z^E&2#EAVR5&b~T4>^Wn)m;tIt`{$Jr?F1nu~tsUxyTxB1`~nB#|KwktV&pZDvkH zG5-ji%$N0}D}&+FH7~g#{zoMJav3OSBLO^H!LTBDm-gHiZZoKwz_TNh5YnE3-2M|$4m(xYBlI&M8+F~17TUEI9nsP=e0tp>=IqPy#?QHV zvux&~LxWs5^jR|9*#bw$4*kn^+D~&KJfR<+G`jvQOlAAR)XJ>`O-?wT9%vtio}PK2 zhtvMch^-JFbt)uHS3=3= zK3~`2BY*DQs5CL*2ChKF6oT4*Vk2ZU-({%m*%ch1dlzJb*JJELAP=dk-ZM2s81kJa z-BqCAEkAM7mCA zwD(yLcJ#3WQcA0aH1B>KGlSO;#|!b=gA*tjf7_eK1S5`SWZ?TFtF#W9{dg0TB@{UT zKvb3h4}*A(66jP6e!m3qn)qtJ2ubnM>3DC5*I4ZNrYV;WN!hRbiO#@M_Z*Zjjw2Oqj)nB0FRcxO^L>V>%l_)$NLVB1zuc~M0*wWYGece_G9 zM^A#0O}V*uxm8*C%W-+zghOkG^f)e=ebTybEPM3}#7m*(hW{Kf#c)7f8%fYA+)tfO zhg7I7h?$Bn)%lG>ePX@s-*x#lx##ZNv)MQRo?H+$jWL!KS?S%gzWe6gDXfJwB;$!r~#9%2H{gj=I+2jD}nb9O<=_ z#qGHfiL(3+CkN*9c+P$FBB2tqw~cp_&%8tiUt*euj8lmGcUZx{Nh?CeDMbDYpZkgT zI7aryA1lLpitS(>L2G+X}@CKOEP-a_$(D!b;9%(L7?mgWibSXh6*dfJl z01rtk0-J+%4_Qb*kIC?WO?yIA6jbFKg6GIqkS3T!*P>8j>}Zz7*(0>n~xCOOyh~-Zw)HD=?=n zsG(5VpU`BN$YA!v0ZsE-owJ7?wlurw<_rtx)CVN2GWW>I=9!-QcV>zmbKUFL5u@pK z4@;20Kr$3%e_?X4Y5io4_M+mv;(nj|7FSa{4l0Ds%bsKs{*q-4Z-#OV4be;FKq0^! 
z3gQk5=>je~zpq++Fmk+}xHSJDmoH&oWsjVp+3^?q-DM(1Sqh_3kc`+v+~cKRQt0fr zNyGofOMk2xytN!^vp1;Vih{gDbFtT%VgW^N!0;j@rJe2Hs2jsF%|SFa5F>yqeZ9gkxFFjAvC=oNJyGRr z?R{VPk4!Vsj>~M=DWWu1na)Iwi>;v_gkS+<^?Fk0R_&{X@w-D)bwX;d?`oLs0{Rky ztFxjhpE4?8-*9{w2MS^zMvKOl+OAKkSmXO{a5nS$bBC|O!^z1rV{+fGmY$Frf{1D5 z&CL^%Ot3-h5D^lLX=S91bQ&9Eb#R+@7<~Sf-H%Ge)I%bXyvpecxyn0I#+Sv8eM#Lb z^9j-Dy*qIR?4_wgd#IpXi02$_mx$QdLMN!7DKM!MA9`OQ>oY0Zs(df}>atLWQpsUB z_Q0M5MGBfVgq*R?U4R<{pQ2lk(Lx>det5NJsAHwH!9q^6Zm(nIqPfKF>d*I$zdVOL z|8r19gXkc}3WR-!9Mhu~ltc9`8ykf)wsN*%Pa@}_8s9X;W>$p5;H5$oHxSZll^nct zf*hQaR&B)h-0^#;0dquSfkbB4{Ha@0lnZV{#!SXv-YTR7LWY0T&OMYa#tdAiE^YQ9 z1yLN3*TvOj5#SPulF~ZQAODZvKk1dG2fx|qts4EFAF=-;Tn}O?d)9B^##5-*VaFdz zAthmk-Lqk>wX{bIK{G*5ccw-~`{XV&+*hIC{pl)0p92uHdVAf#F$?;TX__8Sgxnb9 zw*`%f*%8}`N(@o`v@uQjpnK}1xr4Ojn4oiQ3pzXl-n3JUR|xATBRki4p)G|_-|8+P z^}Y4XwWjJ=cf#;X$TnTOYj>mWAH)B+bj(3O2gMUM-2)+t2nbnF>QuGOnBd1=y@&SL z>6r~peyl0PPD!i~J>`M{&}O}1f+tkV3E_?G!R9ZN9Ef+2Q}TG8H-~5}y95T6$hHKm z%b$7pHfreAiFWtmBBn)X5IzL|eAg%qI&6yAvF;lXu9ViFzi~3dsqr<2pe}uxH-JdE z4Iht&sjptyV-Fq9@b0{tE+WJpdLZ|A0DgqfBZJ;e}BN(-!4a-p!C`%ay|%+rqKvPFs&AELD-u1-!T5)C5)jPi}SZafZsQs(P~y+`I1sY zM$S`R@>)}7+sWABYv}8*j;H6swoePXcSp@1dBfH|`d>Ux!Mf=x;5@Zt5&QW0GLge5 zs0@5CZr<+CIdgT>O|0YH_)D&6o)cUzq}YpiD1V=!_0Kc3;8|mkMW8v2lJ!S(H3o@& zZNX}eu9m!-)D8iPONcl$0?p6S!}I^6r}{xPUwGoV8{|G#?O&KAQMKsR_pd+ogb zFZSL%9_qf|AD@Y`lqJa$naZ9-_K=|xl2F-Wie$@L7>pT_rR)l&LZJ}Zx5(I+B-t~z znL)D8SjLB0{65|1ocnv7bD#4)=en=^T#tU=$Nld-jQPxa-mm@ndOnNx0jb(#c-t8V zBJYRgDv3OfFIfjJxH~v9-QYO!pqAU}8#kM#RHGQv#6Jmd{)O;HQ>Xz3Mav*9C|Te# zZ@2%{o1#LbY75Q>H&@S{oI^eHpeB-Y|BIGd<}16U9<_h5H=daI|3UaisxHP8GjLfy zA=%>XOJ;a?kZ$o~eTA6%dFue9(vfgMHG^B9B0J2*_=NE!{a-)kK}=Xe(E}Nksv&Sk zHce$%oTe&~!xDI5r(8w3=B1hCae}@JWs2J6RrNs_h z3<>=}gMO`ew%)@IoNG-NuM9N`Gl`Qbyzwlk+)ZZd0j53*IX4k<#)fj2UXNyD^r6KO z%t-5=mO#CzbU{hO!{;DKomwpFJF>+-}wW%mVJ>^*zO`A-K z%|y7JkJXykp1!cD=6JPO1gj{uod#L_@sA8(B+p1J58Nf#jq-~86YwrXlj>hD3Rl1I zq73kF@h%9p8QDC)D^-Cvp*Wz^?7Z}hl-M*MWHaqI2lsoJEVV4?uQ2#mSaysP?4c@b zAIb*5!sluoTanH0uJ!U@*;#Y5u}nb;^Nz#FOirP*KaZNxjeEIj?aJUIPsATc&>B<@ zFE5eIyD7W~Y)fZy71yy%O|sC8NQIHHwB5wLMsDW@zJ4s6DdUNcINlHnH8bK?!Y}yZW zv@*RbN?De_w#vpVlJ)9{$ms8jj!-TY-!`VI+&kCfP#kb`743)W7 z;{&nJj&pk--yeeP2FW^`Fb}P{+T9kgV56o;*a^t2rCD=eY0KPWslE$>D{?A_1LE%#_$sC&K-TvRJiDk zLy&E()nLKB!y;7MftmeQb3=XOWBQm-sev+B14ulr!swo?8g52_+g8kzT$%^)@qFLx zY+TA*A77t8WpiSGvrt>4#OY_H*4LQg)}iYg5MJmScBzwU0KB8v07ls~Z4x>>dw%#s zWK?Ev`eA*2d&}&~nkU7pBbOH&vfj%ZL0lMZSQt&0s*gGlP4tg0kCnUlG5F$g=d0p1 z=A*}CBIr}6dNkLxMb44+80;0)Gf1u$RkSdvEt$wA*m>*wX}R)&M>2c~Jc3uOEEVzK zN|nj6)<`;3IaQL>RfV=KA$L#QkOekr(`c`vY%h&LWMQQ@vN zYS!(%#2v+VN?r?vl&G%aWypp}5s6YY~ta@(`-cOV*v zAoFM;^e0&9$|LyA!;(bF_&U)ybL#H^^C^>R^0Dih{QIxVJuAK9?|wHSQ*@Wm{jf9$ zDFYC@T~vTW_gR05zqO{4k&85+Td_CSvRqx3{p?dbmwfv;m#1>%U2WJE#<$6GFe$uC z8YV7fG{xV?u-zDx-QduYJ==4os&CoXFgjT*JIvl*%}-x|yRrUC%*BXS2xR3cy1lRAJEO&mPuTEBb*C!yOC-z zitEX9vA{XzP#)j1)Ri5m(m_R8t`*NP?ndAac5uXSb48%!s1BIM3tDQHaYugp84q;yy`Kh#H3iF<}p=omu5(krhUFe@o@G+OmT&azuul# zSrzx6ol1EmJ*5_>&FMA&StBo#D;uoEqY_cU;IA0~@^n}<)w~xg8B(IkP@}HhcFuf6 zk*jE;1`1T_!-VZs=Q$uKJ&@vc6HAYKer320PQB=&Zn%*4r1*VwP|>b?gOM&=B1Avk zqYf`uZ`^0=E+>{QZQqgVUhPBGY}04V z;r!mS`u~j)#NDJT`P6)ZY-{gg`EF{Q((HMLpwjs0{Fku`9 z6GUk^yheUZ@+%2?RNhl$|wl2Du*nGTgCALCS`wO_VMYrI}Jr6l&gxF#B@3ui=2UNc-;#dVdYh zEW~!&K{(^`Bq(uEd=g}McpVD~mZM_OpD?8$5k~w7)`Ywgp?UNT(zsRSu9aSP#mnY4 z&+H`UA-VDs;#g6tC`xD$2ao&tIz0~F9&gEI5Jo6^zrst1 z1yn#Y21UfK!s12Pff5(eFeQVOTxw$wz!C9!>5WQ8o=dER%4agE;Hj^DE%dm*G|1t# z(Y{nhy*`TmB(l3-nM31zm?athP%>pwzq~r3fc2P}LeI%D#bcf^?pN4Oo)%7n05~cI z3)+5?B~31p5?)Js6U`DLdHl)(_2lTMZ+!u--{n-7j;Z+Yo))^Srx~h!#*HFO&&Rs; 
z;4FyyRMSPj*2&2=e^fA7+8Cwk^RVHcj%sK>Dm<02{J5)=^@6I86w`BH0lY%TfviH* z^;G=i;wyZY>}{l+WB*l?q~4Xc1Mj&LmfU#c%^v#IOpG>5jj!fTGch&8NTw?|1U{qX zIgFVxgyp;$(L``?OFeYg3emW?a`)#TgGcuSQ2Du?SD1FOW^(y~Um?qtPrYgpg?s95{M z>i5D7ss}2>o>d7+R8CY$*D`tOJ<=9p@`8jYK7!EZhcNiJ85I^SD^Wgk8M`!hJD)gT zBpwm%_S!Iivd1Iqu~w78nVyt<{v)@hynix#7lBJ&@$qn;LQSiwB9^L%I@7n#Bxl@U ze5#lsCD-vus%7>Taknk>f>@}~V^A!n2XzP7A{dWU;ayRe$k*lkj#O4u6!#Mouh>v) zxbK9s;k^XE;R*Y{aLQ|I?d}DmRS_TsZ|yR=+MdCV<{vR4#2$0eoaslw#Ro2CdaS1D z`K*3;b!Ebi>&O!1zBbH&(VS6*%BWy+)q;>KAK!e1oH8m?P3XL3v-lwfvg9Hf+bQ+K zO7-aN4vqT|?o1IuA{_kxOL8PG$|I0_#DS{qf)=j+s#IunbKp&XdEU{aS>wmcb~YKO zN>$_&PI6v+qISB0)D)Lkl{Co?5LUIq&BAPX*`iHt4m(Nl`w~ z+EjQRcZ`Z5=e?jC@rzTrG{+$Zk26TYM*46^L}qYc($LYtd1%3bDa1Z}pzMGY;@S4i z7e%D$kU-x*Iu-HB;6fRZs#^Kf!-ty+gPOK;UVfw%yK-e-AMFcYcj{4O=o!Chacr{<{h~|6N{xLly$n^e!hWDY#7dTpfFQqNmSS}zcKCMV%r zQrljAjDW?1`llF@9xQS~FS7m!?OJuD#Oms(cDc`KQ}%%VkYKj7Nd}O~R)2~ygsIw? zYy&^=B%n*OS361?U&FC2Dj8JpkcE}5o@vRbF%`QLw`Z{74(O(4wDw~fFQGM&GQ`%m zU=@IAGVgR3pH?{(UHKG;&UvHsbX2jusr2UHqebStv1}6SUj)n$JCJD{N*R=$?9O03 zU-`lGf*z{`Dnztw@wCNC>1_ z&(R0!FcE+7@td#diOvsUODGp0wX#%2dT!mn!cd^8M!5t& z7CcruRCih{Y53GW52|MM)dbe9s05DYBcPP+s|jIxX1UzM$+Fuy>+Kqj)e*;*opxFuF@%u(_2d9x$W zeo35Ijj6=!XjJ9##H3jP7Wi&L+d3_XX)8RH~Pyx+1p)w-4tEUwjQlo zT^6fGhjgf9Th1e=z*Z6qNNyS$%^1oMsq@AVCPzF8_fE}Ga&JF6`Zl&Df172*X? zi8EuUS4c_c+nNcSn-qKS9kI0YgZ1(@G*WU#TGRvci z>Zy3s!R!a;-|?&-3Dta=xICR?U=LM;YD2d8iK#SRIOQ=&vGHLFKfG9xZRtYTIK5tY z98r4iZVu{C=Gdz9iSV?nE{nZ2oc?#eXtR6;)f5mT2qGo~Wm|yq0AdMUE1bCNKUT8mR8+}N zJv>k&4vF8LpMO8U>&PDbUe68z!6UF^C<_=57)CsLc;ALy+K(Kp7qgX!@YpBBH|H5u z2A_j{ICHFx9M*m~m+1#48V&)e3Gya|X(vIcgFL>M?|+bO+fgY?-mkRH@0puTqEi3^50Ku>o7+{p2U<)s6hC4u&3Zn+)~K`x;Q&n`H+cYmjS3?yrd z9HfG~FpW~0;zZM2WECZ75K9ml6wm~972+mJg4iY>m$Dgmdp`cEvfc@~V;3TF4lKaR z!5u;tuRqPuFzPlw>4o?*J?P|5jJq=PMj#kA3 zQsy6w>C7~Z9Z0syH7NB=L08b8_+YIU_m`(Clkrz4w6k+*ph3qVNQP1OzzlE2;JNi` zc5S>q7=}N$@TuCFvt8ftl=>X({a71jh-tj#eoqk(kK})R0zSI;-9+o1%bLx1Zl04r zOj+S}8jH7vd?p3Cg7pKl?_e7J+2h6G(18-MEm7+>t>5?f~ zIiq9q+v+Zr>i1SJ2o_$!lqSn>{~mfyASS)*+;tzq!Mg)V*vfO!uRw0T>}GCc@9t_S8sSJG>#tSuOWuO z&HlAJ`1|Xbe=+nKC1*6BT)c8k9h^U$CH4?&L^;LHot}Ghs*7J79#r;boObW`=Kz0& z@S#s&8bKY<>7@;0>aIELw$xXa%sDLSA{Wl0%{shdxCIey$Ny+}W)FVS4itfE^L`mc zK9;T#5cU=fN1KyPE8&=ZI}rPfkid${7>~DoG!4f9B8MwO_o^>O%$q;y>>hp%QPwCm z=(SrflAt6^IM&^G6iX=sq5}x+(PV6=h+1~ocTh)O1rWsA)5*l$xx&~>>s;}H!8yD?SU*2N#oe=k_W zF5^O+WkEI-9qVduz@7^GF{8T66ixB|athQc@}pPQsS*)jv#?RMPKdxIc(!;UO&EQ+ z-*bwm;FUxKOg1$zNl3^$LH~?1)78JJVzd$Ef|uiEdS*}*{pN{8lnJkTwg(R z^xlVbiS38SE{K3yZ{H9kOE*I&;2?gK5^mI@C98kstd^r*S@j9%x`dJ>y%$_;ZQ9z9 z0v%mK`bl*^YZK3}G3((K0xej8a95ifd_2L$b=7(xFIT+<*JA3z-D1fxcc<+~{RD6_ zaT#Ptlw#_H;|qoYm5Y{)nQ`{#t5nf^#m`NpAhKTK)!H@#Thgqv#k|mEILSiL_Y^6f zAiQ6aW|+QASQa`X!B@f>Nnhfb1|suOin6Ai5?4gEeeNKoQxzRh>OBsEGIu$Lg0OGSPp8L-{bvL%D&uCS-a%U^NKIK!j zNOqK`HmGS>4qf9bB-gvGe4}au9O82>cQm~N3Rnm0XP#E*94cN{ynbDN@!~d^y}U!W zr`=AbYNDIfxgOE9hy{{^j*TGAhW&F#jPEgLVmkW@ok)+s@@+8FoTzdvJa`G$7 z&&KEbbRo>hah%)3B&qf&Wr)=`DM9CkX=)neGyM$~{}j|Ckeff08p{xH%^-y%^S36x zBQ?t;QUf%6tg>KA``X{>CC{Tu4V0kd;l-=utHk!jUaHwQ8r!hk>>=|!q9P&IUsrLf zvdeEZK-!z%q@eVU#_&pJyxj52Xj8wc#{w4V#G3-9^N>ugIynBF`3K?2(kK1% z(OkGCn$FwGw4T@|pX>4nj9B*zKNgU<3&UWWz&%DcWe71ZwsqbfYONN-!zt%4XrEz` z1|DmR$bg5DPEw7a*;4iA)r+FaYlA4DI=bY*z4dhYg@Ys2q2f=O+GY&I+J*U;Z-CTG z>w6jO;wpJ8i=Je`^<-J;suF`!1KPtBA{e(&d_9$*46!Ojx|m+JTu8H#TTINgvd#?> zACAcwK|2T=Q7{0_+#S16zr4rbi?KG>g<*{S3LwSOA{ULrv`$7%9O~cTKx-@rMaSy+ zo;kVa*m!~{@BKsrGhM;GkIO-=$>HB=F8}Rx6jY1^q^m&k|1!#hYT&M96W8t?i0Avz zwBi9c(b2_i`dRGTWN_t{r@dC|T86*c=obYA=z9EO4&IZOrc8fX$A&jPG=~|%eUHtF z4q8BHovzDMEg7v>x#ht+WBCZO0k(qTfmc*ku#nlqfZL%A=`YJu2i^IvRS&G6xoTWn 
zVX*iB3_W*GsP3yd{=P%s{DNI|rwLT9HRD?tWdm3sv;c)<8`DToc_fY~v=^+fmSPli zZkS&PIG5G#!UyZ3<9ND1K8uoxfNaXe9-6pMJ%edfFNcL@2=!m-Lzx!+@O(Y$^5IUR zv081Mu~Nys$6fNTP6a$T-4UE6p35{HKho0(Cz;A|076udExlLM<_!{I{g|q&5^Z_2 z_O`C6rprB@8yycLO(URQU{numU)+IEy%~e`5PFu9acD-lV{2Ou=eu_}=TwUNo0q#k zE@z1MA8%DK;E{IxTv9Xs(MoGsfddlwb_x8N%&TNfD@>$|+>93=e_?mDZuQon(nAle zPyJ)q_Mo+m%rm*+M5keR9bzy;8kwa%MM>^t0f_jt10kMF^mfa&o0ym{gX6^GtgaJho|G9x zURS(v3-M*-DH2__)-E{EXBy_&78_rwvddkqv~W&( zI71#}YbgPBb)MtDKje?%H!H2G8oak&8?pDPVi#L1ty-YuM)Z*h8g2Zt5d=B81TthW z&jXYAp6FqiAly+Vqg>Y;jVr8(E!?(3%&DGi`Owm*kezs|QtL*e%}sNryAbb5QVXUk ze(B7Vk*W#(m9I_g@wVQ5FkITuD|avVn_nz-T<*+%mppuSPeT`|U9`Ls4}*9_E;_wO z_h5I^;FKDCZ{fXhiHIJGAh{G6k=X>7DVaUOTYjaC%A&Ot2=~;1cn!-GnP^zC^H7|5a5uY&vW3V5kDx%H~c-HTAzI@W-m^++m zV^r7ww)|t&i2}B`rFzq3md(U>yExTOAK6O|W3Y8ooq!ZdGm)=AmKIE$YYhjV7?DANHcYhup% z%^r-45$2b8M(|9rz$9o`)4&p`i#Uq1rYFi}B~32%A?7G5RlSyWKbE(~d6Yhvba4tk z;b?f`9#f!Txod9`6X;Fs&?hd`3y>u~zH<(|H?5~k7`XRQ)>LnEqU%6u`U9yuo1jK- z1^uHICruydp{lBeXbJs*kES1C^cPpF>_F~BO#0~7e)d%G*PS}^#W)Az@UxJpX8}2R zSV%SqQbudWKFm$m;d-eI)6mJK%1xUkT;Gs9H8XJl)vMNcdMofoObBX*s?2BunKZ#X zaMZ~ds3A^9!fukD9Ga-|m&l@*+&p(AgKP=ipeLe!&_h;l@%i%vL4*^bJMzTr6_qe&GiN)D28&L{t~4}z<%5Q zsURk-srb{hUYZY^l=6FBS7`KEsz}kd?e`OqKU&Iq7##0Joxt2&8SSBfg}QGrN!+Q< zV$#L4$a(u^PMyEsXPsTnC&hNFOCKjZk>9c^J`^GffQ>NbS$Y{wFVAH{eV`I`mJE+t z!mS_@<|!pw8D2uJ+PA8oF=u^Z3lsQOdl6c3t(fJ<%90RO`3Ly@8+QW^MdKWbS1JsH z3KdQFKQxM6bgAD zj8L6>0ZI|L@!@Uw#u?XHlm3$y*8Q1#1jo9H^-lbl=dLk+rZ?6jngrcWBn1)N<3qG| z12zPaEH~jqmvoV^ivywpGD;ELC%J@gzRZS`jzee(7(fEPWUqM(r;bu*(P8wfG+Qu6 z`@Jb}RZkoWTON>ccXqVO|Be%=ZyHX{XpTM524mkSdQf29WiYrdxV{@?r>zRr-*@~I zm!OHx&r*GcKNNEd14u1rlv@Tb`ox=4cij}Z?>~1fF~4=R@kYBJFH>6ub_D$1aH`&pnzV$6`!EVT%<71vEhg>pK*(a)x>K*$4p$H=g`TGJ%1`J@lC=HcH)kYwlG)<)oXg=!B(6u30;Oq@L5K8!)(h^+E#_} zZXf zoMcAd3EE=aicrXo@zN1J4S}fvA{ao%zZuKs0_NZAT#yDzpz>>%?cmmqzS}`T$N@3u zGQqk^_Dl%EyFpMQM7?OFmcj(|=hM8(=-F5ht#`oY`hs6-@BG}GoKIup#=(4(N8cu$ znr;`?<<38EiC(gpF%5-oOa3vx`Crb5k*dw{k=XJtu~cIuh|IIo1Tx;bX0ZFZF-JCQ z!5)rvJ8J#V_d6$GNy-0`9qG=nf85Xh$FcKJ693W`l^bOVRFNew0ShFRAh&Lhp}vWU z1;eB{S*HUwHO?MKlFo`sGVdzrihZQx-U8t~i{JrMl$&+B*)Wb~g2274OZI}k%0?m+ zgvhg_6Sg6VkIQh;JkaD(7%I&JRh5enN40Lj^4FE1%j^GJ5A{EP{9ocPMc6})!MYs^XN?)KW@YhS#eEBAiw z)8p{h@6ApO%@4JD*uXUKl_eS7gDGEWLg0eV4QZYuJvaMUSN?V7%S%6x#fkB%rLp&~ zK4i%q+1P-9^S`41C2#QG-sWH8I{cr9fPPQsre~o!ffX>4G4>#BTIYhMj9D5fu{Oqh zWk((|9Xo9$c;2%l=b??8FhA2UL?rcBKRFg@(n@2-UKIaZS3U%Log(2SU)y(v)ynIy z?l(fE<5r)tmSb zdY^?k4A|e1$+e%JjJegr#OSm;G)z;(WA?jk*NGe%yqu?$R^D6N=Er;uA@T#A;IiHgD-ZE*6jg!}5T)k#spd#bQ~j|j`a2rfnhLW@wN5fRW~;9DMDLav zQtO)c`#$QUyxmW{F3Uq$YX8Yv@-OZS32)}57p7940O4T5u4T)?;8M=dU=4m^6CM5> z6{IstU8(uLmF_82c?TTIul(W>k;CtJ%%|`U=5T>diXgZ$DMQhxzwwR7fx=Zs$s!e` zXe$ykj;}%@Nmf}p?W)(ogH@ru7tV2vTFIYH+>Ap)Ho?*M-{Q*t8~618;v2YFftyrw z7t8^S^A02mQ&xx#QSMe)AkaUI8NM4!qy!b)9)Nm38M`(1OfvAD$f-xa{LwLsNu6kc z>@hf#%M=Fal1B-W?Jp%whqd-2C%Rfv)Xk<`Hj_rhqK>`Tx76||JM{EEsJU>cF2uk5 zR}S-kVpbxIiL5_KlP5dHRjfoz93s~z<^peT@M@%g?`h} zQDdg2d;dfmi@9Y~_|*`MxjU&=$g3B=Zd5DhiS`{ZLh7e<+0$Bd#iGl#@YxB|oO)-q z&rJN<#nqpdy)ZR04kFhmnZt1Bq9}02nM!KE*vk-b*+1L#VT^fev-|rIqwep;pp5G6 zCW)q30o{*R)la!K%p1FT1Ut=& z@DXENS)Veo>dv_;6qU3Rk$AmDpVNQO>FEfT1f5LI_y2Sff9WYPC4ZHx=Qo;?MfWsq zrt@L=&L2}>H|Mc=sSAbGs$R_aC>%VgNXq2Q|FzExTB;}9TYjbOeO!)}bDhi(z(sUqSKK#+Qi4@pi_k&T|LK$tEtOMbsVypWUUrhu><2MJSz zZ0z$@_B>6o{k%88ZS|YagA?aJ?Hc@79|Sfkj7VLScv8<26B*VH+s`;T z0&dH5ZtOs|4$|NU^l$ZJknGFmu~*LI3To-OZ;gFB7V@j-WYG6krjJg3ha@P*WL$)h zibjNpJ?$&e>72tG@wmPCS(;3@b+b)cmvQ;(?dT3&-*(e1&rn}b;7K|jSKdK1|lnw1*b z%|6b*{MiYKJ>2JhBIe+J!&7HM=AD^3u}*MY7{Ox6fG~~#w>{ZCHUtaSZO_kD9bi-`5rO(Gmh2NZ6){Q=o7jO$Oh+_cB>E 
z%8Iip7iZm;YN40AcAaC4)IPB46OhUI;n%hvXz7L0w%$`W4qr?vxQ5@}-=PH#L2M}2 zpxhk{-D!7l2$bifS`BY#1pAx0UkUJyj+#jnvVYxHZ+#4ZJkGm1$v~|EA~O7|S69F9 z2mZTn5|jXE#oWT~1(*5O$dXeNsYJTJ=|H_`$E}*;55CkhZ|2Sy@C(-q+*I7;UJ@yX zE)V>{SpjslXbGfU+|3V&Qni4%sfMP}D`d}y1z$$HWTo1Qt>CcQG=DX&uuYTGED<u%cyk^S-WzBV9Ac8cLi2;)%6m1_#%V6+=`!M835cn_nN7yWI`5g_rjt~P? zY-Nahg%mDq6)Nj{WFxZgB!=xkg4rQde}W|Fm?J@SJL;gz)D9#9w;uTB@jYt=uPZhI z7Av|ZoSu%|@(K)P?EocJzx;y?Q&2wnwKvo2i`O1FUHK#=%mWn^TuR*dOG_B`0PBii zMJn2s((~&Aa*$cqGptkGPr7lu?JPTMo&AMab!)mB@I zby@2Em{;Lp@9bPp+1eNcFSkFbqVW#5EiVpXOD1D*vh5CrxlhO9TJkE@v__P_m43F_ zfu#C?yotscsfTWv_#r>Rfm84fIN`gU)(#{~jqJ(b04Y?C_0m(LcOV%)RFL+~wp4C= zU(OCBjDL%TrrFg#x$Yz)3nC1`MXG6QcqtjPwv6z?ydUAG@3{iL98C*jh&WEIY%9j| zflDOcb|9;=un{D7Yg4WoK71#d5hjhnw=#5gAY2OMO}b!vG=_8sv&9#`ScjldQ^#Nn zmAX3+ohMilNF8zn{GFd6sf?JZRxF?br$y^g?Bp21UCUe>qsMk2nV%5ADNF*U`Ewn} zvUkCoI7t(NF?6%2a{N0Gt(i@h?WIJ_S4TOT0QUd#9mD8;9oQcVsf>MyX&7lu4uFI!Jda3iDk7Ghn{5PV&3l?w<+jmQ4@q6wa7n8OaFqa5Q1GOEpv z)q{BZe;(LB5A5Gt(LZn4KTYzVKJ1@P|DRFvKaG;ClccN0BDp8P#i3}@8RaX+GnAi+ zg&V`RYx&UB^@p{=K7R?${`I@S-SZUGp5YA{3>QtWug;j%WA*~V-x3&LXzmlp)~^Oz z(l06c=wtTkN_2&OIc+fBpXJ%jyR1Q%Wc3H)kp>26mcg{!Z{I9f2iF&7&GQXYR1*t^ zHdXoxN{oc|z01lw`Xo+=WlyQbRU=zIXMta*6h=xmcFEz0lT))TqZu)Ql@Gm1A6y=i zuJPr0NQP_nJQn9QWV~D1%BF#|1)vDZ@*m53IqwqTQOn@c=~}CZY?*Grx)I-|=|js~ z{NPY(m&WSr5V#na+C7Is>$s@8G%GUZk(RtwRTSZ?tiwkYe;-BhnVEs2XHKU-Km9iA zqIRO;#S}BsI+h_orC5VS2^8;1J8+J{3&AwQKnnJCZA}jFXLZIjg>V55D7zr$f{@Z6crRhZ7hwjpnBMrSG zYyirHkH>5}MASUXyOb+xDih*U&Lc@aZ&_H5dxx@uvxR0jNiT!nO$8%>@+56}2+vtZ zMsgON&@FvU#LcrNhAdtjR}*tk6a2CV0=IPj$8@uxrHW#A7KHT83|OnXU^+>A(Ag#| z;ph%*xqmd)#hAs7AwiC8e2E|oeu*zM%xQJWH4w{ii8ZxBIz1AtG333$8}nXgIzJ@n z^#9?mkj>$xq}wbe51@>vqN`!J*)%-g}M`@3O!5d_4z~ zDok468h^R_o`~9?Vwi&4E0OEQ2j-G~Fpl;WR_v9or>X9byiO54!@8&^TH$0Jt4{LD ze_!|I^PZcJJ}nen=FEC9)U5#rWIhBcrUNi?WDBB1GX6w4htt)%K`nOV>viopjc2gt z)GNXdO)|Ee28v+;=Y;tY8ThrB9K3jsu_e5bvI7y_j&5dX#`BsY@0evh46QYEDf(`h zg>XR)#dU2Q5`10#P%`vdk~FWny8Q7^7j&3?nJpfm)3pAW$^Qe9`%H~m`_O}MQWZlJ zWegkzt+v?)v_gW-m;k);WcK);R}U`*>@5zkDXBkIv-!p*?#UlsLh{0jz^5Zo=RuOj zs}?(uzSAHzMGu3u7d%Y8R@gSF54-LNvY%!BC-(+@7SnhXJ3XrjvgO$8ZZck+kYd{S zgZ}fchk0l@G0Q-7w;6NUfe@CwOim{JaQox~OgUdJ_it>^wH{MODo zyu7BO@;R#YV)4<2)vdY+R9!~!RuYI)Cg`Je46$FL{&NlpMPzvGM76jWUFx7ad zC;Yvrp6XCsLW;0h!)-~Lury63RVdQ3PV-0oJMtWmKFN8zzhli>0MPI;Kzz|KKuz5X z1}SNow}9wikPX_HWS;fTRw(wIUW7Y4p|kVY(1keR^w|#X>2!n6qAtW;YTzLmKdqQ- zlmFyG{M~KAKhoqAHhxgAj5u%{Gas_4^w00J^k7x6Ajxb=@AVOc;R(%j5hW zS5&KMny|S4u}&<@Uh)(}-@nC}dQcfd7flI)gPfT7SlFr(%5(=(#32JVQ?8q^%q#!9 zxbT;Yz%ngK;}K$9zS|0?h#@d@O-&(iyMNGF5z4xcy_k7_C|4kaSbvbl2Uw8K>D7PL zIsk}XCVk1sv#WpQyF01a%IOBwp*m>~FC=TmL>`jKiTjP~V`_$b9MfA&Cc~NIfkcCT z`uf|$pXX6<<*lqKu5gH4(a(^kn>rGx7Bu2p|rpJs01mAoyLixpYd|X=!hEAj5a5?W`+%v2t{A*sUR_U*^7w z_~Q{HoA}LUM3l%0AO|1MJnDj$BG9bw@Nf^##QrdJG{CQ2Sh-6VHwIfFaaezsS|t;=rQmJU6zvq=1C=C^xd%c1JzmeA8#=S(i-Z|8@kp3!KAkqi+-56RepoV(~f z_T_E!8X`?g9smQBSZ(7Z1=7C*S$`?BA&jN>r2-CHqCi$Od>Ru6TJ9ibz2>l)Z-&;N z=ft3MAVb1M%$AD#IYt{DY>(DcCG1>3m`Cmw$ zp`d_OWULk+NKX5b>|_f}76NY#$V>g#Es^MDNy~c)jtaZYT`Xa;`VUp|QDHC0AzT5! 
z{fYi7$jM)${@?hGT@fE&e*+$$8$fB~9}SYd(&r|VeArsrBr7iX=~ouwa5&0>+|FRj zATL<=S8(mq%^g9s4aKE;*O{k{KkED-^E@DO{N}>ios;uj=^-#3O!MpxWPTL0O5TAi z9|B9FwH?TYJ9Y~+kWCJ-*4f&DY*$Xw>M+y@u%trmfHdl!3_Ccz87!8xFpNU5w=D-_ z5Ppt?;5!iWQE(;zHvPT6g5?$$c_7%9cOWz1pk+)6EZP2V zA8ikSQglMZSchhdZKtq$R-ChX#Q+yaeb!wf-$$F>f6Mw63qyO=U-$Fdut(9+Bn5wC1eD}L|&#HXd4&60b;4o1W4&yD;I zKJfFYlEk~~8ZSo1MqDF=h=zR)(F?pcf42OfzQ)^pK1CJ;G%EeTei7>g4_VLKFVlyO z2c5SC(>cg159*Itj=9=#4;?VAtavecae+5+DFNagX`TE7h$Ie+a<4_U73E8s$gMkSkYVm&MAM<$bAaNF@elqX;i_*KP$Afksa?Yj<}?E zz#V6}H1fCxZ9CHVMY{dKMYHt7@Bo>L_Y<{(gPkSiKEUndqppgNV;p_oS=6U~@9VL# z`HU;FBzt#hu`he-BnlB#*L zl@|AMWcgMArzrfT^Udkh(znOs)3$SQb5>`3`yIcVldD?eA*j`z4g4YbkraOZlh%qxlo5uwlkm}()4cS`n~`3T z!D96uvN>xS>`PkC|LSd;OUdc?bCG4HB%{L5vF}{W$=AnP>~G{JL9Ru#ahdiHFIegw zyzHA3X*}`Bh}L6XZy3`&Gf;Lhkv@1~ir+V=Jy_M&=+oHh@AJd|mX@f1y%z^GI0Sr+ zFY3(qr2E6`yfDpF?D~`7$5tb|IRy zcCk(x-+g&}pgV{w?NfMEV%2igiP+>Taial0AZLhwG%zXD84%!mkzJS7^`dFYIc_N@ z{Do{Ue$jYpurYE(bUl+3#t;mnfTo{?1e|st50WEyAcsKT`#*h=i<6#N2WC9Xi}dHm zRM2MtV^Z5nivpD5$uTFw-M_qH@QvO(a{?KqdOJDM*xr~Iig%-`(Ol@I0vJd17dQvz zc29)A`XQYGCLBms2nQrnUQWr%GzGRdyerxT7`BRV zRUkiGOtAHf^;c{OI_%r*NO168UFAxduYK++4T7G;deJ9P^1$KzQKTGYksd~4!9NpE zuo(sVwMmZ2lyKzu)5V8r3fi4!P96cVyCK2Bc+3M>Noq?EmktpdImrgHQAF$2>_996 z-T-%v(vJn@$q3?_HS-z|v+_c*VB@cm3JQw!)}a(ZT<^u^9Z2k7+JS%j>h2{W3L4l; z==!l?%!S%d;!F+Nzegilj>Agl`2LQgee!y<+K1_XTgm*Uvi?0`q%)hlWlaIp$Za{6 z;&A^fv{P#UN`#dnmJkGE+*D)Qa12eFVC1P2d46zGKaIom>fPd4v3&&(-urbZ)O<>> zx0Yj#bwPP#olmtxoVE?Fe^2KdR%_qepe(tZ_R#r{Z`6DwaoDt^fkb*-4_!n~4aq)3g(MW&L-s8R5gA5Gc8U;T2$3~gc4Oa{?0d}E z_ZiC=%uL_+{rQ~l`P}Ee&wb8)-{v=qbd-QxeNf+WGV%>Z@C~;Zu^|EeAq{$U@ zTy|MjqnULzia%*<_=t&fRd%4*q*`5?z-A^qq9A}1!<4@;LEA^&9eV1l?jq%^)j1WM zT5|lHI$u3q1r^A8kC%|ksd8Ip-3Ib`?NW6|=10>HJM!B}+!jp=itmNV*erd{(v^-i z!AQJo#N>l=&5izDxh)O=Xtqp5HQy%~VGnmS2Om4)-wV*)uaf-ThDL zqPH54R7~_oQ;t@AL=fDBAE4Gc@E~AXCa(K8hh->IxNWAoQ@Igb`8~%66};x zTzY(eeZrKhY^SIQyM+lX4<<7-;b@ILLN$bBk@sn?t;d1jeeqvCH!1rAodeo(2p2;> z_sbvV_sIuWn0+#7*6H|?v+5R_HG)g7^jLEmNAbI!CEDUzW3e%+IMsYR2M>%3>#Dg; zzcV{blvPJ{k{<5X37Sp{cvh(8T3@1|qQE4iDWH>HYM|+f+8a6y!P@GDVgw1N6*4(R z&v6dh_;gg{c8-ouPdbSjR9fi*C=T+PJuW1hFxOlK4&GHXJ5+?O8<}`BjCBZ1KW#Yp zW>IW{{G_6HM_-KMk_Np@FN3G2K2UBhQM_?Jftu-V^WDzu`)OwHIOlpq?R(Uo?ha_J zjtp*!5C$=j=J$SLh{h7EWdZr0+!x-N3VNEY#Y>_L5Rz`? 
z3aP)xRRsesN&vtobd)$k@Y1X1hob*OH&Fvu^AC`^+PGB0&>u03KR0Us+n=Kjh4YZ| z>bQvZgeDxzY}V_iQD-OC5Oj}I2JfXD_Va$Nv`$EKrxhxye|ttcXH$pGP0}Ci^m}IF zx1IL?<`-~~32J;Nf%52@(#Q|c*%}+piJk0gWUhgCrmHbA`)biy`xez#Gnw#wKz|5Ec#C`ts zz16a^q_$jrHW{qvHN|YPt^$Jl&LIf`&}6jB*(vyKQly`N6NE|u2)HW5=|`8fhQQU% z^M8G`q2psQ$MrGwbTSP8ov?5uXV!p5PG-#}fB%r>9p(aVhMc-SnyAvy%ls+A%~GI% z;*8IA0KGlKE@9%EkYRP4VJ)cBH4Yhs81;C{4R+toc3w-#VwI!*ip@yMGZ=rlKidfX zSER{L^DnC%WX=F(BWe$bmZ1Xc#J|<3eVR1ua&mXgFjZ5yH+@<%or#ZH{-yiSmB-4* z{r{%ZA2-{PE21M44s7)=j6Xoj4qEaFLj*sk9W|gpEMh+1OP*ZMjv&Wm`lSQNqzce3 zo<{H?m|X3MQMj4C!$NXAZ%#qUQseLO51>XkwLq5z(#J{kx+`Seg?@DeDwN>k9B{ej zn{V>|%gu9jYV<)dJ$hqSwD z?|TJj0eup5qoawUWR+>ib)3Kpl&$p_Y=$I%<+;$uLpLj9ZR7Wgow+o!Jt;XCROR*G zEe|)^D3tm^O%A44$#&NyApw0}D=?`cb2>nNn_5=)`aadpodss5T==35H8ouv_q zJ=nDg&1GkUJBpgfxxPy4*;Ki>B`tEU_R!w5P7I(&gs@iRrsrUC@bfJoSEa%N)&lN8 zioHX0+)DvS@H}mDJYzN?(8A`kUq1{iBE=yin(7PF5n<{Nx)Gcd{<6(@t@XNO_p<8k z)d6A@Wl-*L(-Sj0o$w^AZlTv=FU*-95xuiaGRt2n=yzl}J-PY9va}Mi zS^(m_)T-V_y1Lm|(tPn9X5@n4&{9UV01!Sjv$NHeb^Wfadnxar=)F^K%=5~ojq4Dz;ejhXA|CzuQblV|>TD7g6~s@M@DzN;ENdCObk%wih_wKKx8yu|OD|1OFrBpmy^O5?0Xp|sLE5{}OKaFxQ|H^wHHpdF>DDb&tS4lx+UG^R6!e`= zT5@qZy^NLeDfsH^Ja5;)8m?+^x+%!Q?5eBV!qWwh&Lx&S?=rLPewUGnNGu#x9G8iy zzr&2b1MUJbBm+eaGvGf8TS5>z8o%|C)5eq)~7c;9?g%`Pb(8Y&) zlEwvBWsPGOG`?%AMn_Cm*{P$&4rIlE-V9?9;5zuGUj@|3-ImHHv!-wZWGA1hDS*rz z$&a?TxE!h$U(T7Ig`e)Xh?9o|CzhpTISD=ynf2sfC)2UcB91}s3U#zJX0q>et!(J4 z7xIzCx;CNv7n;$o2WA{$;+k^r+1<1F$U`5aGYJ}qMY}Y=Z{I0RkhETAKR}w;$<+z- zSA`#)Y6a;g7rxNstiJG;Ihj6Yy-QUqymYEZH7V_k{h;I8QC2I#8|fqLCmQjb+*!a+ zr z;WeEEW_s&k{BfRY!@l?V{;rD;XI+)JH2+l=!7FQn^2m~SjFgzq2IckY6&M+(H1YDe zb8PK?g}~+u8szlL@r+AZN8_pBl(t})3u^L`GqxZZFimsV&MmT~LCydnnoLQ&m(}@g z{$aScOdrFoOO%r+yR9*oS@1eeV(pMR2xv|jHXU9Ly^Ny_>rqvY5?=1U)~GJQs=*(N|!X74(S8 zls6t@w_ZLMVKo~SNoC&sEjcpwse5va(Y#=q49DHz`VR3jkQ*Rh`O2~(btKxy9dzUj zXOB0)u0KAn9b88f)>mSYL|y3*;^jNK`2=d1=npL==;CtG+HGL4SSXaqYwI?S9~rpA z-s|#xGN#`)q=-+1+?9iL1Z`yhdg1;qT=*w1QpDU~Zq`1I<9mB6lZ#7PU+0Q4b@l~+ zg~3^|^%-mw-VsaALL*;6x#DjZ4R1~AR5MI>xous(o9gm5geE6dkfh$h_Xe!>b6ELr z57$XqMX)9C>WR8FpJwn~E>@AftWz^QM5 zUUxnLmg7FX4mdog=gjUMYJB`&xq?=t*eWX}x*l8ys-B*u0Kw(c^7+I_%$Af*Yk zEnC`;1QQU?$k9>7PLkO_KZ}36dJ67y@f%K9HT=`z-5xL|E7;}gTCLf5!G;cw#W*4}N~r{NYF95EGR6dcvtADI+Ao z$n5GXCkFl_XG*POYr2(v)04-LZ@Bx;--LY!Obs{`;Sx0|v~Nb!-?zR5Selo)tAYbw z|7B^^wo7GMEuw@987fl2jwIk%Te$d}muw6WVs2rH-}!OxB5f;h2Ej>Yt6=ek6p5xs ze9_)2ra`JA94=-e!?U zk*B~>aE`~GK(L?@dsBHrrVTeef!2E-UUat5W(RzgVo$oEaDs7mbb|tjWw{XfaO8x- z#!z<5_sc+o@1@d_&ErMxZQZHd&ENi2smg!b8KM?DPMaL;k{kcSAnouLGGRbbj>IsGLw%nvYU)u z^lke2yVECz$aecdlADn6=XNB@e8g5KARXr($=ZR{$ReMRy9Qy)Tuwhgeg?!#_%c`m z3Eh1Zgf~n00a98sKiWp2)yh5-`?k*5O~HzH`wpoA=4B=p5Wgz^GzKqsD4G=i3Gq#C zluRrn+}m(ONFFsM?1R^9CCM*FQ2Psr#waJ<1LSwk|BUtjg?UB$$*oMl5mazzKF@Qj zU1?@#8!sM*T&~R>YG2h{e9GuD7q$4*f{$E8_dIs0$H(SYBS zQB?Zl{ht2nmnk^S?QcB8GUAo_Lt=>{$OraQFUwQJ;{=N5 z3c3~_r@RVViz^eCTFm&~n9bC*#d&qneTjD8;W?w+Bnhyx}Z zp`ZhkIWZz*$tor_Z{ka+l2SJWUio;M1H*JhoZk*%GbBcBs0-9ag%_25rvGVw^ z0Yn*;5r~dy4=>yVyJtyyOXKV6oU1V^Y;&G83^KqMXa1RW{rN3S-#d`Opg@1h`u@eU zU#EK_seJ54{1v3}Dum=C6}S#ot2~!H06XOjKap?bBr!f_JlP_A0n>l);D7^BqfA-&5-M*QHPgvhucx}c0oYZ0)%81 zH5muK?`v_y32-dAL_ls`WDF=ARA@-+N4#Upf5O`ZQvvxVxpU^tkO&yF!U-JhZ1Ps1 zg4pba(geZXERJnJ(UM*$r>e%EuX^tOJ66J7v`G@fpKVaYt%%#fml0@>q*fMz)Gyhx zuR|#?iOHKKK-g%gzgB+UP<;Hsv@JL~i&2qJSPVD-6xwb&6skd{^7qgzI4pSjx9#7c!!r zb)4$;v*Ex#>7BNxT>hsDUT}h+EYNHcAWR+(Bhb>{M8y0!y|&rtTFOHeE((l2KIb=} z&ThdcJO73DJeiOR7KSjwT(rkQBqWf`|1xH7zbV%YhUUl4o9S2EfyhE(Phj4G$CF-j zO*{uW+|CMYxMIQh=wc%%3t@9)PWS$d>qmg)OLz1n98Km=E%(48 zPr>I01wdHitrQR^_yM{d(g2{us-N)ba}*Qw&aZyLZ?66SeJABB0-*KfGXZpMc#2wp 
zRMj*qPyv#g6kvq8gJc=lelUO{z`sBdcH`WC3PpI^H9;n<&@8}Z7+fb!Fq(Dt^g*2^ zLaUq<9oa;<`Haq~g%HavLXLHCoae0*2g;O!b^p)~=iqzW#GxS^8XRhr5P9syP|Vm} zN|DF;h0bZU#TB8L?-;4L%a%`8E?ns>{xTbIk|(m1LImVZ-PSkJHmkNi`iTP->L*)P zih3(OX+MHoVUHdC)a4=XuP6;%Ir>0LL{=LTAoUHz&^9YWm?R ziTYTqRXa~W8ns8#H|j4$N>Tv%81d8)by#n#$DBE5m%xN5aN&4EC!Y_d+<8lNa9*~T zO)cSNz=je;?e~Vb;TyOu{8<78ZgsCl*C(|P4P)m$z>_ayy8~KD0-Payg4gc2M^C5onnV5n|G;X_`Jn_e5X~2=f1e)i>a3_AfIHwU=2TYtz^A&Jp5&f zaswn8B=@I|BxO7S@rfugcr~}8j(N<>a`2ANO5z3_#$5A+V=C%uI}ZhXYR=Q|aYE{u zk3V|=1)JVbdbW)Pz)4>4hg~M~c!|1<&CNw-VUG+0^U3U9Bb%DKjfo%Tm95v+5^UGK zOkd?}Klgs@cSOHidB%bBYBEi4e_S76Q~3R)F+nHvdL4xOD7YPmhLiMnf>f(JVLs-3 zBnl%#DjfjgMPlk^$qo)#wj#%V`;`9EWA;Z3FIWhtf(nM&YX{3A*1NC;x>CJaf*lxN zefl~s0c}+(oq5;lRGzod`mh=Tc#w7Xe_iGW=@U<6bs<%r$vBsSzxUY&WO zbT013eB*C@&HufE{^q;?>qQyuD9E--jmrQC~)zQxLCU+d^v8F_04k-KIp_+JY&* zI-csm{PUrUn~%=vS6WjfMB-gOM4NudWl)m)r|{rsQ`duo1n+`R+8dBB;j~KZGFI3h z(7Aj2cuTa<)!dfo-JAklqQt$#?*X!z_W%dL^&QA(g*C`mPcxzy7X$D_TaF&@=VzO? zR33RZ!zEwt%tC}gej#Jz2}DU-BY3Yc84@eOc_gGxU3ve2@!tD5j1TBB3TlIaBb+U4eD7F?O+5gT((4Cv z8}hAn=b}ieFEHN6rN4Ge$Si?WXU`mcq>OsAS7=V1Z284ORieg6(Ql~|```w8L<8BW zmoxYzhdd>6ZpIT?MVv0^y}s0dDnEm~TwYCzRaoNO`ikR93RWAowm2NksrZy-?6ma* zvWpJ};AF6|@xilN9LL^DGZ;+r)J3;2tVApdXzV3<=D zKvgkwf*24I1rt<%fSeJBS0_$mM-XygBLH;l%iITQwt@dMrT>}I|LoHL9Q^;>rT@80|6`v1|7P)nn$WOG zw^fz$m-xF0eC~k~gxCFNA#S}O;;l5?U*Qy;7w_x33INFfxGP#t8|>Exzz?++pXPEx zNZ~Ss1}@j;+-+=HUpLIBebas{wNJBAuMHyP2Bs2&wOFzUn+tBw0HB0-RRTsUq44Q; zzMU%suWT0dxFekL^RW8UtxDj z8xochk+pQWoFuAzy8DTt^=lou+3?Wi`uiR36DZv{j&Us$na;z>ezAslRl_^hwJ||L zK3Dq?-)h{`9KTZyDj>YQT{2e^Bau|817=!98MGIuCN^)~RehqJUHG=}MXm<30-x@? zj}!f7c+Y-r>*Y=~s;es=6kZ&X5^5I_asJkEb*G&{X2`?Vs-AnXW?47Z z=#sK&ZS+)wfPnRrkfZBx`e=hdze`?T5*{L6X^3N9u)ybqK|41K*DvmK!E~AtymEbX zgu*5>@xjPKw5eN{UNOT}e$0Ty1!@pOC09M~P9cmju<#|aq*=MuWIUJkewLheoMZP= z24Mypz-4|!qa9+#W%7+|6t#hyg}Ro0^?cpO+Urzgoe-9_p?Avd5Jqq}&oOXaocG+7 zf2a(|))WGSx@22^#bx|^8HU$zdQvptcuQiInrFhkbK_J|+2%wol=S}BXC*?)jy+dn zuG|5cEF}cQ@#Wxp)p@v=ezhNeYok*`a8{TZfE)?nrW%4?Al|_-*F}7E=-TNz-_>=V z`jI%#LB`uJo*x6W6<_dBVo~@}QXTZX-^la1Wl#R4Ce)ccyO}I|4IZ#tr`BXxV-{Fc z{8h&-pgeH^{W~GUE~>ogG^H-y`F_e1(wTYHM^WxkwdnK=!1D1w`{JI z4O_*$Dc4nJJA?0c7GR*#e@O2cHY9(eHJ_I8&LOSGD0_qvuk-?qBqGUXu3 zE^0ypgIsT$%dYXjwwYX!r{E$dL(Tf`y@ai1h0|AfG@X`6xgQE5O_Bh2?hz^wSyuik zG7GKoBGW6b@v2-~kmx5mciT3$r!w46`LGx7%2UzIh=6<+{-}db-&>-li`6Wbg;b(LDIjAY5uK+14ak4pSe0cKn@k(TC39 zhZUapPi+UtE;;)9l>O0%|MFr4_7Zg~gjFCJF`JE|a2|0uu01>bq?k%xeOpYHH!T%h zvixgO?sqNVZ*zJypnsJ%Ja~qGJ1V73NB)L9?NQ6p6fd{8k(obvBg>?&L-f?tN!^#> z$F#ME_u;bFQv?KB26ClNmUDEhM_Eyx9BLd2PmW8B@xpB3LkOgg<^DW``zgo2?Xg51 zsofje8gV#%8~rusg#Ph2XO1P!alB*HeeqiJjo#s0vyVe7u_&~-HXo=Db|8Mw@BwFS zAxzSQFG>ry5!5h4LFD^_MK&LWxubodv>L4GRif4vUdP@y<{{)|x3{nF9h5Xkp&ts` z8m_k*hST_sXx55kT{~Kit3^mghZP!ECqqJ}NlQHd^QWMVm25Vr6{Gftf>$R@hjUGO zEDB5r4v#j{yR*+6w4Tz}e<*Rru+YzG!%acdDX6V&?R_lYW)kN?N}tUEVN_0Bo&;e7=~W@>be z$(&2{TIY0bJF;vcwuz$d51Fh&lk^WKw95gAgWU%>_$vEOrP-vs%P0I~)>|*;(8vpWF3v8#kx}@d z;XOzoP$P#qG9hz(a(N=M1TP}GYgqp!FcvN5>hBLSyt%5Kc4)u?zW3TLrPG5W$*Sk# zGhfEiXB$tuzuYqnw{ja6szp$~W=E~hL7G~z*Wom{ms7s4>ZMe%jlQLQ<)mjwl$Z0& zHOca?;w{fpy5zk2@)aJ4_-r3(drs#&Iq$4h_IHKi>jo!8@QZ_+icxr#A)qfvUIdRALvMV_km|&5@zD*`Gig{&%|hLO}#_1fsFg521}0D zeYz7ui`ojm*!BO`n3@-?zLo=DJV?F**R3VYVImmsV(CUQOzQDp-(D|Keq`fTc`0=E zBs|kXNL=(3HJkA7Z>i3$a-`u0+)1=38ze=?oO$p>f@CkZqR_3cHqwbnv*7590)cEN zlF;|*W1>_2{Vu_62vJ8yhz2EvrwW?R*rOJ?&3Gr?mJIueGYPJ)Sq`qz6dc31<7GG& z+)gRLbC~Z_<`cXDF?Rx~2>!(KgAw*?$n2!ilbcnW`P>_Ib$aE=JkahO_hq0c#3v{O z(Qx+Zt#AVfv!9%~=`onM9eP8AXOg_pynaBmV4~O^1#I9?!Pj-0S%{!5!Hj&dCJeFy zGe6J~elA1&+Y+r_H%=dYK!BHBY2K2Xn{Po(V~!JqsAryp-;O 
zKx+OVE)WyYej^Go8w_QtVCca%&3($ZRK24=fsg0Yd+f9zVE zbi_Hhd(%9*A(oW$t?KQYc6EK78^`ECer0}5Fzh|l3BN&o&pHt9pcFgW+-^yWH=Pub zN&+WpTyG1Sjq^{gcgw7IPKDV2-T>uaV2YYT#A${*UurVXeu)sTZ0Dl7#Ri?fOC9u6 z30Z+il>N-HHK+(~slnF--7@kaU-Og6xFg+pr}uB^YZ?|?7;TIauN`=9-gub=)R4{p z-!OMANLIwcWdy*;uHPAj?W!z*e6#+{3jY=i`^zSvZh(D#MoejI&^}_>Ly(vi;iCZe z+MZZW`iq$K2WZ-6o7i7#9tz?A)pt?90%TS*!DWyM1GHyt*0NkgZ$eg+r|6bN*|UJD zSh#mIMVaK=@VAMFHHD0-De7UFvYD4<#;+CN#~=4;fOW(h^}?Q4b8nen^3Ljml$pO$ zF!wF+qQtNS)d)$43d1jED;&O#Ic;mir|Mhjr*ERxIRBaT>rgiw^v7Fmh%rbodeU=PU&f7 z-kj0!d$bIdbuy5nUT^v6B`Tt$Dczhn@;1u>e`ptbzPe`OtuTI#J)nMew;sXS1@S-L6z+Mt>F*r%P_ zavsH{RY~;J6(P)i4zvo~59cn1U3)&iCp8n=Al&*m9{(|3l3>=?rEW6N^=%^aqG<)m za({&+&z&&WBd>X9o$&3~as~!5*5SSG!O$Xm+Fs zhL-YbJ-!k<>i)4nlIIt|{0!q<8}Y3R2M|Z40zv^QQqL<{h~d~Dpr2Hvvp+!J&#y-1 z!K{!6QiwnVUdRx!oMv8&wDtimuRk1XrI?*LhH&Ikn^S86q9o=tn`HX66qYD--N#3k z!!9;r=du;XdCxGl)|sKX_b4(>hjZ{&P}lrVPT8+uEQPkw$kjfu{r&{G+=n?O`ICu; z_cQm)jW(tOk@{IwwG6&LRV#o)LkK2V{9~mw{X7?|mVrEb26( z;f8`5rjMXS)WF8K7>@?)Uwy5pt&5e)Txb^m+?k;*Qo{OSdIO{8~wzfk==qCL*NaFW+F5z8vz|5&ZZhw?}srX8`op(F^8h zC%U+AQT-(0AqZ)cD9QXlq+JXQFjC)ukRB}~nFfG5N@;V4;MhYQ!73f?AFhWvhsxvy+1Lcl&a$FnSjJ7QremWY{?IhH!XR9>EF_fDt^w7r8`q)Vj zcjZ4{TuS~#A+Hqgg3PFdDqUj6V7SVuEwyo;w|SCcqCp zrUFKMJR?2sj*Uz3G7EM#_&4-QzEJ}08vjgm_%ofwFEIE&_Futa&?`ScojPP-rpFQr z5X)hmpd+n6reyfXPx#Bo{&(biU>3q0cC@Vu6<))OGePG)_a&F0B_*!KBLw2&@7=oM zDSKY;i=ms^UH9!-(lxaNi%aNS8^UDv{qmZ!64Ozawa?2ILIV-9{kb3G@7p<&#Yi~{ z2A`(v*$o;@d7Bu`R7bWx&kzD=vu97A6y}XuKT?jrHH52m-=MxC2mXR zbu5mJWAd$cH99!Q^3+BgtAgCk?sfki>%l3|7IkppJMfe*bX(+5$2~W(12*&zP#dIs zrQrvt=Eg9@(60@)4Wy6L05DGf3&7}JNBKb8@p6A+OHjVT3besqp%~!?7z8B&+$QH& zQb3ScT2EYf$I&$2$@S*5a4M9GP5pZPS!LPOGykY?JD8sI_3`GTA0TVMu1Okq_g8y8 z_X-nx>PFv&$y1wCf(QXp0glpqSvWloVsg&ax6V|*bGYDI1o?L3I-{a%<=EVaKTIY2 zms{!#CH_TG=@6hYRxPS4S5N)Wm{?pT(+*pX_DkJXJPFh(-Xicyb*~i=^-36!{2H1U z^tD=IH~nET(o0H^?7uU|zcQgeGvW|Mz;&~ZhK=oDbXC_g#!M~?x;E7K(-Q)KpTNxm zIBH%4rOS0E34;$JdoQA@aQ8|RXD@xnwm49#zS?Un9e4wqgqBk5_xutXP|PFCdLllh zABn;B%Yo7|>Dnh(V{$6m8PpzctIp^>)#skQOWlUTs)q0ZdgpX{&r9WpCMWtp?>8k} zw_Y3obRyR;P|M#r_dhG~Jr89i@&Fw+tyHS8TaBpWD}HQ)E-u_;%tWI|$i*gqT4ck0 zifgK=4dKn>{OY%2jZn%E`EjX$#~f4KpE`9SC?QVD7v z52Znjp>hEw$>+7j3KaR~Flu9lBq%^-M_v<)UyN8K-aYsS^1a{s$^TX;SWlpyg*Czg zzl-1uB5NpTeMBH;xU$hnkI6AJRi}<}$+)H>!v;HB!57Mw>THYt@~i*gj4Mdu0gw=L zn}7rqg93oP9`J|S0j669(Q%akATWzbbCSSSibIxPnW-m0)&v0VLg30kjK~0xFAk^! z4IaAwyTOBWi`ZWqJbe5IF!X0<=&$Dsz603W6TqP439&%2muo0sM}xe|iS#>U&7Bq$ zURL`DDC62YihbW-ot6benH}*1q$TQzVGsFJ4SncU;I~{ww3p$u$*58!iN#$BlD_-* zp8B^ZUQOY;7oUIo%Hny=_DLf*@i#t@=OFCNI164MfI8MQ$JhMm28oC~LcT|>t*OPP z6chcyo#l|SyOMNmwnjTUjNgFn%|9=J|IfDPZ&AcR!2Fq^{k!a{EL`x?pi_#nLczvB@H5EQNMn&B*W?qm`pt}w!FN~ z<3ewGz+xb%cws8=)Sj$4KEn4=aZ)_BqgS`C`&xTJd@mrIMZR58xW*7)t)Dp)*Jgpe zF{e!waq_j&7=bH_zQCB%P0^YfXgw394|&LC_@d#`9G5cJEgq!2dV;1 z-&+kA_cG5d#cZ*Z$da6ZR_!9snyWs%Jt9rC|@GE%(icOJQp)3}ht$WFyEw0Amh z{5WGcrfPXXR*&XvepK_?ML$EK(wgZ~zt8X1&;U#3Qb4v^%N050JJ3iXkKoDsW#+<)2hSI4-C z=ne$RFXE`lEv6(a3H3j11;T!HC;jaq{>5JuJAFhZ7V@1fi~-3D z5J|2)VxqvzVAc8#r2EbRV0HmUelu~Gdiqz_19dEU7`gojaSpbwF@o5QGzJj{|LdB& zFxmn;Bo82VO?yCoQUCm!qu?YX-U^HduziIH%btH`6+V|_>#rb>SNVW1MAlbz1Gc^( z!UZFluSrVaPI2xPzQ#X2W5=&py}!}1BfftCWDxwn&gi+r7&k?Q$?Sx$n*-iFDReaC zTbU7|RWjvnY`5mmswy*2$Ej#@sY)1Vxf1)oatRop+BZ+wiK+)^f)p_V8asFgr`-$- zGZPEwJ|*(BT2qvGU7;ewMxwhr&&Hj{BBq^2b{*O$-*e@FU(O58%}uvFKVO(@P(0B& z0=TooEttz_AsL+jlu2Rx8kC9<)cXA-p&A48hf3it&ICK><+XnPxM~npWpzsMwj_fg zPT;zi_*a<=HN+ioU@K7D8!pVm0WS`nI9!{|>RtQ%T9j$^TO@yC*{=PH-z6gdAn_i? 
zs784X7EyQM+f&&KH_Hc{rDWE5B4U+1MaRL_Y(Me)b0Ak6?^0|jGv~E)l1L+&N|=Zo zAhqQchF%;D5|MML!&f%JPmLDBg>aJ(2%4Eh^$Hv^&~^Y;x_k5?Y_B%@yPkc6wz&Ge zCK;ZGkax50NCU+*PJLbNE1#8ha(g-l9(>OCD4%vo=noJ0{5rz>Mw~&(Ky$RAlR1|k zl+f4koevjRnc?N$P^8TKH8yklgMO6*ldTb}TN6)8GTRk3`ON#7Zx@)vuU>A|6a2m% zU7O$U2}HdQIQtvWX*%>;I;(rKJsjwoLxBxNNYiB~FJa{9hAE%ZwW0_|CG4$&laV?y z)(0lx$8Hq(EPefI#eTpt8cX8{pDLU-J}Bth8c*!4N&!#0smM=SxnrJ1JSOuJpVSc4 z-Eqn_r_kDT#(amd-EyxypRorv<&?)oi*r)&u+G!v-GYhAx^*+Tt>?&w+xs|{aeYj5 zyLbDrv=+u}TPE2waR16`RnXOx3A0v_^59egGtRu>&?Ug{O5YsplvlyBL;0caVj=Tx zoQ`CUaM~L*l?`9U!}q>GgHrKRe#87~JC=WBbk>$Z{Je)F?1nCns6i<`*K-mR~A@ ztpwHu8;mw=oVXK?l!<1e3iQ}?%@H^k_q0!oXWVET44V-AjPzyI z8Gi$2T4yG`-UmUgNGyb=VyCciE1Kc}{%i%WOsuQZ>72=@N2G=~|XSc&UMOr#vMMtZz z+4+jC6{fLPF?|^xiGMi)__rrv8&&`TTit{toR0#8mLow3x*R{!~8!!jdMA=5`V7dAqFq@@UnKg)da=3*sv;*UZdp%tiCk_t>@0)oyP2 z<{u{QEnWA-in@yR^8GwKuiveoB@8HMb}+*IajIvmie&KeMK$B9hjNJ|oVE zRq(>44qE|#5F6iUd#gm!JIjLcMZU>ms4GGD>I^GTKR(}mPCtP%D{z;CG0owbaHij+ z7*6gbk>_gAn1b(4R7sCc$F`%l^~{#-!8|G!p?{RHiZK{|Qq+VBsYdax$Pv;z{(|)W zy^5cg@MAd0hD9FyJ|W8jh;te9N6qHa>2_gNrr#AMh;H00_SYKcde3_EjIm1iYs+JM z_d)xK|H#9+fuAf-S;K1)(zt3gdBuVj-vn|o)099OlwpA&csbylKOPcB)+6Y5Gb_WV zTXM%AD9G2jax|{yM_s@7+BchW_t*5+Y#q#W(vWzH5aA7^Bb|ma4ku3oS(6STAV1)y zZeUAI0J0TA%Z1p))$B4FL+_=-ndH_ z;$RQI%1|oMwx5EdG&NV$T$_$e^k zzVFFhM+R>p_v0lBxqOeNq(!^@-D6kU3-d8GQEcW{;sofZWjZIFY@_vkzoNQ#OE@nY zje1aR^l` zA!y`lcQoFWP|#&gfJ9lkiXVPGbJYeX?f!~j;EH|OOqRygl^xw$^>o4Hni?DOd>s`v zmN{RpquM?k3w+AG+oR6P28lXl)*>HTb(jU`${ zAf>cCe5%WyPG~RK?MSw$W;{{XdltLLhU?vGI7Dp7#CRcF*vp$Lr~;y)wdvlCYr3)T z1$kMUK>n8>tS@yoaXL?I%UC*Ed!}g`0Iisg=i3()nmCR6m76}shGu;6e06_sd_78) zKGN|Umz>4WO+$4ojEjp zeYweosnXy>2_gz18P8gzMLTgLPUz#w z+vVpsVMSGhc-1f1jlx$29v`YJ;i(Avzl4;dhEimq#uXch~Ch|zn9wWSsMCbG)*4-m@%hxu1+;s-qH z2gr0CvHMAGVRq|)Q+GF>9NoQ*+Rr8ecr=leUL%#iYJOaS1+#IiV@p)wyeKkP^H_j} z(ii`Y7k!Q{-XD@t{{Q zvgA-0iAPn47Le-wC(Q=I5Vvzt=+fx<@!9 zGWS$Jq%(7D7Ulyr37cmi!SjU>Yw`&zA2RFAY&r4B2}QUTR|y;C?5jawne0fPgSU?& z3mXu$ew;+=L65Do%LC@d<7I^t9&C>{Gb&EeRaM7<=EJGnTssP39En!$UC_ zHH?f9ZzCS4Jg^G4L1`!KBmt^1IQ5YdVSW|CRO8@)-U2PN$PU)PB)@#neRqbxV&Ww2 z%Rz->_YsVcl1SaK_A52-X|%jT3ZFaK7xbg{hIiGnzYIUQzo9DqVH&>+{eU%ZnF%x# zM`;5R^Oh>-z3wY48X9-Iika$Xok0|D#LdAGosy?H73%~)sMpSTdNV&6{w(_Zt^GkC z2GI%>N`^b&@@D7?6g;S3ks&L9ju0QUu6> zM-D!BZq_jmaJ%%8**xrWgBrDIwI?s_Xs%atFU^XDAt-pF)Unk5aMSdhY!eMy+J|!j z->lt$5}irW>H_?A29S4)1MaPsD_zx(+mcP_LZ0R7i}j?WnbEiY#aaKi)@M0OU^y}( z(9Uf&7lt5C!!M^20)T99bYv#!L-vE!bM0?TWfJW*Zt8W3^||n$Q_3@-W}9N7oLv3~ z7Cf!c{<%TOuA*9#@y5GniI4Lgq#_n-ke>DzuScK93pPXYL4J4ON~9MkCztUq%*D~r zA0S&_HN)Z3*LFwe(mt-LJ7=p3`FNm!JqWN996(Gx2MC9rfUE)|CJ32(kLyPzuGe{c}G!_YH`hPzN%W5_YW0F1pW-L#4}doMzVL;gZ_@#%E09p+nf zr9vqbG+IDRTpO=Iioc2+H%Gs~eq0q_t>YP9VmVw)XLnY&n_#vKrmyY{w-()0w|gYI zkxDuJj~3zYAO&%v63m{D+8K}-x%PrPw7|zGL6%bQVZlo9S@?>_^JA0L;an$;BnqL>7-`J>wtduwD3>DZp-}pc5eR({TZQK8dlE|d2 zSw@y*mk5O+*^*Ep>r@B{MYfC?k$nrHY@?*fo;^#(z9iWRF=iC9&RE7UOTVky>XrZ$ijfBqQq)uA+FUy~EUUU}E;;RD#k-^q{QSkYRyX6n zYB`^?jKz&q^oKJ!3iLDE{A;-1ul-XX)xbb>?ttuEfHgrWN0j?$906Jo*FP|3)4yXZ z{sLb7?c4m91g!*0+mSosjX+D>GNc)RzdM&u3u*KS@+_dLC<<30K3~L~>cKdJ8ORbH z$cnX4F{09vmoBF{T&;!DT+voq`=ciY_lAq{^}UYQWrPN{rNF z-eup(6K~UTFj|di$YuWn!7NKD;+#_#`l9&eq2t9#Nia zVV2-@Jn`j8F4vN4hIxywll~5^77rdBtwKjscF7mJ>&Eh&oX{(8zr%iBYR1W-?<_+{ zpvB>K%S=Hv=-{&dq9>xT4ix)QwIy6E|Wb!>q~-R2_x z7|$cbO7*3Qoz$vlKu4xdJm#SsV!uDTn~@3oVj_02Q7q;nZ>S|Zi#*3LPp;vb7;;w z18{wG_e*kwe))*ZEh)z;Ai0e9dU@!As_;y(zM!s@@*-}oz$`g74QSQyfYbSerc1rz zu!6Sd9vkfg`fM(u1_N5=rV5-4O`vVE0Rq-2m z6wAGEaSm~5?UcE*-_c23@v(t0LA&75*Gm`f`np{m{T8Rf<5SxW(8ivUHNJz+=Qd-> zB0t0qr2FaPA216dNU{Lt19r(AP`{up5A*{qYM_L7+ay#toE5hsLU^9A!hOb(dwp5; 
z{i-fAGh-~N$Zbyb!_@%kktb=+$H(wpSr4kX(8tFzP8brrah!un-q zU9cFOzrF9(E7$m9=1Y05Z{H0iyHZOeZ%P|`y>~CQ@KJ8M`TxNIVz+DoS{TFdf`M&y z`VZkoxGa6Efi)&g(dxmHoYc9|rM>5-WE{&etK$fSH^02OFv#cXZ<@KEf5^}T_aJ{G z43Po|g4i~bXiedSi$k&)KQ;&Oy+AdlqC@(b-J;it{UEF8*J2>J_YC$C4B$x2Jf|NY zYLo|5cX_74WZ4^v)BYwn>J=sL+e=g)zaoW6s|N!6q&j={zibE=Dx;Xu+qdnZqnmy4 zu!_zoaIoe{|I1S%L-H#hdyFqki%+DcNrpW4jeA&~Sh?)u)X~q1EH7Qj^(6XTt0&76 zQLSho57mJL&j}i}ZX@dxOr;cu##1F8r0kHKBCpgMq5HHTX8)?W|M_zON`;=$`yrJk zEo0xqA$6fjm1b{oE zz@d$&T>4+%QMHjj)D&4D>tWug)y@HArDOLL#&h*EzMwsU-dQ;^JG_lyqYJxu0oo>j z=2~-v9if(}nd2Gx*haJuzrBgq;H9qotc|k@rmu5OsPani0b2h0 z|2Oj6}9EI;6KBU@LMQR%SUW z^?Cu+#CdwU$l>)NV8tE0Z^&~evPffp*YnfW48y-%Q@`{;{$%b~cjF;C-~{fcGm~dK zA%_|mUfNB%K@WKvlS;+5-4&S?4JEH#B_8ETnH1P!R0V;+R{#8UGPWUa%TM&-`j%^P zE}ji~PNQ(MLX<=+o#)Nj(kjF5iRV}MT-IpY_c(On`O`3AA%21m$(Wo#?CVTL0fEp; zm&mYkC2c#b0c3f`Gvc|JBHs(K!k(6_xXeRrVJKE!=E)h5!U+Z!#wN_|Qa}@_2~#9UFniCJaIMoF<6R*g5CiO`XA|hgXn=( zzsnR7B8Lins)TUfO@~F{K7N{VY|>4sV+^ynlh4t0T7h}uj~K{*_)3MCDdzM~LQJ%$ zn>I!)?#GGH1fTU|gtbSNc>8s_vwP%+0#SDog|;*g@%b_k@NOG45ftNRfdJFCBg zsw7s*0>F{*faAye)g^;>VPW`>3&Z+J#ZttFT^2puNhVoZf1!_PdeZ<7lmqMleIS*D zP$aAt%ZF-8lD+Q$1-8D~)@t}f_Ak+1Pn}~l_Md&6WEt|lNdUS$NpU2<8tY`GC?Gx^ zy=Z@kT;W^|lqO%#e?@+ADp84p#) z)2>$E{*QNAdK8%J{%?5ka4TX2{c;)sOmFK2P^RkLl64K6ctEDf1v#PL#yZ;3g&X0c5t` z!R5yT+U@u&ff=kw%fUFq!XcHN!6oVp#Iw3Q3Ig2@NcO`U#wgw+HeK2iQ;O{75p4T` zE-P@dY$Sb5w1n-;Rl(IM6kU)q%CmBWzX&3$z%QU==pn)L-R7N0P4=y zDCi#z2KqqEM__Q8spt2=GVJzhs7)u;l%B+l}a z9^`2kzdPtXv>6OTmuW9U)J;7$nzuPSZl*ZwX5{!54)taoPkM!Es)f&(3$sRCjU{y;FRI6FvrO3J<_x!~{A^2Azw!K9R~d>k(U|s+rw(D1eh8|BrYo9G`7t+T9>0aT?OA<;$%yH*mnW64js=R zy(GI+Ed6!NiKVU5c3#t+1|yDx&c@aQ!ig`h4QefwO8+j#*c+amX477F|UUJi3AhH8hdHus-%36q8GV061t4Q0zW)7(m%~i;-WD-r!&U z9OTnZ0*$c`sIp1hVIFYfes`KVYAKx?~*|3abLQm?Q`(5IY~zg>I+SoI$lN;cgVN ztQn3XH59vN6CN%X4)ZIt;O(`##!B=lB%Ga^ofp^_dIWF z>uOURqW3@McJ{lu=QSwFun8zk+e_^0D2;^>Od(v0(7jNnz@@$eRMDaN(+D|oJh9Rf zf$4O+sd|&oL`z3OG$4*$475v`F=>(yhgGBn7bAq=a{ZTES_BNey^jkm(&0DIZ<9gw zsHPJwugw8aO!sF_03#IEuLR|CNt@;;$|buNxLQ8}y9YS=9q3^rDzu8`Ix2{k<$xc; zjR{M$%+w>EZE4uUjr*(??N!MPuWk7;9Aw!%S5I{f6voG;H_1%vE2a*Vrh4QA^8|{y zQIUFvyfI@0iNvs}>+UPf#U6ZF+9O+;bFjac4{)%&6w$*Rj{)KBKO~Li^`nU=4p_4{ zO@LBT6mTdmCsZ2&=U2;F60#zwFyWM6;`$EC@7Jvdeup#hyZhrab9M~6k;R%bl@d|uA+60;iH_|hlK(6&$MZ`DhRZy)}e^tHm?%V4&1 zI7LPs&FqYxi|r%d!aRg-Ufe`d0Ll`%fSOBl>_yH~6Eb({5<8!et;$pu;AnTw1^Up* z|3$^R{1#>d@H40;8*cz4-i>?&fSdaCzk?cK0D<(UZ-*|CQS*Rr#5M+`?4z<1zJuTZ zFAEUX?oJ1EsCSshh|rI%biUFV6WS3C2z>|nL!diZ8K{_A2bbcJSrODzuqM>*@dr@a zI^a_PO5eu;4+Z)K0t2e_&w_t3X6Wd}F*>qK6B9-Eg%Em?dyosy7QTbFfe;0d$S!(T z3UI$32O*D8m!Y_HIulSzd}`o3Xf5rpYU5U?U^_qjw0}?cher9o-|nBhm{q@nVxb_! 
z!3F4n2Ag6)9u-_!Z?hyQv=|sLYcVAd!zb06ofExT9{U}nSelS~A@3IcOR~MCN~^=Y zBgOmQU+T?W#e8a+IBPO)g1ckJeXK#*RkC|^=`Fp;jdW@e?D8|a{ReHD8jRWh3eGQL z3w8K5f`PCWu0*r6cz2i_9V9Y3Ege%>_I7UN1w*#>yS~1Ub_qm*;NQuz@=l*5riCGd z$PQ*iRNNLtmgoH74HcfWv7|_R7<0acp(=x(p%{oo*p#_hWumXcge*CVIURvB^ht<7 zR#>3!57;l}!Ly9om*YYOv^HZzdDV0mDWCWEGp;6tV-Kpc5+a)G{DL(F{5^(r`YYd( zY$hh<_93ID5&}&=9(M9ql#dMg@BcUgueZ@r>!o zSLA4rA2jXIF$-9OFPmf5z!QWRh=d(?CRLLIC}!lFfyFFN{Zy-5#C)43H|DHoWq0$$ z_+CR2r+i8xNjcW!lan0_ zO`7Wjm4|Q$VW=hD?;&+BZb=q=Zt%t->uVGG)v9B)Tel>{pGDmgOEMJe-}h^dd~Cs@ z*|6f9W+q3DZ(5K8)El%^O$o$@Mzuy-9#Can?_>sDYp^3sGB2NL1Eo4wMh127yB0ad zbQoUpN&>XYPja9YAovi&jR!qB{EvTIu8M=+v7V1ul28lOqomnoo@M$XZ#6Dc<^484RzDNuw zTw5sPl{Ga{Php#<;_-!MrhpGgr_Zp1_C75LX=0p3GE>EVL;!ma5%sc&fuy9eMgp?= zEKB=*m?q?9ufXnGh}Y^?IKLZX^Gu4E9*50`$Bb*mqWXhyXr8r;HFt&^+I0dh>F(<4 zcDS95L-=n}gQk1m|Mq-#^htTNnAL_r{57bH{P<)WV-xIXrMj_azD=TFNhK%pb%W`F zGADk&$J($Yt)94G9;6d&^4u7nRO&;3&B{mg0m=bnq{3phVEsPTwP?AEB&Vq9JVekl zG0D$)L-RRce{voXe4cDXz_wwJ!1c;CGjKt2^H<--i>P;7oaos6(ZP}70| z{6BWOqJL>;<6(jvnzWP|qcVxVRG93LY

pq&Jg!(lj9ahf zY%BI)(Qyhaw2`kUKE#~Pnn8~phpmD7k{=2R z<9UZcucP1okb@Keo!Il>AG{+lDU%lKvtrXBN=l(U*=i%`cW5Z^-MSXt>3~;SBupLK zQSW5r|KyHf%6h5Ao=dt+tGF+paK#?3Oe96p_mVNV-u|UV&c=gB>T`BpRbf9QCT;-4 z6(K*labCtxnu=6&0v2e!OBW%R;zzPJOAk))=g$T+ioGm<~-jbt>0uqs9}IZdBP zkP{4XZwUtp-OUmD_8;ctSIGQtEr;LE0soICxBe^h`cta`_R+G`l_?er_E2(O07ct> zOn;Or|JqH&glTg8CUu08?8%qz zo3LOz0azQ{!C~nB;>F|M8_ZLwI}z^BCWR22j|3F-IT>~YDFTR5gfQ(1Lg6#qkh1q9 z2EAC#bK3f2x(B?vaayHqPVZpVsE?kU_SuV}WS}5%iFBWqp>9br^-s{Jy+OLsV8%qT znnc;P5GucuVqdNcDON{cv>?^cQI+|IY}dYXy>gMbrNBH(tfLef$%KmQ#0)xcF`>f`6|D@kF7Y9MMi?G45qm5>ZSK7w(O zkL9+$LClUSY(54)=5%ba-^D{Of9iT0yJ?0Y|`3I zX+92R_iQlH{nHuc4{PhL4{~Rix+!Is?xM@PtKPB!?HwzBdxihqBp_VKW@G4`Dq1F; z57|;rIGz8MszWTU=)6tT6dSDEc#=w(P4WFI@@>QOF48ZAVaJWaN&jTMz5I~KY)v$8 zj644B<#JFcp50vlzEYPd4wYEkDg_YQCgb-`e9+l zl-792rDox0v8?{Sqvj|*0<92E)&msDjzMSrqEh(>_G=9qpdH=Et-pzTPIa*yxaNsrb3m~&0j=6&+ zp96ZW+%_pC`jj_?W>ayrIvf3Z5`;!ONqTaI)CMjI31wq~@{m!6uePkZn7%M3# zt?TnEo`1RL3tJcyF?ldp-H4<|{zg=p-40gvE46j7H=(mb6AsITCv6JIE)kXG3obP;-9zJBP(T zmq%F#^?XFVf%H#?bZrk;Kl8Gk1||hqm-bA!U8kbW96|!c_#_G%cO( zE7yqsbtLxBgHi%=e2kPrgmgk7@GFE{H3V-f!W{_v*7H1T81~}F^cQK*R6q_|@k~-* z&0jLyGRb&;{Wn=UvzjO2)&$QcZ1}`lySxyLmZ9J89=pD`^O<@?E z?mNm!gLQ>H4_Voc%suuE1~+3MWX4X+LAbFG&NE7rYh?4F$39u*60`^Lf|toh6>ye` zm}-}Xwi3g~5N##_j{bh;RV4L@DCq-$>qV1qBl!I{%nws_2zKRS>lb#CEr!AmpKT0q zGhxp8UPd$H_2{foQ6bRY-nh^AflvvAttbPz3VXM*5-b*3i9!l~#+Z zpM;`SPPXI=GwysO_Yng-V1o4HZWu20fIw25ja}_2#e7Y>2A!|b`yYN2aN*+gY3deB zGGOLsI$~!mSXC)6Gx)|}R#)a@+YF#+kUS9#wSdlKK=wdqa)kDz_n6gIW1 zNsfQXcu8KJg_?t>bFEXA!^dk4R$?cuwAw+4SHBYy0!QNtkuXUbC5PeoWRSN!A>Vwh zr6(#MWJlb_+!$H&-hSlw@#}>|Dr1*wm+W!ivA1CO{G~gf;07HZ98AdH`D(v%hF%-V zI;F5#(fPU5%we$U1CFla!6$L)ZUIU7Y0KzqPtTk=X<@-E9HPb~w8=mLm`6;rw6Yl* zn$8IZlj}`&h)0?Uo$f`2hahG(lrq@swR(QXPD2J&(}fd^OaCCG06^;f^V$3^VzqA* z9iqkLRy!H4vL%`10B;4w2zBv%gsO}P_iTU>0^3%Kzdi7wd`nZ*ctR>lE)bn{?26Sk z2Wy%1mMqcE8du(xW6kix_7>%9#4KDCN5#?I9dY7shFCW~aXOIi9g%IQed zT%2ToGaf5`d{!<^Gn_!LN0T$OhZ+p?4Emf3taSrDJ}6xD687m@zN@DT_Z4|W7|A<_ zZs#7_y43kh^^Diu)P#bS^}hj#B2!l9`$1;gwm#AwiU7L4RmTQC6vFo-j zrN{4B#eaBo04xn0y0h;Ql(!1Yai*Mydr{=x&9XwsmYUodVA9mBeU$seY{K{#wOJ;< zi?_AAWVIj#u!@9b4$2usucT+bzh#SCv48dv@c}QZK~XDup+&p}lm{++^5!MghA1 zH1qispIxyRr#(|Qr{#k_pa^kp6*7LT^xk*Vd!RX1=dQCmIsnig(AR)GP7#E=4>lOz z%emdiNkoQwoCh*$!zW7zjUDb>ZTF{jTzAgXhJmbQ*@~(`svbS z5%~Dx8$5m_Q>mKAt!;F0HaGWP=8Y$X;FO2LLcgPl*8M7cn&<7?6p?6gxz%MZbFFT% zY!D+LS$q%}ye{SrD|R9mkM}of8uRX(fgG@XmmS3>oLFF&Ty8X&n>vj!c=$y^m{Bwu zG(pS2%shcrf;(X4giyU!_S=&e{F z&D_hRV$Vpp<2QXww=G*_^z_tinR_8-U92FD{ZXbIV;U}uSeJje0ROU_*qLXUn>-&u zg{YbU2H@HB02W!ysSsimN&0M^Jn}qif0F2G>?thJ+59BN6+#v@iK&EC`KOUwiBPJT zuhBE0JMXBIf#+kSFFz_(@?B_T7vmbxAR|VT&wptavg|;dG$uM`Lq^m!@G$TtfM%~h zJd^Zk+WKT$^?D)O(=TSNpu4_x>jf^1yIf>tqIWY!5OLH6#z|!kB4vr%RUZ+lR__U5 z?h!luaN9AiRh0diJqxcGW7R*}aYq-DMD+oTfk`p(vHtTMc1s+lr7s-Ca^D68rpglR z<8WDui|aPyE0y(5;c9aH2UeROWJ;C3G0g=8SkL_9bLx*1JO8(cIfK)Xt`MNly(sAs z*#TdCX^=EGW>QrjAW{+2`|ROojMNv$uMvo6rS-TEGQ@FBdGNmEogr zJ|*Ct+c8}5AVP6xt9%&jK!a(hjmvEPh;od!o2Tc2t|Ei$9~xE!4OBlesG?JyRN)ba z%ErS*gubru)>p51$}jnvLgGOxIzuutD{@s# z4cvj$$F)7xTI^Rt&}wX2^Gt9OM4{b8DOH7)-L<a6_Id_TVFa&%j-HMOQ@+8b7hX~KjE z9Ux0eM8PHVCS7+^Dke%Y?g`G;!nfa>lYU z1J73B-!SnkzV6cVxr}8GY=uW}J%u9^$&I+Z-B{L!v6;K-u1ET^E!#7(-H0WO_Ak$O zQjFFq?&Z+*192zF*|XBT@RNw8?$)#Wjy-+EpZ9o0t1wLhD4O^AE3U{4pbWG6E!Yt2!~V&5bR1x> z!Ph>_sUB=SWpuZd@7i;%!3%oY$2eyMvL#int3&8QffONRKcxUU6TSiHP7K@ylLOY4 z0XnA6YCfPa!QV$$(QSg_<}qdSG;kf7zTXo{p5g%kdVtyVPSmF6BB06E0C0To+cN(8 zx75=B=of=vrdpGO}pC??~ZeCltag~4WiD1UFeko7k zRq2qKdq9^2pzl0cJYjh-bvNzN)JlGhji%Mh4`3I4^1P9b#e^AyftVE(1oB8C_x99PRGJqW~%x&32=3uZ(jvY#Df5< 
zh5yBj6p$Q*I#l~ukRRsfNbJqfde^kAnxMq6328+|L%;wV$U~los!a6A!XG?tF;0q6 zQ*^k?ASoiC?!my68{EXvsU>O#1cT03Fy;ujNF|XY0nR4%gQ|<^sYALm&W|!vcVo z9@J5uW$Dy{AB^jWY4)d-Bs=*_wGB0FVBK`p51w0kqSNSf3PU0=33XaL;Qb+o0ov(b zr%}KBV{U)^V`d(rB?$mDLhi#o2}}N#_=X<5vFdz}QQyI7KH-}R_Iws6#boJ&il_&2 z6owg8#e4@KGiXE7C!>O1!@0@C;@e0z>eVJUd+sqMqoI4~k2yP*zTQPPj?!7N@dX#* zX^Dw@d4hrgSz)07iXg!G{SE?F&QHt9?-Q^r{-1r!zKxS#K~t3BcgBvi&>6{TZgN#8 zMGk82PPj>yoQxMaA{E7ts(Vz;pxTuC>`qH(&B$LIKS(apz>oNeu&JCV7awb1> z+qis{6>smhb<}0L@iLi@PwFLG?5t8|k}qQ`;2h7R%CI!>E}(AvWBE%rHbDp*eV^9o z5i}DZ`cy$tJ!1ZQpZQ}}^eNud1LSWrHwZC66fWQahdd_-S8ARZ1@13|N#9t8w7a_P zkKJ?YtKHPF)E-dK7fHWQY<=UC_%-U`A=0f7{!#|}MV@To^OF*hRYCh!m(09l1>m@^9--yj+e?2RDqDi4o z_=>Sc@PcdbZy;zo0RhfvtG?H_c}yoJCOU>p)CZa3PYgR{JtY)?#FCr#JUpFLuX&pY zp>jr$poHmC%{cjX?qTujn@Ps6zWG_6raTl5OK}(%o;~}x{IMPj`i8f!o-fX5rli3$v%L~WQKR~dLU95>m7CD$yp?d}1NSe7$89fy z#CD7nOEZF5jXg8o<3a>Cb${t)u*(nqkZmTjQ?-~mM-mwuiXdyVe0rGQ;EN9Yf55j#`i zJru`DTb%;f03&oirZds72m{Hoxb+>o0|+ZTiy{5m;s7W-%d6+%DYuV>T6Wcqp0&v(LPT()x3 zz|l}m+;UFlbq+(U$9#@vy#5mh1-?fot@4aCu3ky`oMG%2G~RfG)gsR43Wv1s=t{L8 z@~DsRAjV;7KR!sMx1qnX@P^>G1esz18)_c%ov>|U!c&L4UkcOhknArkkYD`wujhpq0u|G2^?}ly|woW)BJnhlU#R-QLoUb(2VAr?v|Hg zzfjLoOyO*y=fkmho-p=MypchhbvQ~cG)clNVlr&zNOE|yAh`GeZ#-|WlX(yGmF$f1 zjm>Xk>$R2ZRGpM>XFGFiS7B^a#*g30bMuZ;JS!X6X* z)xL1RWS`wAm?0`Yx4wPZV&rUbsoM_1ImTk5q@stu&g&dKLm(Hj0l-F5J(J8X_f<-SbLw=5}f`=(DYPX83kQw_;m z1^yAFH?*26fHAyul-mEz_h>IE3cVtpG-OhyxaX$c!eGF~d0MTZI-6xm#vEk)nk0V3 zx7cZM8FLE71zQ}>va#JdPkTgX71`^ycBGbZZ~Cqb3L&xwa~kgbtfo?Pp6(nB{=M8_!4)pTC1XK&yyCFhA-c0FCD&O9S**gg8*+Lk3v7 zBao$!&Jz#-D^sA4wFLxW2|9FM%b8LtjSP@l8D|0S^(wMd4doKlM{T9RIKFNpo5AE8 zun^3~HXcR=0~CumGkU{Z(DQBR26zRopEd=UXz2f*k#Hm@{VF z(PR5$t|SQFa-He`e*vBz_r1S7UkNS>hkr&JwxVyPin*oU&cfy$#150zI`PD4z4$$+ zZ}roXyMho0=Ojjh9O}rfFD&*NRaV{FV$R59>BH^;uAnN_;Z?*3%YZk(VcOxR0Fsb1 zxdJcGNagBp04R3jMV;|JEIGqYk+_%THM-8B8d`oyGP4DRlDNo|5&M5MFlvx z&uH@cZ{Q>%=)!P)A~^EaaD)D>^jO~s_cl{)Po~TcjMANtU+)@r@u@P#qFUY-w#)R$ z^-9fn?cOzg_1RNw(%rMdP`+ON!KH(#o-&f zQ0h?~Am9MVsgdv6AU%MG$bh6iM*jPkfai2-4moK;I!mZo9>^gF;w@VthnGSoX3dQ? 
zua%Np0-mSxIh>aQh3$R`4FTWSm5=xw5QERuwPn}jxEvdh9PO%nKi##aq|oy% zuJd|scyuP;ZU99DZZguSi0Ex(BuAC@%`c|MXZFL4YxcRtxW24TI;UZUyQMlSbFV{s z>)P~0wu45<>pj6xPGvz+ds;2heI-zzXc;M2q|}TII?UN>=#*zwFJpQhEhP|FtJ*Q| zC%g%jZinm8i=B>qxFO`~@*;aI5Y{b^dB1(;?yzQ=`4x}O5ktTxG|@6f_EhK1>I+{l z8^(2SoShR z5=^k1{^%K%mHd1v2xT3Z#c%m;M2UZ4{rut2)?%f52VY$h*xAec;)C|L&P`3r_;}Zl z+=P(weBSp{C8eIp$Y;2#5fvGs3w>jCo3SmwPWdrzEYS~a6CK(#d4a%bE$jhz4yig@ zzP6!$go>70Z+82#bGQ0TN8f4u*{7D(UPEQF-pWCkLWm^%G%cxttB5RbIM!fZusDca zOii>`wEmFTez@!T2JV-C9@^zu+pWSg9N^?Gd16Kq17uUf}FfCm+Y=<27B-1TR)FqxR%FnCK; z8%3Y-c3tuS4ZYWYqcYx(#zVBMh5*gn*V^JaN2k+eW%}zAM*1505w)R(SMNrjs&n zC2ic>=u!_|;8-a7(gky=j-HD8v!3H?xJERg5NDr)llAcbGHjMtTgOrTCEie=x?O`5 z!XO%=Em|A^*M8cn+}X!r7+5F1kP65)qWJA@N~?7`^naO2j zZzz4-YwCq{;;$m|KQ~*=Px1r+O*dJi4h-ipK%`vd#kWRC^{BZzm+YfAsp&oUCo8_z zpEM5n9G>R3>dfS83J&;dj*gMYjDt~8$Vh;Now-GMG{cEkUm+2hRUiGkAn2;5FWY zez7B4eh0zoYUnZ5MOY$FE{(Z=mxp8vG{U<_b_MdojadLKj2jd0-%7v0=1ha5=ui3z zTE2t!2t{rk*lErAB}HV`lPr}$%>cxJmc0Onb&!98=G@%3BLqB}Qbpizpxdw;m1}AD z6hI%C$$+jqA=xpj^4~#dxA*iXn(*CQz!s*u(;}c>$PI?{)%x$C+@F-B&LqmXj-=!w5KP{c@ z8$p$gY@$_v_1(Q71Tg7lCqUBLg=`0O06bbsj|G5zBP2WYOSV6Z-c$|UJtn<%k%v+Y zC6snUTB(3gj+Ia z`OeA83=z8@EiU4}kbs6_&~kYoJ4Crgg^s`j9cBb!Es5VA3{ODEw^*36@5QTru6t_^ zSs!wf)3ms(=)(AU;a@h3zy8Ai;2syh9_%k)&EJ;opV~zehpIrYxXxfF^xKgfK%a6H z>(Z|7SZk2g{ra(C+*O;TK@(ob3BxOO6Bl!@2U`vYGpyeKYq}31Oak@Uwvg!k_1DK5xf8cWeTmiv_^OS6A>{}O0xMG& zthm^Kv)r34cJo?n5{Y3uHC5*y~vPvNWDOShUrYdSc)5?wF3;;l>pdaL?7y6KZ6z| z4)bQ7isFfaPK_ZqgAjIjG&Nb8dY5LRFZ&(jzjJC!2=lb>#jobXxV|w1R3zsC{FzT{ z#mI%ibb1#Kydyz?ZLLC*G_P<`19mG&XBwZ(|LUut9B>bcEQ6uOL0EyNfB}&@bLGz!Ry0ZdTv-8T5btOS7ehn82<} z_y^Kk0HgdO4&KR$+=D7zK#a|xsIf%o)(Y~Q0Zo<`2K_S7$P3&9007(!VLY;Z3w2O% zFa6GTbh7E+=pIK`6LSqk_nI?R90_L5&tcccMF7t&9ClRcEb1^-nppA-@2>cfxVf`> zM{GLrv#^Du8fe}vUoi5zo^}rVZv+kh%5M70%KP8qLjC94%`IL^hlW=cX8FN}i#KVS zy()d)_k;~*ZD$-&m$evcg%6B|9eA1^ox)pdxSVMEsK8K9YMB_AFnMZb(^D+f)IZs@ zEHY7?R{aF)Y~p2G2&T+6rmG7+Q+kr=ATiJb1J2853o<1b8;ObeV)EGz1Rs(9&=Zf` zzc_IFsL#i97lQNIRHfcGDZW|VzppWnFQhcMXyK-4k2nqf@n|Jl&odxP z0s1L_=~=Fl=K(X_^7a~v2D7M@({!`XGl}t+5Z_9rmAcEU)N=)lOQk11QUu7cvKOT< zpszsjuoY&)EbbPXJ+x%n-hSzbaN_+bMeGU>gNbqLv8&lRhgk$Ri_51W9dnO;^2Kr* zhqDr|DwyA6O+O>IJVt%Hht* zH`kwuN5v=Hsxq0oB+6>ZHdbj9OU@z|;(%HW)nX)0eniZsaG93gF)FIumUHi|@IvjZ zSg;A_`iC^W43#bag&NQEjW;4%M%u86E_F{D`2gu-_C~4OM;6lee*KzVdMDd4-6PMF z6d3oei|<(3XGt;V^fAS*leM;7*4?VmDjpx&gA~|aOZM8ln)?rQ{n5gJXYeN@-6;g; zHgLGT(r|+|+Mls~T5!~J^DMS}>5Z%3<6hakG3jcYgIFPbC2ekvAB_8(9QMDDgnu6i zf9yyQ&ED^EJ!t~3oFr+Qo68n{SwUOqkts6+!DxoL1vWm`s*-((tbtSNzh6(;QI>w% zIHJ*Gc($PAV)EI>JuiUl9>b7W62rnzuE?)~kYD=YzdY*y{tN$eOJ@WiohJNlK(hzV zCAtz}*XH!|SDmcq5FR#*oIvA|5$h>?w*Szch_N-i69PeE*Rb;#9O0YG`qw~PqI&NXICEcMM{Y{mWSb+(I5%dXwwpf|w>~Yz>TCDd=)foAvnzCeiB3ek8%0 z?@x;Yaf(OydHkJ#wSn{(RiGCk33eeG=^pQ zO6Ul~#T}+?F343GIA#ZqCo^Troy)h-847Mqofjm>R!DWb4u#+7J5yoXtNStuIYg=B zHvvbq=t;f%oZgtlM3yE>&vwQ&iY_%CCZ=>gS~*6Dd*57WqwS`UeHX74$Lb+nbAtU6 z`~KPVQ$;nB&DZns3h&)VEMwfQtjFrAcEjbukEWdXm_=K`%%nAF6TR{1c)G|ElwFhQ zTiszDT#;^(TYCN3b7{u+Wick&~SIW?MA#|-dm~hd(S|oZ`DNC?rU6wC&*dZ$@K7CHoHXawgNgE zo2^vWV7dl*&#;08bS(x)=?}i$kV8np1Ib3S(1WfR-j#-v1q(tF@9Z=@?>i-0k2+u9 zXB0N$@$o?BYgIK~v_Pq3@0$_l)Y^_b<-iLQdEbzIh~PKmJH~2fcrM3RI|mSlX2r)^ zC>o;rgwe>lu^4q@Z<2;cbH;6+89oT_j*cRmqqBwA5|IsVlxy&lgEpOrlY?-cQWxKPr{N-{Rz=%M9qPrRCU#YzY)MVE zN$kN}I1hNvM>9V>#lV;ptNoD1bcQ)?Uj9Z}vb-un02@-@9jW93=Y zfe1V@9=nGw;JTg8wHBrH8dDxRp_$;ozHI?}(juZAHXkd%@cE&gzio{22j`<@q40?f z74j6oo=U&cjCZRN%YJ9oL+_*?B7(ykj+ET48;z2S@x%+>veP#ia=IM1x7Pf?g=|aJ zCg$?c{^(Ua)k)xzRMwfx^LL|T()?(MTjN>RhV!|20;Nv}q%BWT&QVX0{cs6~tjT?C 
zmnUvpcSd<^$;-+(r`@?!7ABs!f3dj3d&Pd9gFwIW)DkVDB+uJ&a!S^mQc)ooQfz3IjXB|ShBK(OqlM|QSit*bKjyHHV5HwzfORzI1?!CN6(4p6T^x-((((0 z&AhJOuqqoVCjiOOi>a}7o`S8P(z%pL#uqdJL@XQ`c;Mhj^VLU+*Yulpdt_te_|gU%4FMkKt5Q7&l@Q$WX`$@wfm)-FQ}nHE;gTsCE>-;% zyEn9aU0P7?2qA#sj}6*XzD+ zb{OL$MIdVYmv*KLVI@z*11@37%6$9Uvng@VYuSMdjo(Hky%X2e9(&od=w4YovSYQ^ z8p3a12Z8owe`Z{Oo`)#dXI;bt@jnpZpYts+{%oCnyLsHDx)1c~oyF>5=gpG>=X|In zP};7gC5zgr3_zS1k;wCG%oO9C-v-(w<@s~8&}@D{Uo%crsh_xHbLX~8!TH`m z>TxdU4aZ%6S1pH^i9k$UiF5!K)AWs{lDrnvG0t>%$1#HRQ*#qwk<`INBV!pmLm&eh^PM4W5+F z>f5HlIw2mz02wO=)>4n?jNccO+yvvup%rclX|rJTH2l<$lB)@jd-QvAiA&!>tkBWoi7GhyBB7`K0OkvCbHV``nQXi*kVqLINc{GJ zF-bww0F2~*WUj_bcN?7qR{KS9f2w}6B_(OJmcvYX)K8mts@Dwyd-O4=oxp-&+N3&h zxrx!!9xkXJl3>39JLf=W#W{5)uI7EEr^xnut3nDcng+*xp+>TON*;Nh?cP+x#EU%% zet%Y-f-m^gvg=>}mR0*D0|fyP3n5iFehf`)Gld-o1dY$FwV#Vswy$P>V_xO$L;h}e zVTeO|cZIDSrS>*8QTHftVZ|4@(E~gbHCAtKFsy;5Z9B5TyRu*n!djsT5iF9^adYva zgbO{m7s?g(@nVQ{xBOfXRmUvTk}qz!R8Az>`fP0Pz-0d6s!drNXk@iO0caWksEqv}ZQ0Z>AB3y#feg_Fb(Gs3;d(;V{4vP8)t)#(|$SVOWkNC0e z^K=;S4)9;Ziui(vEdr1`iby8(L-*)0OoX#w93V?iwcFu+G&m}HaG|58B*3B+|r` zwcj=M7qD4tNp}QQQcUah+rBpd`nBhSa*EcG?k!I(4H_@jG(nt4SS0p>xu!rOWQnC|ge zUqH2y&;Uoeoo^ocErLKAdZJA;gKq(_!2I~<@D=`fQ5-!K3#hMi*-yWNQo&eOnoW7A z&mjQ9EjR#f_^RM30(}Q?R8;;4uHmLX@N`%4FE&f8v!>v5v+7amii|1Mxsc$-C*Vj1 zPaMxYe~+^I86S6GPW6{8TSXCs8=ycpNKo%5ZJh1eMbP_dX59CX{nFH+lOmhwUKvm-qP6K{ljJA2*&sWJb%ci^c2C{`R)Yc_ z>kTjV<=EbW_!BNMo&Q-|fzd!*MFw9!1aiE zBYx1M1kHmah*zyd^6n^ zKy;d?k80-?R|vyoGRGY8Y;#~^<-W0?%JF*PszmhPMD5gtI*z4rN4qWV+DBFoOsqPw zJafq9st0;0P9hlOo63^CRKhCYGom8 z`pvZ~JJ}gBgJgWnGRvntOupfuW^10Vz3ysK{4=xoROEPTdjm87$>kBv(&EAs0VOB0 zPI%OKO%PY}#C&by6v=3rEuE%E`J(Q|Lipl2@&%A#rv5jfN-};$#l~j+;*{YP!~4^1 z!%k*_l)w><#%Q!5XbD_c@j0rl2vM9u!8(<2ukRB}$?6YD0;^1k)06sL?-N0sSCL6I z575jpj7)SHg#w9jnW;WelZ~q-+q(jY@|OZ$SEY0v3tV1oz)2vqju22IhAA3nUpl=; z5X3n3nmdS1b?VNZYpm&Dyv}{9E0K&ynN%#uMRs7-YC(f0JTz9oVMyBt@KXyWEVL*s zc_owYX_{XBf(C~oXe42D0UFpBCUeW;@@MvawM`XcQmmFp8CxSnkd(nRvYs#D-6YmlH18A!xk@qh|Rq z)H!dq&8$arO~hj2^28nyFIzo^Nj;s{b_y*F$%_c%QaJKltHx2ESvV?`)d^8MxC&wI zV8NFM@%QVySP1ke?vV=KS6P7yto3IAnHY5R$uMrw9zX!R*R^!n2cJ9uW-*X`v?UZ! 
zK=G&4&Dn7eRq+04OQC|;0s^BxcG-(50$~pi%i^2hrnTD;5lS_hm)^SI9-{4R{dDnw zWyA4yoG5L=!e!lYKd79`@l_NP3Htj zk<5oScXQTZa!ERcuq!!d&35@#ZM_K^deIb2vS^*H_^Vhjsx0z6p`U|pB;dllvAkSy z_lQN_z)bqr%hieVwgz85#=kIOX7iNXvh~e8Kj^4UN9}7@{IXg{!)r1>uE|1OgN0AW zBB(|=9(NH|?Sf~Dr2r-@ufVZUUF$d~&2s%~e54F|vQ-YsA z2fQN#h1++^iaa1@?g-3x)6H|{Xw&al*UL~obekZMmB#G5=_aEkz8TFUJ^|ESA0+_5 zw#*weGP0QuC*IuAoKxT%XK_QCIe6HtXQ$}xnh|+~LRjRwv74@MI`eH$$(UCuxC!kJ ze1B$#N9#-LN<$Wm@{%Z15P+efk{8ZTaN`Rv+49o7X?04tjj9*?nXm?-!R=|AJ~FqY zi@+F1uDp|ESYACu)Q7YC%QL7Usvkt)uB3~S2aO@mH;ESBpVg+DHgjlrjmr} zaTpc@_;hBS0G}Dr2cUMYX;}(HSzPi^?84DYY3SioznpcNFUtf?7z&1u8V4U=HrlqG zZ>Tp-_ zbWL7Hc`gEb=#PHoh#%hCG2p7k>D&e2`7}QomTzzX?pNsgAVEec7=ZRef%ZOfzcK<% z85H}C5d=h%cL_iz-6g<9f;0jCda^O`AKXmdr!Bj}XTxkBIu+dkuLRtt5@)&WGgxn= zW!?CN4k>FMaS6G5#vTbzGTEmat>}$u?*Ef_{->5n1O(%!+wgH?WAVX~QlJq1^@-<@ zuJbn0Kl+j5pCD<1&S4!y$pa+Nso5wzFR$1m#)P@-D*|2X zfq+bRU9Q|B`hRqlEY4T-KpP+3WObr}yMfQ-#0r;~e#H`Nj_k8|*@)CesIE>wO@h#`b0qXs!Rhy-E!X|B5_e=Z@M$AL>5i zKb_-vSa)y5?V|KCtI)~gJ?-EifCB;L5?+D>TCZCVSV?0YG*s+u89PMW(yYe5&RIWC zm0nEeVqgzmc^CY*Q1h4U*l*^KJO9D_@mHF>#e?aR00N75V#9x)O4rGRz|>RFbQ7wX2HSRQ%p)$#>>66U$HKIdbrZh!heXP*=3&;41_%!N)OaB^fnwH(s{0povc z5z6-K|ItZ6%kfV-iHQP&0$y#)kCx+GKxFpw`>r!Nl#e)gi%*2!ADr|-k!*Ji=*`U{ z<|+II8y%G^YA}+c)VBxUUpu%DB4ckHa;@C*TZEx+!@exp4cp-;-c`tk>q@SFQUuB- z!k2-Y0b%DKe8n%yi1!zr**aJUZ+riPMS)flh67u|F})&;fo=bxBP0K&aDRLy7)X$O z898vpvJ?ZE0tKv#BWcyEV!znU_KDhap)EVdmGAIhe5OVky+^I&Upci(|LfxP zxtS0B-j!;$XEj{Q%cGW1Ig$HhB|+*c4qDjPT1ZInlw%kH}w|H}{VrDIL|_ZJh>gve&-`l!Wx%a9Y2B_kR5yf1m*WasB*%ZpEBK<-{21M{D+N zB-8F?z5CNfjW}OoJ{>e!se5QgA-|>Xt#OlXs025g(PPLrtK#8e_pU9hv6vG=3>kBu@jFsK$9z9W8X2`OZv5}!Ke}1gapr06BonabqfJjYa z@Lr#rF&(NZE7153GR^A1h#q*OY1#*z!3-7}@9eTJhLNzA#=8q|v5`y>Cl*tav3z{! zmgxqQO2tr%3g@CR>Ym8S#D*g$f(i+xlPT?@q1#3`KurCKz>a2$;=*LkiU9p>)xP(9 zFJ%~yo?bsM-R4B3qO+!Hc&}Zue2;@gFYMyXR8;lmF8jlW>o+HNa+?x{R^VrG3a^q-iRQ`E_UnNz%{A*^ zdVM3clI=b`L!2it0BwGG`3ctZ7Y3APB%IWRb&0JxFWKKY$8eZq9c1Jn>ce^CFIUwrCZ9T;Q1{LF zlu8{pTZocv+LClkX+>FhO7`4@P2_7!-$%P6epwdQCl`IseKm=v^X=;l;u)zrBA&KS z&wtA?=6Z$5;JSZygHdQ)%GO|Xxp&(8ytD49J2753+@ZRWJ|M&h#=8g?v~A~qhSu}z9i=g%GS z0)^b8%$Fizw@|(JIotq}>0#l^=Jl|VmQ9l;N)Bc*hTt-lw7%hxE2WYEZ@XkidDMfQwf5dX#VpvA|Du2)m-v+h?~GaFRhW6xoD&Lo|fr47Ot2D1D{*{g@{byEi7$6UO8w%ICa8E|`=_+w$pZQcB9QZ*f7o zL@`Qufya}e7|}7>W}%{gtPq0?M^@H8Ca^i~_}J5h(X|k zUEq6qMnwtm?m-%YG{IJj20zd7_PFf;*tsO5A6+E(t0M?e3@4KZD&+t;`!Cx$4yqH0 z03-Zs2sbu+-sT0Sac;+#?>2h23P`cn8{5RJr9R20T;Awan`Bnl^reT<1eFvKtg1Fe+KPCk~1=8@d+UKXp^ms~(H zOC}-3_pG)l=veetwLd#!Yp=o7!9n@0z7+!y*WN3Ns-MHmzw-;o`e0|8&*N?ZY|Uca7ZhG7`!%utAp(N&J;k2_{eqT&*WQRa45MQQti4iRU%;7m`d z?5d4fDMm*&$Cr37CMPR5jwW=G0j!K;f5V;sdYi~a%Px)-mr7D35;Uu{B{I%6I^9ug zjY)Ksr3RRm!5fGZU7S3=xQVV1yN&vkQTpY}s%cgFEsP`MP#b@ORowOUZ#|qT`~u%W zjEdbCT~*2cOA_^a5&R!Y%3|enJciz@> zCk2CgWQvOnUiu{#>CEanA&iQ6bv=@jYpr|xpS?PLkv8-?hfYaQFA}uBsjg{wg4%EL z=OPSP#7x#BrJnp}fF>CG{dKe=xQD`xnTk1L1 zdjId_{`aE#FGgT@H^BtT3{R{J67y#A^?l>C7I|lN zl0tWONR_WEc>gyvn*OPV`Sn5iB~tv07pe77&9Bjwx}Qt=ur~uy8YL_)1sW0PuQh`X zqGFUS1NDbyHhkW8w|A-7M!o`;D7jF)T=}pXL>d_s@~L9okR$m+7r?U06cSyQYFYEf3sUYMj&xKClA?oq&m$jyMsPWrok zpUH;we}9ADtBPNgypngVsH2!kZ^BoJhnzkZy=C#q315lhE)y@qzJM1dV&QhM;O%*2 zq_`m*hXK_%YZo#H2szf$%jf-=q?)8-#OPN0&QS@gN)>|v z5rfqoQt=7ybkKgoKkaY)`N99+!lV12>ac%pSs;63{e(9_2LGZO!Li6T>nukdF5*uNQ(9s=0Nybx@j)un4 zH%GN@I7#nX=fcLi7`?~0h#T%PeQlJs@(ZU=(F7yT76XK%ENGurCy4P9jB@pgiKe_` zwX+ClM*gEWQcxA3gmMU$Tl?J?{h^xr>7qQ-!8`m*+|3ikuQ*{%KG;arI+z%{tc@`& zV3?|RtP`GIisO8R>aASxCRJU$L-qkk{Qh!OertbyRa(K56TmS%QNdYZ7vB2D^&Sk) zTiL^{Dx&yI9(EMpj^VaybMEHiNG96a`=`$%JrN2~nY}Ry-M|9~>Zs$6wCLdzMQr4} z4~8c^CRLed_(9U6=Vzo6cz3!%;MvXHFFZip{F6|aJuiYni#!~NR+0E(FmRXNHvvCK*_bG 
zYl*6pkdc=O-T9oVNq5_LnUjKvcGEgUFJMK19BfEh00~ApZ61GqR~bgK9y1l&{F*-X zbwKo|+wVHAVn368eirlvQMpxyfnCS)&kb@+`XV~AE~Y++L$>V*Z!kvEYYSDFn_t>_ zCeVqzeE;a<^Z>};;E#3JZ(sOd`i-mwTH}Mu>T1zGVcLh|859C007z$ddM&;YXgt0( z*Z*_l@z4iY6qxrf-T5bh{MT|cfDay^VVi0w1&?`2E7=Zn=%A@;)D=e}~gliHcTy<18)BWGep zmU%^C<A7l@9T6# zXlwd;d$c%By_dfzr9!*mi0H6(9tdEP`BNSE*9G_Ig$KF9TNInY_ONsChcv)s$qDDJ zh4*6tSN-WjhG7EwS2A8G8ZEQ%C@#E$R4=%ZUdy*g({CSE>oE&7tkBq(ajUkz5mvxH zh7gYifIK2tO#qq<2f%!Dqi_tvQQ7hGb2xxZ24ndi;|pOG0Q!x&5haBu!2!j@P#qvm ztV5_@0s~k&G6Ff=!DyoyfF}R$F7)uyM-e~3dO*Q01U!{;A_@SUqnqjApJ5#ctRxtx z1YXa&_Y)uxJ{NJ27t{x!lkabbA@`UtE(9u-?;u*0UzTy{->dldUi@1w{yWadLMpc4 zgcm1{E^C9+`KD=($=^OSwMp^tIXhTL7So_?OlpxC6Q|_!Ex;(Q!^-|n<~@5gKD+gz z{cCMYlJ5*+SPh*#p|=qC`}Ay2F5`qFbCHd0sl&_6A6;SK@Qomb~e=RHF%2@(3STL zU>4@vo^Q}c4$0tHOITBvz;S7?ws#kDx!ZMmyDMgC(D*PnafGzp0N z$P3sJz~u+Z*A5r7&tnm$V|xQ+T^jkTjA>J0wdd?HT;}0%Gwe~V1r#aQ$$Aw?K!F~A z=63&q$>H!ixryT}$70YLozOF)`RCxRD*0+IE;8KY&S~e(y4`vVUU1!)3s&(1?ZB{n z@J=V~`Rt8DYg9aQWygX`c&)ClT)TH=HnK~ytcR{{P2(1W21kX8C_sW;u<}O^NBGv( zRO`6ode86$f8QyS2Odvu>7?Ed1fBj&mKi{d&w}_dpHJ5ib^r#?-pX z#Hm{^7+_nfL;^jWS0GsFi@`*zRexg1|FN|FIwT<*MAi&jKN#u?or{N3Ay72<6mCr3K=X7Nn`5od%HqA)V&>}D zpvS0}5hr^@hf4qUhyOvKBfpD+Zjzw=7h<;~J%NgL(6yS0E%{4+68L+ZA$H9JS4Ege zZ{1nD?G9S4*XnEmGDbIllkX>%YPY-q)GDF@z6wfg+0FOk60yo$Br~!54cablENf*K zLW&ihzM`n9sTnu?JZWOq;o{=L+Ci^#@m4JTJp+ALpiEDu4odwC$69{k29{_JMibx* zs0;?HYj0--Fj@(u(SC_v1x@psEDe#u*PN9h>b6T4ciaX*et#Yik-flO!e=+pp^e(% z=j63fax<3xh)9l$8SUu}$U(M9_R*M)K5~8C@+IYu_qz%th*tlSTT2R#Xrh@zoA$a=J>+S_4o_=g?5w;@^IN5Q~7bBqn^;E<=$a{Po zd~uAx06>yT?g1-!5_Axlu;KY%(2`_K=3sfiGY^4xI2A64Ny20`6niC ze-xD7QFgPmpje|=4U&6D!WpAp(=1@5G25Xk?}Q|Q0l;D7WR*<$UcsV$L*&jG+K-u~B;^9N7(mrn`s55)gB zvpoNEUGOhNC(xW+3F^fJ4ftiZvnxX*)3>8(;w4kYUPwxUOwK)!28+9R?`j=EEL!Nk zF)#i$Z6u$A65Z%SSI!N_$-7{L@Uv&02T{rw}$yjuz|q z+J%=^cV_ULtqcIX{W{p;FNPbwKAUpDh;#pgxX{+)DodUZ=!lpmrXt0{6-&O6`$mnG z6r5G%pmtKDv*d$k*J4AYxIq2^4n3?ns>2L-C~CyIM(2pKfn8$%I}ckipf=XpwZSui zXEL&#N-R|=iOl{kIkznfXzvvb13uv)+#wcMEdvaU5`VfMfZ8M4o!Gp2a(5M0dF=zrh1wh^GZA1CIA2$qR~@15 zwae7aU*~i&l^f!_kzY!`Y^Z+D$mPwI_W8GW3uUn{-YPmT4r`2kpasCPyV2Rlk~EQs z!-eml>Iv-wZ*f8(fV(4tFZv?aoq+?aI0{EUfc673f|#It4q!U?b{D_{m~$e>@*M=p z`@!VCL$hCd5orEhgJQ+NO8_GI*S8$PzXU)Z*}6bK3hapNc={Rqvlkf0LIVsEc+@e7 zXN_n_o&e1mQ!9i|o$$dz_?##%6z|jxU6G9^P(~a@;N<6*39&clUGPAuV$&0DYXw~f z#se`C6bUtp;5~LA&?3EwYa?*Dd3*;czs5)raPWf$Xk~f!?ivhB@f`%L%*>tUm^uf@ zU0osoCXq(E9^h2DgU$F33WA({A{}0rv4;%z=XHbbR#CJT0&<>*E9UQcyr4p0-5=lnZ<{e-*-p}oH z&uDIkn(y0lG{{EQ#QlPfU%f~nm`Ed)Hb2p`rib1b|NyCXqY-U=rvCYz*i_*WceYhkXZeIv@|W2`O};;sEJyTHmhn ze$eHB=YQu7ap+)t7J6|)7?DVnh6}O7EcdA7mj|g+>JvIS7-3g(H!{F++_T@;=5pStd zsx%bh>PuH$Q%v7`v=}`x^NgOrH6J+aOsYw#J6OgXKD7y7V%`$|rglH&W=;+%zyuD= zU?ks$@m;Q~WpR#|DYEMt{(DmOUqZWoDftvBKFt~MZv2ao9Pv}}8lQ_dlZ>;w&j8R2Y$}-otv4N`eHf|=BU}rb#{<#f`T=*qXxsAHxo)#C{ zO|3g4fx)ePdLw0YeRWk$k9otbKY2Gu3$#6Mp(hf_F8E@9jd98_#mdFn+WM^7w3%6Z zc?YHHYaLp8(7RV45HkpL`BcZBr8dJ(T>+S!DMd-JG%ZkS2_^I(keN7R&%ri)-smN8 zCZso=##Ll&Xgg1ZXq%q}&Hkl81BmHPl@@w=Smj&HPTG$+FfpRlgbbfGaHmnfQ|~NZ zyl8U~WE^~E@$mo%Xu$n&VvVIccqW_nL}V{|kR(7Ey_cv2Fd!Q_)uwh_aFn-Bx&4-! 
znEmy$OtE17_hg?vf4gTyB>NLz9+rv*4Jvd^&OZuxOBaynbb3>1tBaYq{wGf!ZeIv> z$d2&Kj(>o4e1{cUZJ6nd6WU(hjhQu7Cfsjs{|?H^F!9C}7mK-F=UDi=Na5En2}}g| zn>P@P3qSnI2`V7Db3N*Vc@mzGU4(mq;oye%h(_cc8xMthBW;u9^gKw+=X|(nIx9;|vpA^pj2<++tPiq?8gQd+y*sz8YgA#@D2Qk= zue=v%`_6cJ%`dQ3+tKH|_UX6o70MqO?rkWNzN3p(E$o;PKf5Pz&!P8;wY^f+$c)6s zM9&L9pI97U5jOm=HQl+<(XD8;*K}=&dHd-pBEA(9#pCfi9Xf>uLt#mYGeAF3(v_CE z)g^{i>7hb&&&DA<(K?&fH|+*GHI27=UP0>u1@jA?rR0~d%0IICrN0|nMWHqY8o4MN zg*uk7PoA{sd1N$42TEE`YZUpf8@o91oL6*X?W2o}i4miLyc$nPOxHCNPc7m9Tqt0` zkjFS;kv{aolqObQc;0_gEV1c%3QI(b_ItZ7E<=XrD| z%)lg95?N?jd>Pv{R{4>n3Jvd$;c88&*@#nhAn&Tk!)-2;S1%yzuNP_4`A1`o3tMKc z-bYuzms}2SS4nmAw%>Dn&4`$%y3l%j`*{1QBzf^eVeg}no|lF7A*F6>{bUR3)egSC zvMCmcYxZg1u$d#YY`oOeUOfIhjTKIWGnI)E3xh4|k&N8f(A;pVV9Oh!7$Fctdu0HH z_rcp^el*khk$CZz!NoExpGR%whssf>`^_U;sQQ|hH#;Om*x9vj6iaihT83G(W6ZN} zno7L%OUPqi96sp>N6g9(5xBAUX&CJcC!U#_t8qJ|*%e*9+@W=&k~(L+{=ILV)!TQY zBWj0)7wc}+HzQHS0TmnOZ|`YgjlPMJ%`Y8=zZNl*^eR7j zewq7}E47%Ci{YCbEd!nkO?=nOkXCJQWI2M0Mca73?#H*J(YvCY3x$8{urZ5Wwi`a>`kgWB$y6ss9UDRz@@G| zo7(pw?pVe~ls2p|E>E*Te3cV#D2)Un99!DUGWg^Y$#%bUDY zdE@KRQDbyCTqLH*xYYF6+Vu|D)=1q}&P240Ssgb5sR$ZiY7i7BzrHT%yvdP$- z*s{&ZK$m(vYtiq&vJ+;RHdJ)b-~{VkIw&bx^`m|XDJgL{XJTR|vTFOXm6JL%X}iNy zzwqo;kGC6XdflyeG0ew9ZK%7;G7l3Q;ao4!$WCZDL}ZLK8nw-0JcATxP=(k{-5}K( zm~)w<+gW~Hs?L&1M+dc4f8$b%gDf|`aCP=izH>Y-MRj>(ZO`6svN2%8t5)jTxZ%#s ztTx-Z8{uj^2_bJTvk0jH{ZJ7LW*lsN7z>bf=+L3e4oGqwXftvFUXDl|R zG`xqMa+`*}PP__u1z2%VD>!&Y(e2wTHhdgPH5Ib}pXR3RM_{RKp=ef6 zG~rARK*Y?bz2P(`Q;^;N0sWQE*LlTO4$^Z;6=*;)?xOEdM_VFvz)cB`RZxf98nS$ol5;K zs_`>yDtmsSb<+c7m^W8)p+ ze;pquP|Pl^yjd?|A#?)t2j6v+aMGn>*cLw6nS7jr#VaWAMuf#B5{B2g(K!cbsLwnxK&9d^yRv#^tTu zZ0!p^e4I^1*v0qHDl*?E1$B3tai6;qP@Ke@7u7YE6BYE0hj()#WvUB8N5Pts#|luw z9I-KVYE$C^v6|?u<$0>?u1yua^p77UDeOI0bj6oc%R`ySm~AbucK395(v&34i^WVc zzjtqWTi_3|4W#qi9AcL?L6u7y@TpHs8~T{0wOg3Y9e$+28gpF2C=5ndUs!ovKeE_V zt>M1y{`7uiD&$mIe^=%+@=tD-20{&4ZW(y3SV?t()yZ0Pt&h*xhKF*Pn{l=^NpAx# z&KgI1V$g%ShJ7sB85Uu|n2+=I@_ZiDPb!7a>>2Tfm?z6)8nH$7R_-KEkJDRcUv3We zZ%-X;PQcS{x;TX_A6O2~OdU>)?n57E(;-mK0oO(XtR>%>S*lvj9x(5gJ99}p)pKvk zUpbvfY?W!##I`F$=;<7!d^x^)1OcSBE4mGDT2%0Pd%auc#_!5-|JoW`7(nP*UGN0q zZ~@6-f^}?_-Fg~url>SU&O!B+`IZ={ulC8^`+-B5DJ0^DZ;5B#f?kokhzjkktCj6j z)Hyjh4dtcV=bmA+Bh?V+01+nwU2aUlaL2OAllkn9Xk;kBCw_6tH@;mqFl_2c+bX81 zMm?d)uDkZ)BVEKcQTxL-qWT4TqK#XmB*v%6TgX0uY_KkE92d~e(T*qtITo%pB$2_x z?{u%gH19*x#Y9!r&m`uF+Qfyt!4fO~d70$5YG5Z2b`XtjPC8LrfmL1{lGOMBH$(3= zb4k_Ai#u97#|k`#7+(>(f6rVK2UP^*FZm< zgWJZlnGT>ik%pto7Xri?2CAoRx41zK_^Y()yA=<0AGv2DTRaJ-1Vcz;`|Y?~=^A4{ z3OuS+XV>EphLjkj!`kweqqJGDQVYB#LLoHlYsXLWwtR1$S7lJ!4&@RLE4d#Y_|E!3kpQ}nK%ixL9=mnxxsmuE`LFo$iUT>m(8ka92rr|Us(6uuqh#J>SfjiWj9fEFurWs zWeHbN0~t&0lOhvnJ@m!ycAM}8JuLr%Hj$%g({5l|*RhA`nzF8Tnh$r@1GCD{s5wjc z>g)Zo^pCn(MwYQ^n+7JwwO&VM?fCdeOqKMsAmG70s7LGRK51wbOf}pYcG2D{?W)K5 z0`E^@9&thj-DTKgsXd8>-Q0--0?EWQGJ@oB5{@4DQ4J+}a(WtI(m7Qm_#Nb)N1(Uv zMDFNV!PgfN;uHtjz@M_fWmAkEtut#Aqm z^E#NiibWEBYK7W%sN_Kw{Cx*o*mM2L7xbiLg~FL*_jYhR=0Y;7UB)F38-2tE>WUOv zmW`i;cZ?bqNu)XPfa#zd_t2D&}SJnLnT9y#`bl{j^`8v zfUyr}xyy%nuN}&!bnKYjITNd&9T}BZ#eE z7&4|fb9eclO)*%|RQ#l-kXhpIXF6qi*kGa3iip03qz%u@L)z7{f}cvL-7<~KkaND8 zs7hJ#xZaoRRm;J}tR}=&<5@d(`O3J}EG7omWUR-eQJ#Te+qhBKc-oGm88id&&fddx zunT&AZI*vDfCn9q+Gys>Q1xKg->B6fsd5hud7S~;A@8}D$##m=F|pLiFwb76MR(b4 zX0{hd2fWyvt79S#`>5OR8_)cHt@M3Jbn?{frm&G#+=P++iq|Y9rXWfLC-=?~ejdl2 zMK7x-ljQZ~1qb)_&C3_>US4$7a&@ewh$<1#EsT!f6Meb26|m%$ICpu$4&^SdG9H*$ zvb9sqyOC37v{K^fS_5W-T=wnd_Zo}9Y@mCe%MHz`yCb|AG`J3vP7`9}W>M8sq|INa$0fN|~JlvfT zk-P})y5W5b5m@D#sZ-Z+pZ%i+>C-Vql+UR3Lr4Wb_wABf%}tqC#Kl5ljay}>9HeRT^=tP~LUp7(G#KfZd!Jc!Ez0ahBdwM}9IBJ}O zqkbG`kG5RU1~{S{$?9tmYRr}=F 
zw%PMtxa56!8x5X^eYw5th|dyeX@AdiNOh5`qhHz1x0-n~u+jN!BgNObIW zMW0wo4-?o3{eIBsYTvw63h4q0dSj$etPkhA7_^w`^821LLD7yjlH}H^10Zx5esO4S zlrS$HAGiMPi%oL=LeV6TmN23|U^Z+hKo@($lWe*Za3)Ig+5P%Js%SkfyyM zJah?e4^wpXake=sF<7v5CkM+|gjvO$7+KzX-bBJ|knCeGIjw;(;{cy^P4+DSKm9mV z;$nM7U1UmqSj}rl4p|as`h%FdV>hp79?v%yHh`*|cT7oL*S*z{_|=_zC!!xADn0d; zZ86RWCy_XHzbZgg|+?xU?M{u3pk9XBs1OTMmz&^;BqtwOydDYl`Z zb!V-{z9|2L-qAw~45U^xLPGF^1N((LV_9PcPhN&uTDMQ$y%PO3W_g9m-m)l1b5zH@ zR;j)(^={7ddDN2>;}P8xJ?|)>{Yh)K;1d-Fcs64!I!g%1guoW~!yms1j6M-lkglF- z8ZLLac5AS^IPN-M8N&`xX+K}|{KUly>V2)cT5})Hbx_@!>>9ucd{Xw=a;i+K%A<;h zxyZd>tWenB!5k}z5!l)8Y8Ju>;N&m{Q`j=okTK`E3Q?+L9b8l;#Ptbr$VbmpWkOx* z#_838$=ULWAF9;s%kuVRu!G@+L%mmwnPPL8!t?mu>!Lm6a>sIJQmgw0BX)yE z(su)tlWq_O-lK6>l+)`X--wq-c8(>5Oj{g#ggC7AQSRE~9n(^j0t~D70<{9D*yh>J zgqv^{ZrYk^HU#wSXSYRdWuVCFYDrj^RcmW@Ilsz~1jnMB-4Q+O))Udjmd?g;`le9}H zChUrwaYx}o#Ru)nkT-semr1Efa$LTQNbo-h!&;+^AuKc4h&Df}@Pv0*kKU#VK(89J zpOW1&f5qm+9j^=;fAQ&5RI6;&o1Dug`6&plq_$6d{d`$%B>x|K?;X`tyKawypeRyA zL^=^^QUs9>fuMYVfPnN)RHO?C(g_4ndJ_d;{gZy(c}odA3}BR7VuUf=z!z~h&R zMX#9O8ec}Hq#`QBERS=BI`qR@D4K}&C2N6Z;oca9=VJV6ZT@8w_6g?0l_@tkvjjA@ zQW}L6Wai>~aieY8**8*jG~L{rjt*7KBK=YFdAXyVSV`v0?9oiO>O~=kdS6rhnM%`j zJGFVn=v|l%E~WjXwm>*}n>R=+C+np`IQmXUZvN*Im8Hpgm7C_-*MyV_r`7T#)!ub` zF7@d4Vbj+xTD~jVx)S^tou3CH2Z`~t6o4QG-g{3Pa(bm4dEi}xYtknlgK!lCh_P8f zgz_`2;w&Vcob>Qaup%2`^Vb79L>xX}hqwyMVP2?cx@n5l$RazZr z_)XE`bWVbJleHuM#IFPShSg}XCwHtPP13RaQ1<9{qKEh}pszYf|A+hn&^l29Q3=4f zzR)+E5xMgWpbpjbp)$Zg!Jc5*Zwi3$S~rlFf*e`EkkTGz*vOvnzo&R8_a(L|7iL&H z5!H!pLyZ2h`U$6c&9#kM+e`dnp%)%ATvS$tga>!sizXmCq$0pY*UZKuI}V!3oYLkgl{*( zh}rUU^%5FfO8AVgi#o`0{b5Tn1~!3&A0V&0Tvhw*GA!F;7j4Uru5?@iHS^~}BKhg3 zj)+$0HOj;^#0z8$jVBgaYYu~aF!~59?s}hF+ESkv`?1x~fE8qQ1+e>t!5Ko>2b9wq zi(NuQkwN|k>k={<;tp>wu@t1{(KtiJUjBIy~P^QEIQ6HRy5p8O(OvkEA6?YUmQVWXf1JV}H!sKc*}r0|x2iO>Z~4;3JA{q~SKnW3{Q%`I@qqOfKM z4G;PjVP>Goyz8UCD+vSPKmq~2~k=fXM zXa9Ko4Sp^*{Hx*DP`Ru{$@kNmDhig)0#_kX8dJv?rQdzEJmZ(L%~HUuT{B*I^pOPLJ*Oc zYZ2?$iDXzqJ>vBp8~Bw=e6{X+eLZph9pyTQUTKkQgGpL_-mIqIlVVJ_pQ|T(uMS-d zJGcHv!;2@wGzsL9Kdq6%ATF0kGOl>WP8HGGVQ+T9wB0{OYcy-CC;Hy^UM^s#=8fC7 zM=u6F38(Nw(5W!O#EB9M2RLLK^4vNP$Jjf1{4Y#+K-AjkPp>_|9j$-%fb!SyKa%SH z2j2tTBC!xn@U**xHR2Q=*#DWEN2OZ})@NC9*>&Q{0?Th{#|D`0zbM#F!i79pz2fD)W_CKJoi2@+W!rZ@}GvHh^5%{E)NVV z!H}4a1#$ZCw~69k&Zm|7AAPg-4;*+r0haLS;&Y0C{T!j}QxNHJzuTduO>vUG0P6I( zK=L34k?miljBQQ|F)> zV-W~D8DxMsm%@%nVu|ZtN-yb0A-{bOD<5|rLPHc?dXQSSnqOh|L=Wz|EnMdLu$9F& z(=^oE2Tk0=;sUUQWxBG%3J<9t#ml*aN@H4Q7&0qFc79MGRZq&lcaA~OFxJ*6WaC&% zJ&+68ZWBaiW$ccJbcz3wMrOx#e&Yuvc?iaNK#QFVLFQ+rDO6UcI+F1rZ^KT}sC~HS zuBSNFncjj(hRdjV%--SBo9dAG`x4|?@sN?B3YZ$cC{l&NU)`;+UOm|1(48&n!tftY z@84DZ0gHRtEV5r`#A%y*A>lU#bIYt5xgihH9W5C=}#1fd0Kw*?#F=DFFq+ac+EUYP^QEL&&jW=JBw;p4}i1ofFY19 z_!)(nuNiq+hji;WR~Eo=VmX0^O5pfCv>Hene>-&bHw6YkPE)f!J|e|#z>o*|fwQcD zcdFT&-xS{wCoYc3#L{{3;<@wNFoT6%4*#r2^)}*YvW@r(uE%}&eQN%LVG505ue@AV z?gtf|Oz-qJXlq20-Fv2*Z@U?4u4U+ho}ieADC62j-`Hri8ppW_h&y~*!UQ$XW;SjG z9<>KswFj=Yf@(<0W4901ep6JH^Id8{32REgy~2D@lN!h-wuk@11__iE{1q(C%`&1X|5vjy4Q9u@;01#U{SyntQK!lN2r>s zlUhFZLeFYu!xU}-0&q}=E}J^ZmO}Aq2WM-Wc6S> zX7Q%EQr2je!o8;4__MjkIZelpkrnR|)rGAH z_Og6mc0WYb#4Hn-SI|Ry5cwYE3Lqfu7c6A-dR>UU-5}DQ@%ZM6<5{o`fQe34NJJv1 z`2bQ=Y8wXl3Mc1Q5CN5VeqIwX6#!Q@JD~*X7C;5gXq%vPHd#c3E)69uR2*|Z24eZP zA!oOFJaDnCnUD^~DTF32?K}VV^x^4qcv)2Z(FPmU5Y5M*f=x|zm+2f#Xk}yp zx^y&yK!O4dC_JHbMPBD+Yqw^VUbc~bPa(>uVY~~8SWN%HEe6#so+2)`p1d$g8N48h zF@O1LNQmlHc2A-@Rh-~HT`t$a=XdEn1!sY|X*>v8&oj6^n+{D}&bCKjb^&ts(6?G* z<{QL0-b@c4+0IDqDjT!XXFmC*NZm7HZ#hgYO!>tNry&xu2)+s_dy z{dXo*6pcR%xmB=tIxm0R&5UzC%ka70TJ_ThlOS1*hib~~$Sj4ZN2U;PsfP`up_^yP zmmoB5uaqTrLVhsncTv$bX=e6ha|wouM*K2w3c1-NdH1AW;c>~H 
zib>xylq^~lX~SqaLFRy|;4hR4)|dNBX(EvsCj7Z8Bm9oEcchYfC@6EkQ}|Xm6w=TG z*0TlDbH0g#IGAqsGSMfz?P(FM%l56F{EE^VL~yVVSO=^5#v2q4XC?EvA{e%fv>Q+| zyibDcofmOF>qGj2XXN}MMS1I&bmiSQ{&picA&QCu>_xznzq^^t2)#w~|2)!O&i&kosxqp^o}L<^oIHVL248)=gS{H;DE z`k^1dP{^x8zRA+4<>w$F3+bfvH^$agN6~@za};`7qMeY?A4B-EN}?knh6pX0Nlx2$ zPH`6HJuSS)Ra|;FqZsnEs#Eiq!~1|fS9#LbZ;Ao#00FZMoUOH=5XQ4^+LQ;kRR*@}zsv_;V7*T$M1Pmv?n z>IHcn0uW4(*OYp{xOg4!?RQ!M8Jy671BrwhonX#ZOG25?4f8T@Oolna2}3oV(!B`P zjV&&Dl)XFx`AKrU**3(P`2=?`FV?!q^l2t18NBrTxYMxH+N zFC4v`sMyzby*8JtR$=X_zN|f=vtvK8E%YYvHY*ZvSPb_$eMZ#2w7U1D$?7~RVkf~9 zp`whdXmRJQ1I1H>ocz*sm2=Cc~0SoCH96*dmj8Frfa8h(B9D_VD)}tFbn}kXX za83|E?mhWGAdHs{$oAcfuCt<}V2@H~o{*XkrR4~p)!vo|FVtW$t(*!Fe~zuZ_fMUk ztcDkf-VZ9h?Y8QA*7dDS{@z&Z$d0#3*Rz+oio}5VxoAuIVVK)TwUVhXVa8?-n(;rn zmb`i2%_p_8bMc7QQN4fah0aA?Rxvw1Daw<{vn{yv(#Y2l^eLntMCE;%bMrFjoN8Z* z>Llu}z7M$aJ*L>qaRxL&iwHb$#jChRWjiV&FE>CgqCaTOLbK?#(-Zv+e`i9j?Rs~ElRVN2eZ?uKMCJ~Hp8&<=~{u9;iO!h z4cZDyKDXiX*eKh@B&JypQIfHk!${y6A#n%uU-b_RNE?})@GjHViLDYJt8$;ZG?VT1 zn?mB~`+(c4icXntlKL5yDl=@_QF}rT!0k z)3d`%O&8vR<7^gdn!iWf)c?d6Zm{)%R@l?QLz^q6=FHgY6y13I=B1diNj;J}5E=o# zy#P#TIr+|V;#$(L=M+vwT)}kal-9>e_V6cb@h5v}RWP%}l*I$F?CH|ETT3N|CeK#o zd`qmDUd6Gq=v3!yWFta>gq-+IR&i--m(& zqHi+tPJfN;+t0s6@#FSf0R_MX(<0{)&`QlV;hW9FAy=F;%Zsjao@#cSuTh9iNVx7b z(lv|!1UO|fb#qC6G0-TE`epde8EhkAs6?T-J`ZTsJ?g#YT#h4psz>DDB4 z>s5tKqM2&tAIIIiaX>3i1sG`RpK6lFa%s=rOWQDZkRdq0j=e(K#_Ex*oZN{jP-qHC%90T{2{)Gm!>R0KD&@D`>2ht_EXMWa zu2+as#k&_|zMJ7nr+TjXnbV)O1nZygqCQXLrUy8C6k7~E@ncKA{2tBz=-}CtFbJRR zmrq5k?9YcH^DO-Xlgf0_2F^t*HJUlFYeZ2;oY#;^*zM;VRW2zS+mbbV+8UwZ4?UjG zS8?G{1pxuJW?YqbwXFwjO))D&)r;=3C0!;?gT~F-&V^+I1HRjSm9>4kzSGSMjNz}_ z5m#Wkcvd46q(+7w_T@!l)W0`XYQA3-F<)^qG>a>SyiOHD^2mtg{aV;qcy~6nT{xG} ztOWXG_x;#z|M~ToM)arXOTs|_J6#8joS8tb%+AocNz5~L(zEKBZ zad}2W4=gk)ougnS@2=JvkTf^I!9Vhoxo=+q)X1k4IXm>+%;sT3!E>&!H7@&dgtC=f z)ZUK2yt{boGD%HG>BoGk{5_ei@dLLm1Eicdf`!Zz8L+T@!%2l_Oqv;jzK~;2kI1U> z$XNcMi}CWzc4U`X8!6gy*2vj?mOc$jvx_|;-z^uPZrXcS7KF*Db6pRD2636SSG)n# zt6-A~mq1Wd^srCee!7P2dn=2VZ+iqukI%Jp(s15`NPyYI>NaGG6upiF2yRt}$n|58 zK?m#!xpJB4!qx!UzGl)~A=l9;zqeb3l$zX`#lI?sQA~IzUV#!{FX(r;?p}gB*Jo~Y zSY3{Fc$=O4+59|L_BjRs2g24Dr3~h1nY2#4XxTt%Rz%{d7~@*UTJa3D@(71A_I>#Q z@qv8I{sh#|uVfOY2$)Eag#Cg`{OhnPdDz&}@dpzQu{~iF`H18@9p_&5unnH~ef%}` z>-J!cnJif=kZ<*SqDtSsi}#xSFSdon3$uxthR6U5CGl9Jbd<_D7vB~UBpwbh5m9Ej zR~shGdxD?Wwqa+TFZDbw2<1u+pbyiN%r+CS{^VIELewtLO4~p^Jo*X_zF!X2zdiKT zv!gBdZMsxkG8&~-hta4tYfflvwWXe{4yG|01^X>tPGoKQGO$`Ru43)VEZBav5PL2u zdy!k81ry$8T%`A?Lm&OpZyVh`xEip-${t$~-TR`Cwgf$0zOz1~)hIaUHh6Vxo-w}Z z_Uby6q2=y|CC_Vc+@_sO@^nqhVltmsJ1Ep>Iys<1ML~m)(f^6-_tt}&?X{sVW!=ZN zu6;uZpR~A9?Tq0KLTzUv*fG8O@5q*Mfm4l-#Wq|6J`!Mr z>fIL#8^CeEb@<0);rSbjtiFeCJ5zfwgHHMKTOAq&PQ^xH)x4kx(Y_&dzrx-g{v_7+ z#0V&^;t0>{9CshKxf#=$y{Q%x0BJC1G{(-`t^3R^|*lGeu|MT#jaNTrs^I;?X? zwl4YW6P&kly3oXWF*K7jsRgxnt3{0u1EX!5_lSsvXqZ$*G}Gni54mCiZy!=pqQKt# zOw(6It$k^8RZ3WLG-DrD^CV`8VwIuz7u%HU=}h$kxG-meopQlh;J zI#+kbr?cN#ZL)dgBU~+KGNwtlh{3>C+}`RtdxeZ5kDW?a_7M`Rj+N@_pSw-}>hhdy zrbE&k($YrCYzMo!k@uJVtD@kOaian0RRmbCXc`G4>S2G(IV%A03k$rsozf4%c|cEv z_BEP1Zj?I?d1f!?_H$4w`yg-l8;JT}sn6?Pa&{unAH_Z+jq*l1@RRfm~ z@^J7BbZpa^;agBnZ`YOUI+~P|rL(-kLj#6k9?bkaLD7R7jB&1)XyauF zX*nKqhbnn2o#WrK)r`Ou;c<(~+zKl8xLdouHKXeojh7seKX#*Mg{#S}`0j4bzBBq? 
[GIT binary patch body: base85-encoded binary data, omitted as not human-readable]
zM)#r^ZR{yWY7bZ6shSYhkT6_FgMhGL-ke0vQTFaK{iS=GU5}%q;)p6sxVtD%&}qD5 z-ix3w8|XE&iIA5*`abODYH*IRE<8wS?KJFU2H-2?Q5+6LFdwnZItt;b%Qc(6BrR~# zq206`eZ|D4$Ec4le7LsBc!D!O@{AMbn36@m2@l;HqAXbv$}1GICEugI#E8#A69dzn zFTAE7-Oh4ziT7ZS6LI^>7FuHXLZm+X-EEjoUl9kqZxIfHs@j(q@9L;;gz{TK64pWp>0{_mz(`$(?Fr@K7v&xb2`0a#~f(I_s61tmp;7EtkU@XCf?9UVR)I z5)k!*Ygm9SKhl&K-`pp9Zn@%aF|mK6!KSTy+W-BN-j_V?Dc27qt`86EXX{yM3Bl%w zf$5ZEB!pgKzYYUQ{z;8DKihZSt@xhztaVSS;3ve7#vb1Eg5RG)c_Zfb&d&(<+7ANS zz@a&xq;DtWJV^{MOoZcNGgv9X{c&P ztv$Rv=8Xi~A6UE2dm!ZVQ)6Yf8D=*(eG|L;VO{MJYib0a@mE7Z8yl;0htAO7(iNbo z0BV~;qULR%#Ut9j#ZbVo;R%u7O~t$=#=X#$x4pN>f~s`3T{vEhAjQ1-(FS3(4}l1O zq~(wEq76?Lw<-p#NnCNYcq{_kR#ml)R-753GYvmTVX486?2W#;b+3hTDK-P!gw9Q+ zh&1iNv$?m;%V9xIa=?L;0Ycb@c|I1BKNj9bgaX*qt#qSJ zIw`my3adMPDnw3s8>RV~{DB}9>-7iLk$vps$To)z{A;xl`76;Mw-v(Q!vMuC6_~^9 zpit9vbBFdxmkiX_^)E?WU&}bzpTFk)vi<2aczWM6nZsAQolxTijzsBH5GCmST*aaB zTUMn(1T0tgRD$ut{F`iLX<6M)gc8B&SOeQfQqes?NBpa09CDG4tY&BZ4eo3G05|`E`Z`b_kmb~ON1M%=D{lQ^5j%>WVgxLNH|Hv6YYB4{K(oS5kPXb?=4Fv*^>{ z4wy%Ms%=u;Pk0YsNsI|$zeMP>aB$clzuIeEs|DTZ4!5;dRk}5yrxnlE+WbG)SFQ6+ z3q{@Ky0+kWZMgPaAj=8QY$I1*-YZ;o_h9soU|&K4Gmwm`3hZ;?gLY8;(t#Q4Wm{i% zL&tzw2Pb;}>Muh$zn1HIv*y|sj1{q+5@;PL$3?1wxIZX_NcML*=8=D4@_3uwgCDlL zvs>C?90(q|k7G;&D58z|FEuaRrq={WYrgh_?kB2uXsHqMAQTilEKLw-kd@9h5ja_r z8Z40=4&(SiH-y@c~ zvmm6oL-*!I1svBN_1zf59~c~c3A#m6|KNX)Fq5f@;C7f-G+8v6bZt9QE66IH*H)Y#HExJdCO;xc}s;_gVb5;q8$z0WD>ZuV{2rtGFx?WT9{&X+2T z3Uc%+9ixY#XN2zt)9oWR3tsv>+S&X_oUohzDO;^i+v^Rze9h7E;`n>j1f{$%*Q?1t zr9r6jB88{4&q>YEc`3g)5Dx7)5F1z1DV9=&lG8bnavkxk4(D>dw&k+~=;1G^qyQp{ z=oBfCmf5_VNZkkU#?XDbpJGqP&H7XZD2NnD*JA$|lic|JZ>xx_sL6Z=88 z7W|)Fi~YMI^*{ahp|28+Xi`|>6$D<|5~=(h!#}3_GtgXldR!l+hRTISRs8$P`5zt> z-6&d*$aEKT0@ARCqE;3}eA*bB5pz46@5c3|GH?0?rsvM&GM(%q>GF3d5#I23cM6ZA z#R(&l_2a=>0;(IV9ui6Zgft_eLlLK~;ZMVh*fsw#gq_ih_RM_We76V@hg!-+?&?#( zwam>I`(5-6ECSJ+8I5<|Vm-i6yos6VYV1a|BX3mf+NaghPvUot#iyh)Ry`W5HpNe} z)v4bWxE?QlOBGmQG_P*@6G3d(=C4Xnb677y6v-M$XS>I1b>fwzv(A}-k~rT4w>{Yl zuS8s8F13ILejM-s=Uv7zg4LsQO6TVzw^qLs{QJpl<~NJj0?tWf$E11a zYFjWBV&49Ny;kT(aiWFCbYv(#M9OCkQecLk+svzgM&X0l*Zskrem=#bVM^&)4;O>@ z`sT+qS9KKR0D=&AWXgQc6E;_}v$VbY1$#uF*OY z0GExR4T%X-i)hG-K2%1%GYDVO5pHYws4YjBZqF9B1JJEk>ywU>s`>H)Sb*Tk`DR#* z?%6^zM3OHAEu!Jy<~^A)p;%&yLwKQe&ZQ*0;{}O^M%R6Q!!v@ZUl(GzpYj>PMXtkB z{tC#3Bm**u`%BS34&ywPrjtb}MvMj;6TP5J7U~D!N z(8TcU9BXe%P7uBG# zb(z#NzkIP7pBUF6w_Gs)tAVE@PuO=s-(jOH)q1!gFaqw#rDfC3o z&0bVAMw#{@FW-rA`rD-EgrtzcyO;Tj+1W;UlAxU_pui^J9`crH;xlLs#Rkyck)1gv zdd1|4K^vXNIsnZfxgu$`^YQ1AJ0lOJ)0+u8w=Yx*bO|?$WYYn>saxh}y3YhKspvD0 z$3HEX{8|&+VMQhzbZ+{}Twjl?dwAS)@MSG4}*1#3#B zM^Fn1NcMH1yS4Me*SFyW<6qX|`npH8v+ET%b!9MPIfvK(m3YhJrVBO`?80OnOeZ8oLHPx838k?AkHi}}T20BeDY}>DIlLl_&3K(m5 zw7RM<{TRA2as+VJxjyg95cQvZ2Fth=WlTyF88aI;EHBDV1fO*5EzRkf|4xZj`d(1 z+T>NZ&^F1sdVBQ^N#)u)3)FqeHD9sihrva$Njd=Q&{t1yfmg^)xd}NE3OR?WAQjyi zmlooWulwRXy9)-ZHE^omO^0T^{h$mlxGmWYK+`#xF)SH$@9|Cd>GsoM=*RLeG7*-Db5DX?l@eep9Uxnn;W){& zx8?{w98MmB!8*G`S<~8>Moo`Khs6!s%YRkwNjjd+=9d9LrsTTf~vST9jA{Pd?#WJTYEgHIogk zH}390J>ltnoKf#)x`4cCgsbuO`-j4>>-H>JFwNo`D!L0}u>urf7pB3~mMY3w!h&AN zdCk9uw|=kw4rIL!Ujk3V|6lti!xP1i?6nSUImRDO< zzVfxUw(a<&u&d{!2ze55u?UgqD1P8>x}kFh%qsVFkO0N-rTI?$V9y&L!EGu3@rz?~ z*Ur5ucjrtMwyAc17;|cl*z&z{4Va@`*W|1l7)1E@@~b=aYo2H-3)Ec^p2SuJA2K{AXFNlaGMcx_EIhl10G zso>N?P9raYdT(E^`P7)9^c#vQp>$zlJY$T3U4aRO#EUg}c=w&a1npx4GtQ9wMuUQj zlOfDoA$|E?pTi_u zwzPNeFncQXN@qs^uL&g5>!93fz{KXouzbhyW7NTdRBGwS`_`*pJE-YqNzWGak zRxF5)P+qji#~1)ZGp6j9>S@?r&czfh39J@r={&h)P4aX3#dVJP-jdf+7nDWlrGsXOIHh zMa`!!F%1@`{DE=maH91gY0o0ZPZaG*=}G!K)id|AKK+^=@7t^pUU~nT{=~uryal#Q z2_i+(SR9DrMlDd{%FZFQ76}{wqnrH1=B#*lHhwN5lwPwfS|xw(s8iW>A=nm8l@d0F 
z0VLDzv)X~A%iWyj0gp10$orwU)!pkKn+aXhAB2aFIk3S#m5Hz|M{D6`ImYoLWEAAh zUTKm3?7h<9{^xa1EF~fx7H+ilUU3}dkDLp$YnUg$Ob=x}p|qc7R_P#xI#D*bqRURi z4`IQJxkI{&yPU4kQ(GX~6|5Om1dJ6us9x|*T$*|)KC{PM`1^jb*_)X}H5m^!Wp%ZN zn7XryW-#=7u*H>3O247r&&FGX&W7h$G^bL>kLLgiIp9@pru<{|)PDmyqDnQm&26wd z?e+0#@;wQB;d3%MB8vBgWdCqc53>Y3C|*<tO{{-PW6LT@eAG)=SN_CQeM9Vg^ z%~p`ia}Mi5)tPf1b8OrvRsC1}cZZJKeOQ?9^jm@5qg(h}4e9@|tRBKm4$;2TuFw%Z zDV!@bW9Kx`-S7{jiZ>TAquj8+z?BT|M#BLR{sKPwtS5~HG%loUN&GYd@^@%`5kd{h zB(B&)V3(q=C2l7Z+W9)BPH77lZ}1W2!;G6o*s|l{*`JN`eKJhCc60$CYJOmi79&+` z?k{a2u8?T)rQvZs__IyP&%N~>JZup z;w33q5t&kd6mnku{#&fZB`D~bWBEI!PjWe0MsJv{y503}Vv{VG(y-lN#eNxA9X()Q zcmAzt!K4LdxC@9t3a?m7h+El0T;yNr`aO!@S&_wgJRYk3R6~-o1S2a%ld*Ng z)|qXrryQzlom^+&S#=F^{md!o$CoZ-moPJ6m&U~imJ0we>6iY%viAYycPugpCfB5-TI%O3VaAqZ7mc}obCw>y@!q)yNxbC9wcTqz zseP7s(a9%Y$^NQ+Z=j3z-iY{J1LGC8)l7Y7t_X0=qO7%#5g&5uT9aa^i882FZKasAP}{klfZwFi@L*{pU#@^6U` zoo6r~<`IJJ1wxXS+RK0)-pnA5+e=9M&ZYd9pXU~5zmxQ{B+EORHEKjoChrzmdsw{e zg2Pi_8@L;&UnUw`OrWgUp}zD6O!YCQ$3BY0+m!>8VJ2|L0Pcla#wF+Ph$a7Gjr*3WabtsOIEbB#rnGtYElXpZyj^mKx{c62b6w z()Ryg@7?2}47>i(kt8XTbB02)ce{u>y1V+zna=S|Y;fz9FB z5;hFv$W?{ED2evI$JDGg^5;^2Uz1|}&4KY0nWja#7)_gn&F3qwd^gzCLoJn`J44H2 z1az0F(q=1(Td&a5zHGGe(`;8#`sJB!32G3;owvze%gd%TT}ef`B%t)TDJra&@0ds< zF&w~YrzTJ!p^RpR>sO&XCsmpfK;O+eajP(rtM>Ee+5++De!Hkr(_Ry4xm&{F@vpW9 zCZNvxUPD!~3-x!x?u>NyD}6pf3znzQ4lA&Za-Nl2u`9`Ye9A-QXY_M??bqU@jjB3S zd5K{5h!KhqZ}Fd?c}rZywdgP$FEC;-XW^t#s%mwScy0{w^}vsrL*eQEp#?+yudw^B zINeU(>D7~o!sjus;5pe5%cM}pU2tB8Z(>y%KmB~jqn|v)A{<;kKei zTVvD>BLU}MgMqG`b`{d9^x;_YNOIb4x=M}NY$@9c$L-#_huWvC*Hw1hmlEH*P=A@@ z(?X@8fN3LY$rF}qVLbsxI5TP;HQm1us7qu3=R5Hi`w4k=8CuF!;G9y4!R8fL5c}n8 z4-*c-FVbg3OEP29o;n}0o3XTeQ*0v}xA)vm!4hk&*%nYX*u&DEWgG$goLHceE2%BO z5HpbO14&AFkb|yAa`ZR4)m*A1`ZBchZyr%H>kws8IxagUW<8m9Ro))5cCJ zb=fTT>7=dtQr-*nd0K74z*uG z~m#0!_59*zzxnef#X_%IU;N?HXf0#d?y$hy|YE5949g+7i1dzry}1dpnc( zf|GVqNsj&I^V<_pAbimcUH@HNGkKrv5-Wt+$~r;&`8gNPFeRR7^SG0HLZZ>x|8YMS zd;PkXxT2~kcK|Z_q7u zXVA{TJ7#KnAxQm0@{kpe%<7@~6dM_CjL);yO)-GCkt>V+ICt(B(G(DY`wzVNurtz* zSHbD=vqduX=4MEgdy~%AbRxonG|01R$q-HX&ooqu8+pQ-+%R! 
z_dWT0le-Ls&1JTPr{4?H`2bRy-2PEeaIll{D!3iN4y06qKsY=FNYAevu%u!M< z>q%)0R^JlwrAg-^>QGJMkEbX)21BSIeuQeG{U&SHfa*&14>--RRR7k0Ww|->+hp?9 zNHG!J#Kw#138wPE8+(D*2OO*XcqRDW8{kpO-yiMJRn6f6uA)-+%suh(mhF*N_Z+^Z zJ`uLw6@=RAv>{MkUzoKA4xm|99rP8Xq=W%Ku0T^hnW9YS(yNn=n;5`<|8>j%MtvsA z+ed`vP`M|!^I-GV2B!oqp+hC{!LOt@%RIE;Zbp9NnYWSpA^EP%?i&PwdMDilwS@XC zP}AOfuIur*EPXlgaI1ux~6e&I*Mo=+FjOr4)sRJh5;Klu?< z(Wsfe8IiSGu%8BP<9>DX# zUNq2EQisyAVF2X&Hl>tx7Eq76d7hndF=;IodlIf0s^n3@#j!)Tm7C}a(N5!fXAQaq zZHr(dsm{0t*+^ch6=P=5L~rZa6IMrNBoBZz)mdaKtU_y|%tw`-8Y6A;wNh5Gqu)2^ zZ?3TxJoMwS;byv^R*dc@&*f{MTeQ+Q1BM1L!TSVRIP7<$C1nOy1b&BAwC*Gf+?MaR zRPdV64VC?=#*F-;U%)YB!5zu6XGcRW=ooEQxe^>r8|59mV=!59o6WY8@F`X}hmfSvnrqF}d^lr!=_Ywpaj8qHkf>5D-aen*; z(j8--T)r-&Di4;;IeqI-BCT!e*>?-d3m0ESHAj@Zhx;4|?~bHzF2}%}RajoU)N5bv zil^p`tM@bN+H7p^Ei_Lk>S#$!T)n&cB=%c?8m;EZZ!WEyV6y{R@gwS^X%&x>7jC7a=S!#suU?1X54dfx&5=z^R{QH zMDQs=#SqPn@9!pTI#Q9&hpia?&@)#^`_*(lpEg;Qs@LEB+$+=Z$Ka6tkME59y0&i- zvI!z#w^aAK7D);=qwL}85fvWR0d`Ee49r{658gT?1Lnqse(h*U@Z7sCshYps%Ej|Z z#Oh;^0-lP!+|uViVol{@V&L8Lqs4*B-u|<^(N;tKYg#`GKgXy-s@E4UdAnGcZO=Io ze41M>Q2#xw$~?&0#wH>i@>N7szw3%mUif=YN4Fa{G7WAQG^wh-OAJ15Vjon$AyDsl zpk_-HmYX1nzeRfk>Uvc8Adjh+GdgV^jL%duJ}9^dmcj%!*Lk&^K;_Z}!79c^Nplx~ z7%*>TcYjM*!J?q8t(n@JZmQ%}HD`c@=kP!A+4P;KQA>?uY)zge=+4k-Q`am!247eq z@kYo*u3RX4y8TGQpje7~tjk7ea0FKzXqX4OrX&6>q5bbeZs1j+ti$n#Q*YSztK=0B zY(jKjzE8Y6KsAf| z9{silBqwXIVdZEzdt}fIcs{WOvD8zX*2iKP9aMqJpYKdQDhvxMz84c>f|aTly?6JiFz}83B+E!5)SY?Qota$&Jk8F;DWq zeG37V#;LuJ7~c*xT)yrt5eMqYB4gAdL=@uBZ09o&HsrBM`Q~8Iw zKD2Hg@6Qby8bunn`%G^g-*jbrNltX&rk`9NP|YYGA`imDQiP>mm{G_fj$1#{*wHn? zHZ^<3lYKZjUL_xQlihjeN4=OPh<}@nidI7#Sk|=KmizjVFgWoTZ5X6@)( zM=dC$Ftiy^$uoe262md@!F-@BY=N@q0(EP}>}}ZXSelc|I;x=n1yyGQ5(v*Val%e$ zqz=Hb@vbNUAzLHghnLh?1^}at2C=c-t-!G1Be9>8r`+J6b<}+cEMk<-`LEae*XsRC z_Wt)d8l02l0V0C~_N1AQoT>*QCENN)mHkT|r`uE#&y@tbhP?WAS#~q!s6?}`mh3~p z`kjI3mQ&nDow}y)-G-#vFD-f&nti&Sn}M)CemaJUI*^X;&Bnre%;y$8-8Kq@ZMe{E`_>bHo=soOrAb1Vk$8FRH)iZ#VxNu(d$IMg9 zzMO;CM2p%Ds_fpp^!OSlgGn}gcwq83HF6t{X40~HSjrBmmzZbo0cjjX0N(hMe1j$; zN}y@dZki9(l$OZ2O@qRgsAqAKcGtv{zbH4Ee4J)%e{rx`%(7VN(&rr)kL*j|ehL-v zpA|^{tK8`XGlVOV9(jI!wW2 zRAIE-qVZ>1VHSYIU3eRM4x4OIuY7cf>#T$9zVk81er?_X+Hv{)e=djY@92HO@}*C* zGB1JTtUGyt_|EqY=d!>***@o_5PN+XpmA1+6Y~txTEZ&QH!4bb_dYt~`4{CI!_->t zb{P<-Lj^SdJtB^$fJiP#a!xd%6WMn=uuBwA17Y$=JaOT=b}>6fV|dNX#Q@{+Hz&0H za}V4<@ht!M9?b#M$68iTfL!X{X^^b(if+L@vj-)_QH#Gp7s;cb@9XQ(UBW!|8{bV0 ziFX$Aim?h4zosd=No(|vWM6tDd0^*Q9N@+3sm2swsBcF-Hq_(pj}1ONIDAy@!f^G| zd;8FrjlSStt42)kRHjE`Z5o7EiRSahS%A;nMQR;&#De)4s`|+;B2SHg9C(%-_<Fo z2gEe0AYCc_O~5zozkggCQUZj1vV++Q)ri$Yt>^TAVrZYq9;7u*Wh@{Z$^}2-fg9*> zU>&tM&7Q)JZXF{}?Zezg!}h~PJ#0x4;C4Oe20t2>p!biV)g@#JL*L{zyd;eNoJM8x z;IF{-bVGYMHJxQof9V^E<2BE$zOvhm(fuyXwPbPg_PW&kiTaiN=rSjH?D^nYr5n;a z!n5}(5|bvioENSK*mIfg)xObVr}~5J3W>NQT(Hy_v;(#RC)Bn)_T>+UKKpy7DoOj^ zpD)tf*SoYcaz?j$9KFdmn=wHrJ}7`h1_U(r$E?FLvd$;Kvv)G@*b{; z3qN!Z@>`uX9q3$sd+z+j*CoUg?!~Hou71%I`EyTsO*5lE@QGQ(-SpLsoKeH-B7&@w z>|#X8SGuWQ9HI)ozHr2Ead)LgXpUG(X5MhbR9JExSz@;VUGB=mWr^03C%0;V;3lo};M&g4OhE z+ttL+=f31|PLMos@YutfANhk#7^K{naI8IZ(5iH~B^#=STAl<g2}cpF z!I)b|Wu&%2v=ne7jr(DHSQ@O=l%e~~Ys7^Qi_ndhHLWk8qKU^J^ZU^M`8M_+|FR}I z3MUMCkRRv^Nvm{WT^%&e6pP07Ki)~}3O;y9KsC`brfbHTWo@ik!?qRr;gDMD{-Q19 zn^WZ5xz6i^SnIB|c)AOflWxw4O=J*1!G^Th(M(cEL9gz?O&`yW85fkC{dp>O(Ns|X z^u&COV#D`*U*}%$@V6k}!OGE%-1@V2)JjB>in9}w{tu^dVIB47L>WK-Uo2IPtYdIC z(~Tux(B@5r7=ayyc8b&+9VZip(0mt;*POVJ_-SCNioagBCpXdSl;q@g+S2r=09(H=BEZ11VsTP` zJ%9J0f3j}+ua4)l`WiBZ@B_O|7yS+%WilSN&tGhhml}zxKD=eV#OO3nalKUw#i7r0 zN9(sdi>j)`(+)RE?fXOBwk!*)i@XmQr1e;L>K>uZR+TVFUX#PuZ1+SDc9sg`8=Zs1 z2CfBex{w%%S|a7?8D^xTzhgswi!U)MO@dhKXH4aJJzdlU={>E)hR#WaB^zsru(e1@ 
z-!X1_!Qb_7JlnFCLsSp-0{Uq8==&~W{1nAYQP{j6D7XIQ(}^{J@0JuHeGoPSOnXuF z1dcB2Q{8iNd1n5}#W6R7%|2y72}W16tS&Xo&w^ql8(+}QVB$-DFdgfO9V+gQMc#3~#;2LuQQ--4T8xUTC-hl<59Az2)@9{yGazZs$ z81OCPVAE`ndvv$_X&vd(s9m%24S{wEukm`Jjr)7!t><+>wiMI)+jrdruQ@{V(RZX1 z>el_H?3u|opmu40;YiDUX>Gbs*#6r0*)L-ZTjbsslpMRMQqc)z7+2WRkRy;{?^ zw4VFr%E-XbAzJy9(NyzTZRIlU=OsU6&-&hN_{iBiIxI$dQGJtYe`-g!rKvh0YrL;M z>cZAkj5UWPTI3^qum=BpF6@8t$69RJ;pO{Tm*Hp<%beCzJHO+|C~xfM z!RhYqM8iv5??{jLpdvo-`8CC#PiPU3qytp&e0&la}&34 zPf(Sal!HZ7z7I>dg*QTV#$G+S_ ze{)~IyN_Z4lh1%Vh5SGTf9VbWPTQ}MkFH5C!m7KE4B#X0ZyXp4?t6dohM_grk)B5# zCkEF_r6;$+FM>x;eoeDG{$OnHuvJ$qQmwG?cTGr7Hj8U5!Tl&<|nbjQS%SG5=_BDB3mXo5UYa-0xl9w+B%ATPkxzHe|va! zM>nY4WZ$mCv&?m&%(Fi<$4o@L<+srRGGsz}=sHcoRE zIiZV#hsR@Iklk^shhrmfjQxW^Qwh&vEQl?_F@n=Bb zuse4h<){iqDI#5%*53}2LdS=fFl#U7HNGBWw!+bF}0o5f6t0z0yGdPt7M*r&V zmOl`aCZ?|xF%KKXu_HxrYr=(umP+!1Le{KGGx2;9;FwY&{tU>%o%NHi8I}M>JhG(1 zls5#me!twuHHQ%+bC%h%?zaK<@Xy#|r25#DKTp*aWpskqSAymaUIC$0^#GrZ;O^^! zLXEQ?0nQ)RlQj%34`YEpLYG0z*HK^|9|O?Hqf9(b3TpU>9m73}Rn$Z_EVIR?&##It z?)nRU#2rHD8D7{8z*yz@G>^o~tW9I$y$m#qM zYqsAkc1^C_AJpQGo7=TP(C(Fd6v*HoX0D_LQa_ za(qb-ha`OeQOcgd^cc?0-zP4Zl%dyAmxrDJSS7FL7k7_T0E%#rP={GC*^iym3t)AV zrN^LEz&tmQy{wNrypHl+l^IeTM<4hT@Lwd}$1OllQ`m_`V3<$t3_c(a@i=xF2rW-9 z#I8W8`U9j7dWJs%1AhwZDK3gUI|l5c_9+F>G#Uw1Tz-nJj(v%}PhUsP zpNDEynp0*CMIL!dUO0va%j#4IfR>jY*$aN_sM?flZ0W-MiZ+fCJ09!8ehpUS z2O=ln5my!HcQ(Dgl=HhVU}`l#!#=euJl z4ClrjtIm~?hj_%!mhmUXp^yGO062fR#f3CfV>((7R-`Xde5gDW=DZ+lZ_h+m0acsU zYj3vaQFrat&7|V%?_ROqXYF(X?U()wl9>&1rG<7$`R2ywu&#?!df&Hi$+P<>(BHc=Z?|Y^3URc;@^y~VfDlGXre0t0i~AP|`+}IYy1NmhhmMZz ztCig3sIz?=>LSY;9;RiC+zn6L1!s>1V5=DkMZFtqU!~oZI%ZKPZhA2O`8%&+A&xP9 zF>pI15*i5dJx;AoaO~lyi(hv~&RTZtSI)hV_5d_4R|f>Jr_z0DQ2|<8TYnr)`9FyS z_x>Rg1Pw60Fz^1(A?DIGy*)B43jZ=)o|QXa`k_PVYN`J%f0LPV+Yi4|SNML{^@F3b5F(~p(7L>Ot|Ir zipK5D$~V+cX}jCmJX?}>-`~49D_1%2Rt&Y~p8NkmG5g;Z2P5eEE+S_?0QXJ%0#IPO z=Q!3PfEe7Xr^6cHoOF$)%m_ByJE5yc|8GnAqbqk;hRI3M^^LaZpFu_`m=+24bl z${4j~M-@YBTz>V9X`(b0#H2n5s80@nyS}s{Z*0MiyM(W!B*};IH&oc81k$3;(}kFe ze+GKQzE2)c#|MClqay627w(HQZb1nE-@({;#s%z(ub<4!un^Qxm<{zjTt}@bXLoCZ zBr_-6ss@`IDOZ7(UY}s$9zx@umhITJ6Z)2F>nOjJrUkvCa{I-qzgR1L+>9F_fI@)* zvBu?)6ePq02ii`uBKNK}ExL4Ffqpejt`gmPI@2QP8o*Xw#eU&M8e74IQ~Gq3PXk;{ zmY@!lZLSJ#GN09qHbwf_#Gfalx!8r+@hA*)s|_@D3g=LXT~NnS;-@FD^H@1TSkIhs zCuTbySRIM$s7`2(2uy~|y*SZ&FvVE7XF}%dcWmGn=(yi`!f|}VUm`Lb6kOp7E4&;a zf_c@akh48~G!@pv350vKX89Jh5(d0C!=-&>2mQxz2cr}`IR_R78(PwWM3B-($*Y3h zOSrYOnzvO}Y*=KqyIpY5!jV6ALn6`zEqEd-paYaIcIoxLH9nEwR?%;6%2)Zt`@zD6 zEw`>QMb^j&{l!vNUV5_ zAb3qWfLW(fud&j|^eZzhzgVU0?)F@j5HpAJPC#r zEOV0)D|_ijHv4PJOKcQ832atRHa>pPIvt<>XCpM`K*ii1^Ran$$P#J72lwRX@O)u_ z`P?sH#3gugBO3#Vj$X@I)x^SjR+@Y@nkk|}eqQUZi$y?kK5@L|I}4ao33mLmX!ia+ zGwkh%$zK<}LCt6X(9)H?=_(yO979E`!g zYNK1YZ3IPd#aF)^grg`~k-6$r?P`nE6Q9$5X!1MM7Jmy5LydD=c5FN#v$K`k?ti#O z{@YH}Advw007 z)-7m6V?2LluRd5Ger0%fL*=oZb`Lg(iN!rF;CWY_gU_kPZlQuRKW25VH zjj_QJ^G59$TM?Ocr z%X!R?dN+1^>)VUybi6Xt(B2J~Kfv+^{b8lNXq|mL%}CN6=Vafb_B$?f*a$bBIQqz; zY~J#jWGiG?k9*g7hZ?1m2efImv(su>+_cDEbQN#gZ#y7MllwShxM^UBWPBU?BBLUDgr z1~>144$wn2s$6o4$XRxTZEMznDr1Z*@-CP8%3cKpUB!$YrhrMEe!H}n~J)@v71slp9(2e^-B##Uluhc~H{ zLiG1_Z;?yz%u?+iHksTK^i=b!sAtmVq0AtM$`6~l(+OSV!sDq-rRg;O?!wqDQdg^M zp0-c>oQ)d0aeWu)xZb}zuBq_W-EE)iZBcY4aEVq%=Oac-=o-MU?wzl5JV0~EN!a1t zmzg}kJSrt$Srv>Z|>Z71HhNk7t1IF2}U|2r*Jhx;;o)bgPzqnySrFewTL}6 zhYqP4PkgAjb1UXu%`T;fTOQuxpbPg@TwP8Yth2>`$6Pb_9qwRhqw9)wtQG zF;8e+*nGkHiCLWBj*A*npLb#cSjuqa7+ILL7xblLnZYveeI?Ge^;7G^PPGpWIaF>` z-02%(66D6?d|p!`W&U;rcZ&>TiT$F??2<;Lz9dpDt=h>_-L7gJaL-3p)xltskFmhO z;nUbp&KO(Nib@}jkp*UOW~6kOAND^{(&l|ie8NLLK>C+af|;q9kBnlVMqTg4ob#p% 
zBX0mb*&-9T>R(OCu2?2FM72ACV%7y>)yeLgS)#XrNXo$89?YXV>?i}St-Mz{O+_Rk zf6NI?PAJ@{@n5VPEJZ-izB{v64rC)tKIWi!0X7?HHawMZaAI#;Zsa@u%IKt9Wq0-& z7ov>lI<@`Ab5xqj1-fGp#r3@!dBap-pDV3};e5a#GQNsb$Uvo|?iQbZ#IdNK0)_@$ zX*FLI&0|Nn=h{XgX?KsT+S|GHo;RnRGVIHtw*IjO_TpCWfgLh0Grn-s}GeC{~7N>aVHA( z)qGsVUJq5Y`gEa5*h%bSsp-BG3AgwT8ekM*&=nMy%d%)wgY|g`--Z1sZ=gc3>p?G7n#Lu)xEc~_IYTA zjG6GjgZ-AbjU|rPa$MP(dEw6G6QQWJ^8b{>4SkW`Qa{XzG!N&egq6$s&i!b*UUD_{ zii^v&cO656dY|?R`G;HF&B&7t6yCA;a66h0Q>;QY-=BhsqCyvm!p)Sv48dcKZ%_89 z@Qj;&uuZ9SB>X)1-3w&kY#x^WCCG3sXTQ8rj=ARN-Us{5Is&`c|Ll{Cp9cg=m2OZj z$kBp=`c16GEN&HWEDnW$jhE{t$W1-Hj@tl2G=lJ0O}h;VhD>Je`05B)tXYbXdKg7AZI_13am^s?l;DLtDmyl2QQeD)o?b1Ue{ z@oua0n)s3JXd}3fd5OivPAfa(HZmXSs8H77Z8p2HlM*R)Xmnh?)KvD`K*mvyLBWSE zy8e-<#bNl~v<>cL3M?I!YoP?E=ey8-Wr=MN?+yg;Bo#Zmmj7(#F^j^a=rCcDwsW79;rFp zG@^L#26T(OJNfd-93K?A0P7UH`kRP@p$NR+sXPO5+;-ryng?Tqk)2aSLBcMWuil}1 zNx~|>NaJFez;U}<5ihW^17gqK+%#C+G4kvpXVW_BMoS2G6Omy>FUrjI)j9GZ@kqzE zx>z5sM;2!8zEa0{CEjouHhc{+-n!$VIBJ5x$vQ>%te%JEjEFVSXWguAT3LJF9eyIa zsWs(o!Yp5vHE$MxHRwo&BM&S%8hoQnVkB3FX?CXL;RD%rHRW5YoQXfL zwjUP5?9>ER@}-1caz3VN9)o-@-&fS3%l*QwgdAGnA8l#)IWI4=-|U`8hTFyy4H+vp zgyUYU5-xhhZT>g@mUR@I@(y%!Ou|cK&nBw%$0Q1sNY#9sczWjAPtC!I@w{;3gnRPx zpnHtq-(#)`{kILf|L{rfvHDo-gd!SXNtv-oVyyw@%_Ecr_cKB}v;e#;LE!kE?uM3P zY*DOv7{FArW7X@Z<#A#+S*isZcd}y{Gtis>X4JV4`U_}(T`+(5$12uSL3rAL!Hpgx zu_0ITR|ffV19{xexD_!NBLVCU8iNw_L)p2|c)$K0sJ9dxfOP8|{1Xba0k{_r0ZVFy z2zG6cJMH8ma@;qT4t%>h5KS182M18~GfCm+pK-Lzdv%Z#-1Wmu2`2!@F}sa`k^Kz| z%dBFqfOqlrASJ>yhKjdNRP) z&X`a&UwMls%M7?rwCMi9@J67k^Q-!p2#BEoTtllaCUz5meqR!E$cF>0tdY5L=n3Qb z@0CV2onuT4Q7 zbS?h?t1WXL5tB}{L>3oZrUD2x0Y?CRW9%xHZHzQ7Vb{(k(ed73ehurWXu?_DoH3ZU zXUQegdfeq($Y0z8X<;S~YKWphOi8uj3FQDHSj~p-xDn`?xi!ELdnN}f zYb#)JPh2LYQ#@BCT9=_UGo4%WD{mkNG^v~qYTQl3vZMT5kc4{NCyc%K636NUa|hJ8 z^Anit^2(wu%Rd_IvO7>9q|R0R0DEja)|V8gTk!`)_)gdcH6qt=jMD>P6bh+V6N4B* zj72s&>nnB;xz4;%Ua<_g3`Sn!7TgHH)g?u%0#>K=boBmKXq-C>>Uo7-Ly@{&LHUt> zID)QUM|}a_n-KXhE|<^X&mb=WDsc^nEwhZh7~FAL`oePAHsP9WCat7#qx)4CJ+K4QKqM}tO;`<5p$wsM3Nf|;fWFNP_Z-u+37BRSa2N7{$^CaSQ8^}Y z1uivKuo+FFepq04T;}&63Ui?JsLtT##|C=&dyf$PP>U+CJg(Z3R?X0Z{a^p6oEV-> zRvuNXli51&_tUq@Kip9W_$wCN?dSdub;Sw9&QhM4wQ9j#Nu(LauI?g$9nUKyFU1T6 zs$!izw@yDz0sVCu!F0Wef?I>G`Hr=&-w$1O`_-vRdeI+pyJxF0=-$!+=Dh!7VJZ6| z-bb&7M>hG^v}(n-L!d$3!0`)yQXe(f4yL1JVCJYY?lce&bNEuO+%x?b6lkm4qM5{Y zs|_A28;ofm(+y`NxN`yA2}m?>MzoYUGL3w))#F7dZ!GWh+N0OIYEncklon0XUKJ#K zp6m|PCPxo17_opWf4Tg(5G7b}ewVBDf#5_L;AvGAKwa6)i;N8Ulihqy@moo zF&>=IT>+M7+x1qXD^8>1cE0B>Ibq2590L`vNuV+8Qw-t4lT!;pC;0;p0o9KgjRhG&Js(wXFfF8wqoXD0*7rAWO7$rFk4mE2LRFf~SqVdB3pkggRs^D4l-B;>^GoG(6oqGzmkb>ohdeew&Mji$jdzLHU81W1wqrEW^xb2>q_su#O zFY7n($bWwFU0_FCoLKKtBMN1@bAthQGXcZ?fOS`i62A_(QV%jk9DexvEv_Vs$CQqm zC?P#O6IaeXI(|4lcuW4IUH(w2`E5BO?|9p4&VXlfZ0hz>8`iQmS4N4}^h07Vct5Dxx;ODR9& zX7^`OsF!$o?QT^#S6s-DGeOI4+Pmqm#J8e{Jt7V-;bmf)k3QaN8ZysPZ@yT!)O+}|Dc#W8?jeTjII4|F2O6_Ba% zAY9rvcA<^M<3c&*R5zqc-@>ET!so`h@;;|_djov=MW4;NeL4bJ6ZN2D2$6b(9>u7k z_sp%1*_n$Y4JA+rOUmbUXnA=VU9Vt8#4Cb7`3T>{-76Pgp;fe`R1Kt~Xs>O6J`1*T zEPL+|R*m6Tkev((p5uT~+!O;raM1QGNy(V8A6+lB+w5fb#l^03zvrtni)s-&BRKTi z>K;pJfi^V1H*-tSB{#}^IPj0u`K-1TZy7mF^oQT2KH*=C5rig%E;Z!!>Rmc$LKL(h3#XKil5G2jPKo0 zqgVqxisXCg=yXIMG~R^-q08&2a7SI*;e5XA7qj|XI)5qs8r{f^+ih^fiX-X%hMZR$ zUTrtn9jHly<!t)vMc+h^(n`%n=K+sKON71AX-lmezaZJ3$!^ z((6u& z@sn|DI8Os9?3@=cn@BVgQtOXlivYf~LMdbOwk76sJFD{GuV254e)|0RQ=GBTL4`ff zdui`bvWXQN0{@0!I1q{6zp zKWq-7b^BYLeVb*@>9LbCR`O&=fY07^5CbV0_b_C%u~Y8&^LZ*S&$@r!Kzw}EThS>& z4;5+%i=$GH1l_^GisbO|(l{4K+B45$3bB zl&QK}Z_W4K4(Tmw7bIQ>y#x{w9;~~!Cv)GuVvYws`g=O}+}GQ(F@#UASOzKJU^x60 z>HZJi%HI}E`|p1JKd)bt_F|d>>#&q%4(tpa8(?5*feI2b1w_|k(ZI+{fNWVlKv~5{ 
zd(ttzy5LW;ztF7Bvx<%SZ~hqgOJrEynDq!7LtbVBYg4hG9km;L!2@Hs${9l*&!`1v zJYupY!6*bG;|eDM^ME>x1Ucp{xP-8HfdO}L1Y)8tfa97(s#I-R6&uCYgA^8tC}7AF zvsvGNmSsHtGt>n3;{2)t1_RA`!~*Ma-bR!a1#HQY*ztg5G$s{rdcYM8w@fh;h@DCv zhY5RtA^YMA$bYf3o?}S>XC#vFk=3_sgJXVNMO^y5m*rpLPtnPU`TS9k0CNYs^!{hL z;csjI9B9@-w*1yi9zpCx$8C3Id}itZV5#tSA{oebru(;t*R+bj5zK)V#eGB#Jc(N5k$rnd}F9|T`mL+|n!tdZ^KDC6&*{OdW z=)nl6F%vFD%GFxf`-1~@OTn(jNk0cVC=N)^2C_{COLI*I#$u3VYt~s-AE7|&&nSFR z2v0V6CF2_KuDaKtHMxGWJc(uFm(L^v$Le+_vI8+;{3gIY)iUbCj!@3qQ?a86n0G=f5DT2F&-p2g!k-N}#oPcjTEfhA)TIE$RfnFR z_llWRoF!8y2s@ryxsD3Lxb!>=aHCC9fvS{`pj9KaQS*~tjxFF=k*g%kq~_O<|fmBYiCIYb@E_N(s&(Mb+lP! zlsniP0E|QS@bmY(in`VMaxxQCJ{pvE?K!{a*5-yC4Kjr&Q>FqQOP>tGdZ7D7YaMsd ztiEJUm2?dpe)87Qt;5jWZ(GOVOG%xw8Ir75bOUK1s$3j2as=LmDdW3X?TDuR%6dPu zLhOTFI+g}Z_rJf|_qM-uF7klQhsbX?HMf`_F%Y_g4P0c|vhV9^(Nt@@i~XcC0!F?Z z?n+qT4t3{+Wg<<=dK>sk<)SrSphgY>Jer8039ZCJ>|E*y6AA`hcCHXb;kx1KJvjvd zw~v@!4Ryb08iPH!J3$M@y~R**(9O*;fD=D>oMrFmUT zY>Z*4jdNLP@nM;!M5p61Hsd}9FE-tM=%Te#5|q>lx>pYEr>V@5vs|S$XhkFn^^AJJ zQLB$#*RK?OI_Jh=b+F-Gg>*fLtDRzAN181O9m8BdX`#{HT%_=&)Pk-%rZ*&Wl(n;E zjCP?Yjv8LiZVsRgj-Y6sTu47S-=bB(Z|V0H9mCCm_FDcmekYGM%!;2ARI`nL=pk_M zf^+n#aeRQQxO3;=rU#k+_L)5UVG;cE6TA6DUPNI#Ko zD)v?2zzl$hK}Itt1yBd-@AT!SET;srgh{>$qfNDDUbjW!vrhTajK)7LN*>EOf6L&< zO$AP>HVV#1cmANAGDUI^3pWl5*VNvua?#V4J z=>m?*VVIKA(%-Qt+ka8#q%>o;STfSvhrX(c zxipO`s%O39j?Sf59p3&9;-XJo8-iZkxfxE~vE{?p_tx%*=47u1s*~N6bw;hJT})ZW zYx?}K>1ZsEJ3*3e?^EQrsZ^}Eq|1X+xNlzIlf@CH9d+6LE^2%ZG_WqAWTyJ6?NSqRN^npQWVzw0TwhB>wD0@iV0^sI=ff4amh4@{W35U$UPpeXmkl_DU%kS!tEza5`5<6uWjsgWpVONd@7OPqEqLp$0yC99 zYg>9x(1rmi6`MA$`lkcw(XSUzdVTwBb25_S0(_?DEg)EoA{Lla&oHfEpvXsxt77|9 zYY8GnbTyoFXW^Fs=g1w@2uwxd|H1jp=QeoCr4 zncrPRTJ-b%rVi?>r*=(!nQpJQ*QJ+o%ElHr>3G$m?hbzdEJ3S`?T*u+i6yLYob( z3OimWe{U`uNQ`)Wey>=y%8<-F+7>lIb~oRMcL7MiMPD;U8SGChwtyotE=&_ImUUbg zo=6)qm-F4>5iK=lDOs6p@bJa`O?AgdwnDK)5%M)0IQ^94rL=j_hpuUq#0BGf^)qv3 zq<2PM_@%y?8kSaM<+XaE-BdRIWnfU0@blOi}LFLd~w~y*f z)YGT+i5HxX$q%3c^xPMq%lH&TlSVft9m@xN&TJ`&unS!pr#B$im?!HQY_ zPtWe|TWfXv(&x?BESmx$fyIfP7o}*#0mP4u=GP-5J+`IhK1g>YNuZzkDpq|k_-RmT zL$QRa+vg-$$lYG?fquYU`vU?NQ5U8uaeb4Bg@voQk88Q@qqsVbUElilys-{`&FRC) z)B8yt-e2WYAoMf8!!Kw%E}({wmI^}jy38&Zf4HsbSV)x%bz<1RQftlm^()}TzUR1w zm}=LdT3u;2Ew*9wdwumuUc6kaX>Bu5d>;g)HtkHp#UIkZeW)P>_nKkhfk@29`p;V{o~&H>#Lh~9LM&h$#pah=%1Y*Va3Jh!nf zdI-@d#F_-iAm{C`WOFQm?&(Fq@Xc57=159ew7Q+6mSR=%LXMBTJXB;30k?7h3P!7k zcV$A+6Y%=^gnV2YYLCQ>{m8J6HO4DfrIC%}v}%3NDo*c$IsUDQj5o!TngU>?YxrFQ zWCO4}i->skkHpKl`rqrTR0sV00zT3|-^n(Wc#)O<>E?{iiy{R9HfrWNg&bu|<0Sim z3!O(*J$HR2G(Mm@=TvHHx%9IF6PVoT47%wT^&l%LLKtmpK-|5k5bu1Q~@?1KTe=8OyTjO$7NyKcWh4PypiCo@QF+ z`tQ1n>@c8Q3(5I{J^N>*9f<_{iFFFzi`-!iV{%C7Zct*;lv%{M4P=V;M9@DT1pCo^ zbQh4@HlR*{(-G4JbOI@kv-w&N`C zTkiHB{(PsK*q339`!FDv*Y0I}SJ|!vg3<>Vvc(VM0Z!Hi>}0enG4t2H#=sp4up=;Q zjO?`{C$9Vb1A$Zs^5^ai*8P5DDkYPVz8Y!q2gJ(a{Z;IS!!NW&)8s%w_@C+4JD?l* zf%@GcW!*X1@!fu${OYrP}?0P>F?YdHaYhB@q{T$)YtYgQsfx1|R-OMH;G5 z+rUTr9X(QYeR1zy=*D0`vmWC>8mL=PZObAv;VbBP=e5#NxCD9<^CJYi3%522poQUM zo3qX!VU7IlU~tNU-;1>dSe%S$oPe?Ap9wywBB=SDu7ipAxxvZ0-;Ffw!2i%G0_Vn7 zZs-zz!;bX^jzodzINJ=M2K%-KnTi>>8^*8JR-l^$)WhKCrXaVvw6^iJk+v-f*3cwt z4q5)=S2Ml0UxtE|jrg+)g7Yy$0>BZBF+gE9vvN9-FK-|iDStpn%R6w}`c6~+8K5A>!Do^uV(2o*5KLiz3iJ`eMH1mr*%g(w6Gq{j!ADF&==H84C@ zS1L0)7YA?SepsTw2Jz^_<)9zsU|u_HSx_+Gi}zckgSXB$c(Db$|L2q(V1fj*{5SFP zvMq;Ey|!sM!=%=fGoJuX&Sn9WL1osjeKKyK-J$|!@?ID15`bX3w>Mwac!TYRCu}kv z{=%o=epp{N0>63I)rx!y@Bb5o;=IQi10^XT8#~{;!S9YtW`bqj*9Pzcco7ZEsiKqQ z2xce1Vx=Yy^TlomEMQ50fG_0Y^bys3DUe^CEFNj_?XngTJ0zgY?U;`N$_07Em(Ktg zlM2Lb)j2iB5QyTj@9NoNrQ>%U7=k}#1^ju3Z zod;a=Xvef6&Zd7{<$nj$_V;0IY|Ds-^z`rLbOVONsMdhq_YHN1XjOY);O?td2eK-! 
zo;!K2|EfX^M}IdjH?;8~n@P=$|XZ8)Tdy-$4kVphR1?+4yvNa`aKMA+uU`*Q`vZvPH{lF+ z&Yd~1q=&qa41OfAq4~~tf|cRjgQ4*wXe9XOUl_)(OxCVAW&prlRy1o^oEn=jS{O$W zs0=^4h+uHS8KM8}vzfA*!7kYn2iq8$_p6|tu8AIS!qkWKrfkSjz7`XsGWQuhnzfD! zsBO85|J2@hRqgGd-r%-bm}4o=VY}y*x&1OS^1`m7PUqJ8-`^Z>`i`$K(jex=kXXtn zeTr!3K&!bm7wSamfXDWc9_gq2M_wJ26*4;NbQ#rIifK1)u`N08`YjSka$TIUHn5x! zla1%G5V>~v#S6uEkQunRE> z67aK_OH0@-7>vP=x!YEr$THiO{hMdlGZS+$Lxu?QUY6<4(}*7$uQ?fqk?FXV?`Sy| zjuDx;ZdlK{4U+konjf$i;84km?n4k_8Q^HJxJ7~e1QRv(7jF{n2xFvvHwDL`=wjfG z0X7L!1HhtUI!qPJUpQt#g%P*<0P_WT)5w*2oteMDe6mP60h3;JKlqk%ZdRw+*_pii z-UpL~>=Q>W&-5!yK0EVRx!n;AXR=yW%jKu9J!{t`a3YTf*P!YYwm}~epVd|p29lDaB@|rNusGryADIP zs&~X9&!0bZ_HyywVHJ6w<64udd$YFoUbo*Y?aiEGV`-sm>uIIWQLd9H>GG+S;gtbq z9z=Dxvaat8b2UP9Ap@&ic|`1*?yD$$ibLkr>ZB*SV;Ae4Ex>5~)$~@w|6-A7?mUP8 z5}l1>iD~rUx55wt;JRfQS^VX2a*l)By|3*HoZODq2ISRiTH?+h5Xvgz<&VJ~&P3KO zT%Ym3*0O!dQ{pRHLNgIj|$zajVrSDX|q+j3e+FZx>>5;6gHCMuq5Br4BBk{^T zZX;GXdyEbR<_lWw0EjoJb~CJJ@bkQFDrPU7*p@{}?ULai!=6q(oi@YD zSSG^P0Ekw`>Ks6<$iQhle?Vma<@4dx0ut|tYxy505afI45!TTO(shmfZPuv8fOgaG5WTW(^gX3jgPLy`2{e#8SDKw%68YhIK`{E z>j|<$KQo^2I$=6mA2I^httP^6=IA=;u8K{ZD;w=H9jV%@+&VfgtZVLOI`JG5x4lKR&*X_gLcx_bUz%Q?~m9aebWRDxW3Y zJ%-~hXFbK$@>5O+b=t>$O>;mePRt7p$&*Xtg9CX_+?*6#nT~ENxTi_dApf{u`d;{|1XQV{pNF9pptEZORuJXUC1z*XECm zRcj?{-8^Q=v6Nhr-i~k0#fJ0F#Hd~NnTa4%li4&sWa#gS# zJ5*Nx4iXIA9-pmOz3g<(YrFpIv-TE*o}M+kO*6Bbbpl7g7ifZG>LsO=wF^UCMKn-w zn^a_!=uTcIZi^kDRSrD92kwwx|l*3ZuF1m+{;EQkKp>R|j`0ml53;ZJNFhmfVp)bLfbZB?+w%uC0-T zgSA*Kh;PjIAOycTySUDlT$&ctvJt;gl9u&Wx9^)}@e-d7i+h}CB-6up)S6jt=;rHt zJyzEvq3e)&aUsO!U3K48u@^yh@#fi^#u(v_hccR#6~x2$m-Gyph>rr_dS%kWmeX)6 z>xRA{xWF>sRM2ffmWKf25C5r=h*{yuV?l5r*7&CfANC*goS&IfSn4I)sgdRG=0VCX zkl%)()pAGt9&V_eJd+N|dz|Rs-Lg0d4}GnqckbTfxfvsdwnPShn$IO-F;|5`gH?q5 z>_9;tah&Aa-fdma!XTyyN4{Qjk(&>*!M;Vk<`( zfC%O+j?B&i0U1AolE;)zOh=G~|9~{x|Cc9z#{5S=%}mXk#ob;No_Z$FauVl@-;19* zeYQU^$U{cw(1|^9+WcIcZ^{q?{&mJMzwS|kXmTJ#hd0#p;xn+qTS;nO zl3_~fOQ$>2GP`m7xInlV>Lj@*llq7xjBIXe5ZyOo z1${Hqo!)M#06av+HDeiBpjC`vC-phdcd2xMCJrEl`H})!zM*(81;NCsEdvJOA6nKO zW|retPLVTN+<}N+`$0pQh(wajACTkDcQ9mYkZtsffJQMN5Z3~Cgj(c5c<(yMvIOm4 z{_}(V=ZO918vD;(`Jczd|1AfsE-nqx)Ca2(Sgxn0@cReTiia#JuG~D#wl?-3A0z#r z9$x%=jiCMLv5%rIekH>{Q&p(7ucEAr{BLAGP&$61hi)~!YRZ0wOFkdc`kyi1zi|lv zzi4@~S-VcC8X7&tukDnyoy&dbr?xZz)ZofPZ?W&7CUC%zwW|b2(QBp_U1-f!C(><+ znQ4s}p+UK&ho{@l9G&Hsdyu&e+#K@$&IyOQHOU(vZ4fqrW6=OWioM4`(VmrF^n>#w zJf)qxa<%U<^P|#roC?Ah^L+f%2K-`XE}SvqDulVvbm)$(5sUy@(WebY8C<_b)Zp-m zO#RRtxitQPwD-Yzx3QOg!JM}am8$Ap0&YVUza=_G%pw%Bnt`B+E^bJLyMI$_jOOi} zoQ_W$6zZ!T6jAN#KEL<6fUdj<+cKtZqX#B~I>XEe#-_54j1k^aj3ypcU9=9)9qvk3 zY@Hhmf~!GHQk*VFVCQk%!Gtng2|VA{$c|M($(KO9<(eE#UE@1%ZDeVOa|^A}Ez z&dyzYUu(AKn&zkPUF>#j3iocCz@@&y$qxA4;fvHvKv}lK5{w1zT^tTcHw~ULv_3{? 
zc9i?%&b8-Cizjr?hxm*eM`BHkhYDeLSw~p&F1hKHs~Zcpz>6jplrm|Fk+V&#vz zab9+|u*jpT_IB{iN*#`lmz-}IA+*H>PXs3#sgi;RaHJXJCNtphs%MFC$X3Sv~rXk^E`a!P~v|InU?^mr0V0tR(7__771< zKCRy}fqO7SXuS*gOeDWnP4YbZtA|@3?azw^2q;kN`BcHgj+-z~(Y09{6= z;F_D7Vcq@~3wi6~-Ayv>nTQS4qEQ@r5=v-W<2p$~3#98uLy(pFM7rK-VPxj?#+1nCL$h zK>yc199qqHe^l+87pY!G0IF*47nVX7JLFIHo)*^QI+IF1n5X!S?)!EECN$nWd z)w^Y@p%ubl`)RU1A*9iV!ndidv^nzI?gC|cSc0LPyn4DPRWiKoJ^MS`pIJS1h=JZu zA5CP^>3$4gV$gw@%Iv`kuw>TYQ|_`OPVc7XqYDl)iWQkHwm(i)rr*!%Jrwjf!$ga% zC%m|OUXHTO5KeWX)TD;Mn*H?go#GpT-%%y0pPsBgc)Ol>qsu znf0CY)i1_`P#+gZR(NC(V~hc(6Ptt{h6~}mHF#;9-EjWK+ddkGW=q$azD{$Sk)B$# z7S5>#E`4v8ytn6Qdm+EWWG51wU}^bM7@;kEYJm{VFYtvKv#2=y?0JQ{6TH8OnPg9K?nVf=*aKOe>HJp+gN8Yv~OPT&ubJ-bJSKr|JTz&3JhHc)5yX;&Q9p8P8 zJ?^uuz`2a2S$KCYT5-tJ>6yg#l(yXnoD2R}qy9}T2dotfWz*?=?0N?aT&nP(8g zSrm?M$B@s$cxgc3OH88C2?lTH)Q5laX6`xftG#4+-)&>hb?WD=ce`(z8tvUof$VTn z5zL}z%n~&FH=NMVN0M#^=Mo~eNSBql_kFv?&ubW!d?+m~GeM!6s2P;awi%yDg3?{k zQwTwnbqV@?o9wV5CwZe|Qq5B%S%c(2bTdshxPa>Wq8H+}np9D1IseN`1kwx#&DSo` zr5IM^iQ9DNwP}qEv`lz4;d&wESbCjnf9Wf)J%e?+tOft|jLHk9%3mtfAhtj%e_@n$ z*qWTx_)(?T#SPaPsM*2G6f(FtH$SiSp=P?4RM;W>q?$$!p7+gl_XLaRNj2w-pezlyvgAT zUY3P>GG{iZrL-#gG0?Qgj>Yzfl7S{BUs*Jd?$Kc{9E%<7)Q#rb6%u|r`dlBO~|kpS11;^a3M0lTo!UXhTp)ES@s9S z-_`BZ_laJrM9b@@gb4YBAR@cWH4ZtX3{qSSk^Z|vGT{ZY8R-5xH!$QPBumExZi;fD z0d{%X007P)j7iqs$$jWwq4;cR%t_S!B6YOrf;LCHQ>T}um+a3T6ItjYE*;O$5G5B= zN11sHb~3$_`bInPd1WlGYshNP)!Fa8U&F3@JxRVCp)ka`!tu7eI`!!_iO%OsFB@~e z)Rdi6%LH~5K&c#S;bRp|#sLB3-jLISHQ{;PH(loDbS0N8JyNbO&s)Dv-tK({Im+T` zzcRmGq`0c4e7*3(KF2BEF%3m(ZBI(_b2**)zF=glHvq2dT z3Somtn$mKTy0LKPG=@<1tSkLFN^kso%~qd+f47T9(+N4f?@pg);m?4_64^?F*VPrp zL(%BseRkAOMv0=eb}N|CuV7~;^Uk%L`Q={o_(zBr2tw^IXTFz&pIC~FFdBx8?Pl#J>!el( zI-aAYm$%(3>h-^V6{i0yD(%kAB*U=*kJ|;H0&)0H{r$iGf$biq{2p_ZQ3m~iVR28< zWl;Q}CQmf{UN=%C*v)*~ZZw2CN}8Ur&z*A;4CGySA;3LL0(tNt*^dfmV$sm>2}BZ{ zKbY^#=!KMNSF`NJv-!b-eUMAF8ApNrC>b*)n5#ikhLhw1QsmG9Dr&mNm=*MI9YTzhsGOSyTn{kXH+9Bbb z6x?eN-^#3eFNkZq^CO)#a@V2?g1*k0z_EESZyQ<@Y^Rl|*D1!OPhH&oxS!Aah>8Yk zsfQZ2xqtQlG4rNtwO{Ts_Z~4J?U+b!#@>*sMnFlyxn0gbFf9*%Xf3xYWU0rA=ZG<8 zK_C#ZT`|V|%rppc<`MNV4+K7aCj2M;%6-TTEECb_Ou9wM>uG~BA2VEw7{2ZH@7QD(XXhLu_nW1L5mj(Wa|U~hp& z7i&t2GmboKZR^?1^|3O_+@f%iTnI}KaZL)*ps9RMYLm&=494>rMox6VJ~AKavDFR{ zT?5+2*j7Q|<@!NCiUEVH?Y03-)+FFFKMW`$dpy`KyqB z0<49JKOjr$k(|tAAVDt)6JcA!G;FV169zNqkH?iD)(+8rD?twOD1X16JZyAn!Ct{e{+Z8>Sre+JL#bmxi*_jHfz z9A&qm>y!s)LGM~QN0$h2gJ`s{X=t7`j;L#V((ye7v$t2c=0#I+X)Hn>oWDby?Kraa zf@l{4o?|W&-hPViuPBl_Z}W9X9crSU<@IhvviGitP~0{cCK)u#bA-Uq^05R zWy^i8KAv`+wVP_CaI&xoFlSq+-g$~=9Hxbl=er&@W(tJAbh|KJqo$lNZ+Ql>5SEom z+Y31dZrs~0z2xo1!1Y#&4DJ4iA%e(Nbxd*ur(oZC%S(1PN?7c1=CNOwQy*r9O3*161lKr$BAEHXES*R-4(O%-?_Z+uL}tIp^Fy1$OKINMuM85C&ke% z@6YsVSlAH=uR;2cVNb8z^IdAqoSCO@q9 z&rT@+{LRNzRZR-KB&r>D#8y~ncb2Y~uC8H*{?P6Q;OEIsnISg*u0y+1pg^k%dD>sm zO;`(l<)^$X&@l7Nk}OrhPXDzMX~&)kFe{-2S}(b#6Cq4$NE{(j6jP9Dk3J~+K!vpP zAOYVdcf}Zh+XtSCX-2LOkdeq&WNd1iY*kOQ-_eVyvu`vVphO47xYGhk=0?lHJ+ z0t3a^H(4Ly5-k>f;J(>SW8#`?tX%z#cRGmER(%d#^7_|3W6nUmGw!o(1ph4w!I=;) zPyHS2!FF0)q zJ@_Hd#S=WJ>I0qqOXN^D5LCfQf=0Fc!@^yl2}@up4L#yC19{MJ?^9=4idd)B0nMY0 zXAFojhTfeBvP0k$tZQdIat}uE>!^FgNqe_zkzU=66~aRgFNF5sNLCH}v_e>^T)t$I z19ARr?!mMA`iH%|^(2iw~P7Jldt zaR!k+shu=7x=Hx+<#sHu;iaG#s}s4fa_imsZsRZaeNzak56C^{n2^J9v>-Yi5()1@ zCcCrKtO*7-y?RnzU^9$A?L9ZBH1bac#2r240_RLErGB83Q?V!?nCJ`rWMo0}z zy7leH#Zz*zn+~JijSX3ur{Aw7_0k=#%anlftQYTBrNuN~_JKp$1a1Vd;Worocw8GW z$?{Gyj8L0@UD~(5rG8}nz#HwsId?Op@?n-D{{`FKP!OUZ+Hsk19<(tn)K-{rh!WD) z=o>&(DeExQ*;gk;@B&?4j>g5BOYSZQWYEUSzYf_mHl)=OkRm7$&ctS;q&FElFA(v@ zGBj()$&5nkP2K}-uQp|W2ToC+i~;TATMB-(D5xv#!iymq54({QA#Rx-S}Q~YvT!Uz9%yFI0-5V(yUXE$i3 
zg7CzpQ;`jOdr8OWhfrq24KTq~>KaG=^EDnKZ`%F}(j}>sHQ9{)o?aI<17VNZAz#{* zK#Q&>IPkjgdxi6XrO%7f%^$I0JM;O|CA&-23(pK6n~jBZaeYY9u1wQmR2JXxfIZz0 zOpZSTtKXw^18HkZV|ioFK)&dTk9q5eYuY_(bO!F|z-;8Ns9J_m85_o1<2Eo%J3&AA z4GT52F8d~`|1&3{%1`lQ+DL0rR7J%@Pq_ju6#Ek>D(-Gj0XGCxC9wtzjxzJnyaVLr zUYCMYOu2DmNLh05!)LiL^^?hG19Ej|LOu0<1@(Q6F^(;SC4Oh|W7=&D1*?!23&6~# z!(D#p>g#MjpT;qKki^k!!L+7N3lI~cgYJs*uIh*2^@os>jOeGdLFy>2w}h5xMcyKU z=Dd+2CtaMT)ZPKQsGUEVu&-!qa5`H7ghMfIjkl9Y;Q@~6q_RfV6e2vy7MF_`=c7FpI;Zd$Enq?eH{rHCh~9fpLc1S{=9O*BM>rv$J#f$L!lz#|G;R9j?jGA@1Ymfkofp+U-J>WGYkEjfH_Fe18Xt!b-2M%I(g%z-RBv35 zi#k0LaS~19fCi{q%0fZmL2MnSzrqYmoH;1Sd%9f6@A(b~!3Au@8*`5^jOYfaAadh& ztVSi_?Wh{cKJd-C&#jse8TXqs%7IePBf2y{536P{XwnXVcd3 zs=%u)+b%HUvRP7(GgBSWx*wbKYT*iWh3PV4%{=s@=fJtB)QJ5Kb8KF+-ZS}-UmH{b z<|9TRT>~XI&R|<4IiDCiCViqSOrqw6?#lW1*%uHT7y4of-DZotAyL#Os>5G%yhtOSR!wFverri)LU`X7& zi0j@+$rtU`m_ZN=(updzynW63774-1Gwa7q-qZ4e?aLhc-oA78g@F7##>6;-`OKK> zJb-XHK!y{q^QLH2VO;2-zs5KFH+O`mzt2$dAam_cEb7I?U$j?H6sOJH7n4sPb{cDlyN+1 z@EL0~Xmy&4+?pFReK9xP5Fe=dKrvI!utp=`3a9ba8!!JwzudjS z3*)^O?Ruq2%H6hGVs)vSL~{rGjCkq`7vGxIJ$lfeAqgj!U9QN#u@FTopp z(zrlJ`5~G1z z-mz#7Dneqd^XzC(p5cgC5#N!nwkuYRxX2cDLOVEA^8Nw)_*K6cwm!LtO-44tI&9PP z_f-{{7SVXas5z~(n`*PKvmQhos@2rtq&)#H8Ne*5l$f`Ei!#%Wm2AXjVrw_LT9rEU z6H}(1O>Wt^A%?wk9tCIM-ew`AmioM$oV*YH0nr{})9Ao`#x259SX#aqIPSVevI{ab zq=rA~%TPnKB3FH5j#SI_sgz^8v^&{P{(8*e?f=RH7R7vy@gC$uj{^FuO(HynvaaBp zO}82IOtPfv)FyB%A2%m7&WYdTcJ|=Xm4z-LX;(LB2i-Q}kPF@0B;JKFa^}>h$J2M! zCx4tLecr!m{9aL^#^Iul{a3*;=qgw;IBGO7Sr{ev41xx+&vMtUS4EVq3M*d-tdTxz zSjou)3SL`C=j^j@T5^8E3V;>>nV^!vSZfUgHW*KAa=vlQ{Nb$9$zVQT*i=c*gX)(F}4MT~c z1=2C}T?0V6njAJ`4BCfnwgJ{&UkX-|5%9*KXh4V93bUpM3_k*+; z%IE2UN;lOF_JSu-3f2lumu(e(4T>A^q8=qFnqCxQn@4{@HhsNES?kH~?ZAjJ^vBUM zps30$dHIp~n4%`P=kX$Hy+VM3+xs4E(;tsx-)Tb(u)ut!n0AY<9&QX1qRmwYY&7a* zODkE`LuQeuV)nHo4(%Y_+vXQUGmfNIH|hSG8H>8sCexaIEalvr*GG3> zhTQ%7VUK_*du;jQ*Ja>@ifh1A_1tu7?(Ne8wfuLrFV|Ct`mRNmgz>CnToD(Fuk7vYE z@O`fG=X2)QcUeMr$3*nRfvlL10*f~cET$Py3z!ARHsV9y(0_$WjBVQl`M!Dyv3jF0 zU6J)V=Rvipmwg{(+ZK>6G-iIAz&z@l47h~u+;y3Eb>S7nP;FSZNONC*-Sm_WYFM6* zF+2e7FTTYMiC*;@x&iuqfjTXoFd&P&Ho@rk_lVe~V!ek)HZ<#g_G|i-_*ASiTfQx0 zFO1NegyLk!C$>q_?m5uzg_UXN`7fw=`Ko_;uxCS9^-9EfaEZ}xOCIL6V|N7r94?K) zx9AV+L^kxUk5;3!M_NuOr_8wI{(6?8T9^K*Mo49U0LSr~*PI6YKG5}=y|kLnc`kBK z>7vHoYgxWK+L^wJB+~3d6%plU4_|89LSxtz79dt&-5yP3@xUpry@L2~>_XT&AkM^S z8fWs)vOfDY$5)RZeLIk@_C+#5U&Ye=24FQe@bz$#*@DJ&VuvVj1$@#v|L}#cP*qcX zvHK!v-_gSc5mJW!jZJMFp)chuWg+3)=>Y4V7bU~H17%QhV=PfWmxQ;3M{lDMCXFNB zes41cF7~HA?UCEXS9F^`BBZ}7TvqBU!2H?w(W=NjJDm`L-ZFHdHiiK?<4zp$1Na7GWtUZt^~45}ebJ53_i_aZc##4&M3Y1GVa$tNy6* zF-2X~zOtEd6Y=Sr&Q%ap6`hAQgqLm+2hMHA?BvY#hc)4EJ1ffE%nfb*D}M>p4MfOW z&O1-xnHR2dE-zpD_8Gc{pGH#ejiDhY2y|&E=()m@T`N$|4sTdflhBykxFhG4YvvrR zV97(&?sNJENXT?+7SBP74k4lHnJxV!tG_^jtdkIudOJ|2I*)YYl7ct8gov{T+vP{& zO%(#ady!pazOAYJY5Td8C{tQ`u{X#sti)w-_i&Sg<@8q$C&yUlWbz4p;D)ztGCFYDp&N@{kmxxtGJK)!;i5Gx>5Xm2#-491 z#nIhQXB_KBFE>9i?3>mZFy|oJPu-k)=a~Ze0a`gz=fMaLQ*L#_VJ#tqWc_f@4v6zz9Cm-NuO#q!~%0PLtzycXNi&Nf3tc(Lc+(|!sc;xbIMcj5egy7j7a z^N$l6Q?=LzP2Q`5&*yL6e6@_$D}{NCzxzsMEd+5CdM~tC05kq=HL7b%JRWnp+2)Yt&v{#fR{lFEus%h!3I{aRx$ii~T z$;sJCGGEj5n+c4G#RY`Xo=`3$V}M_t;I%&>e1>6P3@38KnmK%`sY(yt^IAw2i*@QV zDl5ynjBjuOUnxh{XMNau6%mn`AC_XMa-gV$HvRp(Bv0*0yi#lxkn|i1mCX+)rS43-^c;cn>iYbpv-# z3=Jq4v?mRv0kix`AGqMoVEE5a-y}3Nn@@Z=Rq4!~G^?2{>Fa&G-HOLTi){s4k0lvU zlomPDej?dWmdr<6|Hy@@#Q z8rnfv4_16FU%jGg?wW*%gmzYm?+x*Q5XoN&xx*a~cB?xnDHIIEh$lI)twjPOO%ZG2 zWT#c-&%ty|ZA~gli6A)>pGdc*n+&Kzw)SN1lDm~5Sa5Vw9JF?QR0y^qT8aE5F(cT` zWj>nM;bo>{bL(7_c2h9Ff&z26;mdNV-Zv9C9EoW+Yq43x39aS>i%z`SY_SyT;gFlQpO{G#mv+42JU>9|TPKA@bfa 
zYQ0-^g(~S-lI5gAy`g>DKIJ%1z1c@!*ExZfP$}y^t(cS&Mqw3t|p^^v({EJe2uC=2}oYZ4WF<%zG#-xpUk z*0dyi9BtKfRL^eV_SYlry6-siSaVn&G#_HUYSG`?g&Z1p=|J3b$#j|Tyvl{S6J-6Q zeL8kLiCH;38DdAowAiAkqLV*qh|5Um+Wesi7h*#FWN4EnfX{bAC8vdZ0 z)H}q&Lt`%5!DzbSH3@tdleb+Wqr5!4n%u74>hG&cRE(WEu5y~!f4)weGxT>Crcr`z zBAl0TFgOGU4d?BcHtOy=ARJNhEtI;Ua5B~rb&A{J+?k_kWk3^}h`UBQ2pSyCY!kxP zmudzS+60ES7AvoX`l;;yC}&lAUwal_pe&mv*vJ1_{y>cJ9cCHM3m=aVG=|U~bw+x3 zg6^W<-ehE!Ty*yl-RC6eYV_p8k@~I|!(}Rxf{l4ad&2uf*oYV?&KD*)4Okh?keVb1 zWZfEzdt}2<$1mcoYW+dW>tB{>SpjOjMyI@6A;Ea^8RXO?ffY~JU%?BYEC^~UP86N= zuAO)i>1W-=KD7_kcRQ+N4bJvJ4@+P8bnA8uKRfE!GuHdx8*78#*BCNdG{R7#+%gY- zJL>;wAYAE|`pKXSQQ?`c1-5u${c5&%5GK$69xOBL2`*qThNZ`R+QwPhxsh1&y!yqP z@#;bwW_yChA+7s5ANIZ;$T$LIL5mBdLu<#$8HrDkB0Bo$NB(}=ojG`^Q&n-ZS0o^x z|M&Phe@z?wkN@{iFuFt?RUT zBGutR;a96_tA|sWxAGV<(m>Z-&=M&1n}S^?(9b{Z2*L22o_uD$r#qNwvTMHOga9xV z;DMA=n%>X!;>qVQcUXdylg8+*VxQyGiRqiCZ4K@4ZVD-_N5A&&i$>k$zrhYc$JznE zbH-lV&ZYtlS{7Z8c7~DVSGAU(`$TVd+|ozmJV}oXi#U9m!QYE^R)~;)|TP;E{d! zeQ3gcw%2a=>B1(`I1=s>F#=Yx24_aG|uRJUC7_d;#u_a3!)$S7Iw3v>R3%`Wxi6c z%A5IbXJAE~r7yP%d9Z*@5L(*@e+kHOVsmz=jO>_}tHHLp{1Ik>bsDZr(-clLgV@n* zDN)o#GO`oSLvwD&*1BgMNPO-?&7}&JCRGjDawP{HlnI>2wB71gwfY((OjW1Z(9OdA zsMWMYU|=2CPFV1V_DZ;ocQujmGaq*$8^?r{?WNMXvZx9DV!s|yARm}dT8=&kKm++K zGTnvAY;d$G^4%7i6217qJEv;D=D~GTQ;u=A1Y{fZS@(~BUPPBhnOncwXUbmnr0YtC z!^zf|3fKuIxAY41J~?>vBBovayF$V7bz{NRpjgbCH>lNOG6dEUqXiCZ#E+{ie&FZV z>I^L3mkasez%q9hfKBb7mxwl&oxx4syhzKUNc_;c?-ObDxrs0tnf#1rz0#BUb)R+z z>cd$9OU?mc%>cxb(}TV6`cJif6lbd2sG%r9yR|l_(SoAbxb(5%X~++XknYX%PwuQQ z=m9J90NSKhGn{OFUa7!5LV;=01%f`;Ois|bFEzhTd}pGLA*xvOoBgYSA--Yu{;r(|2Hy?Q*vc@L#T zTcq2d2XGBK35cthX;1gW9<&SvHGF}F=`uf|_)Zg%1C6NMoee#BHtWDZxySn@lX=MR z%sO~b4AX9k@W#W|w1;Oe*zz^hpOAfjs@DCZdXIGbRII)L*iJkh9yd3fJi}<-#WvVr!aacxs!{OO^)O#7r*^E#g2zf578zI5? z&29_Q)NgTVsg##YK8Ig35s06YxzZ?2ZR&CHXUOOVaBif zqlD>x=wS@Z@Th1Gf%g5%>kP3Bx!c7p9$B|%d_hCq_#r=Qbh8eQv$$#d z2Z@-DR*YS=gWCa!8S{B0B$+<{Wns&2RCd6)It+*B5>&p1GycvypJcSS3&>c>|)S`(P;jVjt9 z{VAUQrLG&u6LmzuKJ5YtLmp}Jl6M)a542PCXuYjyx#fAIC?QDUW^pm>#(Nk65NFoJ zXjcI5$C_4Gd8>ZB$r7{Csynhr_L0NBYTfnycjMOiAW?(soP;O9MWfEP^YfN#_#$g$ zbjI;*u++dUt+Y=cqOv;ZyP<`JFgEZqcU)X)7S!heB9W&>%8Mpx9K*l7*rHIGr%iUh za6fIaZ2xMoaZ%qFI}XtNV1s1=n_6S_awE3pQ-OB&?GC%!o)eJ${(y9t!^nkv7*6z*G23K|+f$!VTy0(#-ywesho*)M z7KW!&Dl}c;#Z&JanlJ8rS9&R42&?-k{lBE0585T%pwmf)KlR-{iPKK=@HiWZlTL?o zPi736EpgqCySwWK*Y6W6kMz5qA5*7w*m_Ta6lKqboTAsgix0Fs3R0VSZ;#%M-ZYU> zS*YQsVW}`?14EMR*)gv(9+H`W5Vlj*4|sO)?(|2M29KCR*_^BAL1&p4Iu-{e#Kj@f z=#u1OOuJifbIT1xv!O!IOb|4u_P&4-W^wKhhzo@7*&8-7?R}eJR_-pckD7mO_D7zs zT`KRs{;hXfY`^@-2w<5FqbYLs_^^U6pw3JVVK>a6z5!pNu8P8M7sA>Qm#t?&N-mD>GY&LC?xO5j_nFpW@l+~l%ZyodFF*}pTy zi9h?dVuAk=Wy|j7>{ve>{Pv@#{e3gvOM!|XzGD)!96n$`BiL~4`nlf(MFaMonhFsx z9EL{ER<>PQf37c3_L{zs$p_x}zq zfoVUB;#|ZgVIU~I;&5nt{zh^mm*FM<5Vhx-r8ZHT$5m@Y`uc_Hckfxz5lnk24H^M} zi&^we%Q+(cb452=gj|!*=;rPEs$tkkTF16No49&HrpV{}J_YvAFN%-6TR3P`1|7nA zJ&|-KZ?6-`lgB5wDn1@{cuDTT{-eP<^+)9)afS4lD<0vpF~DK{=OR9F-l!EUo)VM} zE#4crzF~OJhhGj?Z|vLpmfyi`lUq_Gv!Ib%>0;S4GBr?BR>+HKhOKW?ywB3&7HX!k z)W~|b6Fn^Gx!5&pwN2!OlJxytqq2ydso|l+-d2ZO%63ELFoAzSrg!a3s|i~(@1W(t z;bBL=93CLFYHc%1Qo$+MlD#|X{^gRBkvlT`iDSDZsR^H1X{XLDWn z^C~tfd{gvT;AQ<)K9(Ucz`0Teyf##k(@6MuG~4W8UzH2=0<$+cbHLE+sHFC_Au*jN zK~vIpVTd(+pt~R#>tc0GW+UF@45 zGKSeUXCw7FzM;>8(=PLik=N0OSKe~Kmrk{n2%J50=rCa8Rv*D_!VI!fv8e6N-_>rs zIv9IhilrxAm6nh|xPMj%5X!?;!U~pqm!-v7h~R`*mz& z*JF_vbrIcMITYV~%JFvT;Uej4vMk%`Y`@b%2a+*e9XzwQ41*6GgNW6FEslK>sNVS_ z4jZL?Daz&^k>7{JPAJJ5a`C@4@tXlzgUigf8pnu80r(?fCr*9bn*D5_o7IpI&tNmA z%qRTe7wh8<()NHm$Lp`gsx1s{(GqrizcjCy0VLJ7;yu!D4mJnez9$#%d;IHIQul?0 z$?&zE9xPig}FuF(q@v@_m7pg 
zS@%(Qi6B;-#j&)+QzT5N8)mD+xto5wio5d{m6P}Red{G2Z{sjtR^Z%ahCVd5DsIm@ zp^l`V^t}{kG6N)NCBoY%;18Zp(ySIm{XLYM=c6lNSb+jYBuxHgp|? zttY`*cxf-9`{L^fv){x^KCPFt@t)2xQ3^I)=Kk#6$u@Y8h2+8DjUSteF6kKXA!6%0 zBxt4$5=a7h?<-%&O|u*KsHWSddeR@94wua3=dAcH{&@{F8sm#mjd*Y>4w6Q4Nm5Pt zBK^q9Q(ZUv1^c`cLl$l%sVBsqyCW=l@3EiuiQH?O`0~yD_Y{G&y%|}C+6UNqCvg@u z{g=}h&C~BtsqcL=dA>rI9jMda1gK`lZ%U%o-yY_dPF%NO(caq3;K!fr(=eLA4$$jU z#VezF%zE97FMezy8ge-DHxxTO6iqmL%KuYl%yyv}ZSlzdg(|wKr$v2p?T9opjN#xz z-Z~;>U!Gvz&+DM-vEv9co#Wh*>E&g9!);QLV|Nj1qcj;XDt(;HHm>&+IO1@u2w(HW z=tIG}4az_!7xm3;FUTiqbCHo;v8;^!c`@-Cgew(aUeJtUx4vYL;sf*B@uBBt6S^yn zT+#(p#XvJ8CX7>Ad(1;e%fYOm|F`VLpMS>?K6I3QC>M&`&1FS;>idqsp;z`JDC=oQYrMB6mSY7xWRQim#vpR66Yy(8i3$GpNG zX?`go#1IKpkB!Cx=E;K6IDU|QBBVZiG$eI1&3i^QWQ9{FGm7H#WY|cwNDXu}=5be4 zNba+YEDZ83#Kqcv2i*vB)(><>%R?MS~xNDe(+ zTB8MW7Wc0+PIlij-mkEKWMuE>V~g$c_l7G9Ta^m;JOs&(4TbUYW+-x$UyF^_->e9C zh?ECZI%TlGRYq~q{6C$^I&5_yhOl(C%qddp(FE&$zgsH^a+^B514%B&gwW-hux~mV zo~%wc)r$vJwVu#O&3!*~_tnCE&@SBPFN+hP8Pc`=x(FpB@pKEn&XFC?o2FCw%MQ`( z>P@E>)csvhPa;@fnkN_ve?9lK4w24=5cgQNEApE@`koxfeN?2J87B7jFl1ado=jUGZ*xY%2I zEyc-6esXOe);fo!SzaMyalS-=a-QE4y#cmsW)K zZ~E?2_1)uigg?iF-5NnB7iC!|qXY3g5FkZ;!59;0$O+UyPXnI2^4N7B4Op}frzw^v zDjjv{#WXR-$-{|mbt}}GBE;>_HAS_j9V3@dC@d)X(69M*R_~0jodYpRbWo-0z1QW3 zc1JY5o~=1_KFZL`&D=}+06zlZ@7w!diaK1-qLng42!#_P69RC;zSpnkdq!&$x^EqH zw(j0`{cQk|Yd-ALrlt1d@y#c+h495}b&eeY5NPBLsOFu$d$HxEr;lM2`&#MM&eEZH z>2E^*r`Gdn(wp>;^0CY+|M$lo|N0RaTm2~MEMdE6ZUbHfy5Y0LMYA*Sl`kRi4IBBY zJbo0Pl_OepDEM*APX7n43C%l3g8j9||Mlt=|9Qv4|95@k*XRDv#5rUfAY@`fZPZX@ z*!dxnM-43BtDJ0YK4H;^({*yV+tuuAx7kQi|8QUkX2mBUcyZWO8+Gx2U5esg)>N>3 zODdQr1HnFOL>J`X^A&3W?VQ$qXP102Jv5ROLWi8dl;(5s);FM-lXIs&5la?E8G8i znX>;9OR%fDvaIKIz6yKGNq1_YM)$kvmuU%T|E>O7ssL=Cy;Aq!v(+lg|(<6#JoLXm$x~o}@~`{*wSK?$->>uWU$ZY7ywm_Dp5Z}PrRCJuNgV4__L*zb$5@|| z>CWGqu$wz}f+eD`>BNmAW19{E>M)aahCty0X} zu1>C^y2^*s?!^jSdHjf^vl+~h6G40;g#_YcfH|!1tL6c0`IWD%CkqdQIZ=Z&MxKp%KuNz9NYhkmV>RCKr%tGd*X?x zkZLX*D9s(qxoPg*-zY9z>acKNGD*%#cT`k)KJB%`)6a{yknsfo5!d}nWBrGi=0DjH zQ(DrVwSE!|O)@Zptv@2&a$wbRM!bI)|x4Zkhb4prP%3XGTn3#!orKyH?VeDAIF z&-a!FJj~vfy|pHvN|oGV?W?~b@73#ydSdOVj_yAr#tgf%7F9F3ZDBP@DPttqfV)8Zu+t=(%!V%zeBj`fD#pU3q-jueCEUiE?x5 zofBADIbB0_72l_A2d-~Z6*jgOAdJ7$0xEZ1$)3?a5Nseih|NLw9ms{D<|V=YH$M>V zfv{-&gaT8zJQRd1IP|(m^lJEiAT~y(*Ux={rgT8JQwuo}ju>iyXj&*nr2a zl+Y1{Zpr>iMfzSx?*#m@>rvm4JGbiXZ?t5k+B9&}ERr9#9}+qh_1HV-u(jy1*{d(R z6Aw(rD+9^cKdx9V@~=zM@@Jk3{Wli?lMF433f2LErc$da1iptN4daEO>9$IJ5a@v@ z?!#y3^#uzCDDIA+Y`{BGL(*XU~ro2V6w zb|$Z7^BpRr9{tVXC_w*yI+jtty+$^Ms!S7hL&=#NSWOy4Q9;cKm;q}xvG$K&ye=aX z;C%x<*XoNLD4yk1i@tkSa`fq20<%+CQ2|3PkZ*{-eTL;IHUHT`r~6GE1LL!=jC>9@ z_B3?otY48HphkLco$4TzlrLC7bTloaZiKhnPw*10iQahVQX9ven^${d>MZuNJ)##0 zrDh1_?K4U7%DLA&G<|o%l-bfwRY($K)FTN4J3 zj;^nCr2IhG(467G-D@EX)zL=uhO#DLYO+lLGiDFBfbG}u0yQ@Zs@I*@w7@Y5rqi;4 zEea`s#u&hRpnkhZJ=15Lu)z^Bf+CGVfS57exw^zaMZ>xobQwS|)_MXJ;6P?fCx%`R zi!KrIOc4n{*b5|sV_5=Sn}DWzkgH3fc}6Xn#lF=Xk{YNR4t& zD{&_`%PtOl(MAkfq&?p{EQ8G?>qy+}y|K3yQlA`~*@GI7yC$?~VzD3Sg0=82i36mB zhs3OsBN-QZ@=94)d|`_ep6a$QqJ=wi$i~T;-~d+6={^Y^~4gA=1d5KR_wjkU?d0$B)gdLk-+w!M)y~PrAhnX z3yjkrFM=*|AuP{?Y^>EoE5gW1^dDZGKJ|<$?0e|H@Zy%s!SpCw&{iu9%UxpZBeT`! zc5EFPn(m&HxZeKy;Dy-hk(=fzrozkH+o6zbm6nS^r+@#{ zj(Pd-QK!viJV8+jl|Uew6|5iB5&)r&i30P$yENn%V^vtEkFZ*cUg6dj_?685zmEI< z6NtDRJ&1vgM~M*g;a~?G+X}kMe;@#cpM2&C!IkkF{S)t&7!i(8C|o-Rt;^hj#`G`P z7_RF4B{RU51-7UfdNQ5i23l z>pVkL(Wu^#i6=!dnA;JCETjZ*+? 
z5}eG)DkvVqt_!y^kjoEeL8I>yvI`EGHuHP$t4LN@C;*q1CmF$2D$z^S62Hyg{?yN-(S%$vU;+MUuA2>=m0}UJc zozo&0Aeg5?H3!*>rS0!$B%;`%FRujQgiD=l^LvwlxcVG+V+_BsEsg!p^GiH!@N~VK zN!bgrIex8FKcP}zv_9#g=kp~CRgS%eNuCosy*vkXc=cSLt%CAs_dhev{-0mB`cG0* ztnFwA^tvka@CSk_N1a{-QNgMyjEMokeYig~aUoegZA$&^~a8WO+m zCDZ$RO-UjV9PMfFfe>;(Oen#A^T8Hg!&2lkrlDOf{ftMR1Tvuob;7TYKyd=92J_Yx z;9SY|X`Ln!0w9W>2J#2uGCPXNQ%c+G1;Q5%ecCrL)yV}~;$4NF4);HD{r!8yvggHt znPQz-Vn%{oih3Ax4)o^5-as+98PP@Ht}5!gQ0Cwe=;m3$m39E?IN20-<95?UK!X!u z8of&P0lW3E1(9J3zQ#KhoZ#+rl&9JB&(+x2g;L5?D&Jq2PIu779t=FPfk5XwKvQ!fRb=MT>A%D~%RN z)Mwz~3@~PG1I65TidGF`@z`rn6Jei(xeH^<+=fzjE=B44U(fyyW&vSRmqcJji8xad zssXKQ(oTVdZkNf8|ADAP>u;da%{FFMni~jZ1je)ST7`|3KJDa2jWU@$M@1T#xeNYvK^&70!1-F zZ>%9dQ%|i$xBmVd9O!`_7?W%Ivv*kreyoFKZNM)P_H5vLC6G|du^$MRJ?kgwO_i*x zhkzYI%R^(iSjr`wJYWH5VmB@TuFIPwXGUNiGpvY}$?9 zIHhH(`~%^g*h;tWDc zg07_aBK&OqJ`yOW7om9-FGe4r=Y*h#t8c;PoVv?X@V@?v)|wE1vt=b-wGJ~j<8X55Mlm&I2c=T z(4LjbYy1Nd<5s}DgJEtyyXDUo#y8(MZ?afxre`RF*nwCF`XN1J4n!Cw&KuHH+F zGQP1028nGv^1fGObz*yPosvkifTS)5{{y_z^>?*0tO(TEKMi*NcTo9%nVDZ^=9iiI zLy)rLV&AjD>PRRvt7k}Mp`?s>P1rs>-MYp@>KH>7WC3oV{gW)_+emi0H1&+R;<>Z- zl&_zhrvk3TxuT`B>JO!6MnKh&f*nk=m3yiUddlm$Zf zK$wqhp&Vp#%j@v8W59bwpMzgqK~GH*kQ5{>;%PbKXN@NwqUikXRbPUxzC-wB3N~b& zjiGmX2*v@p(Lz|?1xB5-(49cmMV^!mb6o{vDnCQ7o?zD373vv*u+(1gCoPcYm>gxq zKz07iGEnVFpn=b_DMI8~^yg193`XKO48D;&9v~Mys3K9EVS_cy7uPYUap}SP{)Q9} zVtasX7KZq)q$SU!o2njUyY1bw)`a3EuUGx~zPU3bPs_2F`bai6ekaVQVqE(R*Qng+ zoVdsO?g^uEsY7&a+6iki$bzFqXma&OT3mAGE^0QWJi|yf70BiD30y^NB+Y2_Orp2p z)R`4H(QzlKI(O7IoS;fs^mWchCdRK{tQmC5%0H}WVSQ5&*+?=+3Wugjx=BAv_O{Be ziawZccIx%;aO(S{pZNoL6t$vP$qvd~L0~ObJK*%o&=dd`NQx5Nl)>mAA!z`EI7e?t z*=zy#RQCfx-avmofMhW6U>TT^`s?^H)E_Ui0==|Et*@iOlF~(@m(exoaTE~Zn2z)W z0E`WMSu+4!XWyWgDG+!+R29^B$RDah({WIj2tc}A;D1GT{rjCD=Wn7v{Xnn+__DXT3QRv30OPsG85SMIaRH#Qal<#LW_B{@G;5L_b%JI(R)OAZPuQWc z+r!}+<-L*H>!xeA(h*bYH&}BedMWR+w~xDCI$daHC;hdow8VVg>q|sgs$|Biy^HO| zF!v9{&;%UX9{`c=5Ey=yKMLlgF<|fSc8MXb6NgNmN8D)<)t9iw?aBc0Jum2YDEL90> z_W0Xp%Im{*-A-A)YrVrOcUGi=U%YCZ_I0%hct@UEkR!QLgL@$dPK61!+734R!LNDQ zLAG7j;I7Y70VfIeDZ@?&U*E}pqIpc~@%I}8)*7S8)th-)_f|$Rrg+|vsJ#;HXPN;* z{^NsT75{xFXn+Xlx4rOBTqHJz6qpUu%;YZE1j4s*h{ik*q z3^^@@`w2;K_gC}NPJQhBT7R#5&{5ILzsZpG`;!RQ8Jh&ty0^D7xL1=*0Y8DPUD`68)cXh)4b+o4_u2NrRvu^EzE~@% zo69}%Hs&smy4&p!XgVmzS0_v-&CR-Aebd!=BJRxKaKqnjp8{~w|9aZQ{;nJ7Ppr+K zuph1xnzTK1N1Rz%R|%SragdZ16ck+S_^PtcyTQh8@!@7%&mk!;%xRL`=Z{CRobgvB zBqJujCiQ)F8t@W7daF8B07rJhrsv5QffaJbWLpsU``GP^hkqS%IF8^;G)lSAMgoeGwRnJ~+;Yz%0Ri zAg=Ay#WE5>kc_06kCxjpc@(xnW8cv~PkGlv*Z@XLA(FY}L3MU7~vX?y%jWgU@EU>y`c0%-pR8CZH zTX@GDar$EfG0u{I`$q*qk@PWN){^{CDIh$yZdsk)x}GgG?SsAL-iGlk%U34X5(i+938w2O zV;&3J$Cm~Vt`R#Lm$o^GXg(VeO4L7b{Mx?;HAW`cV5?C)IXyc=~LXS#Te6Esa^ zGh#D&+jj4liftM;r~=aHVF+el4>Tzb0)tDuAnM>_x$!6ABWo-OfEw{35PLRhwICX2 zyfs!KY}N9frnq#NzN$y}GyNU&=Cv{vm%Wn^b1(l6DC+MD$^N@9Mk0vA4XC22h7N&Q z8#Qus-CH+J4~s{}KJIQ-w1|8@S#_zDvv<<&u$}rjus+1quL-@Pdl`f~BU!-70LAnA zcLZD1hU6P)Y{>+8k1*gphDQsdncS9WlE*8Eq6@sjqVtz#IP^7&rv90*E(zR_hw3~C ztWKIE8RD~meGA-+dNLJ+w9n<(V6IhAwEj7`pp6bVvR(XvAlmf?&Pw{X0SDii5zurP zxRt;vBVbs6>qP%`Q@>W~*Pi-cdHUG8xtn5|3iaNeJe#A5n9=6?v*9$Y_-C&CVRUo! 
zfI_{mZga!SRmtk-eMaUTFj^bs@QZWsPmDl+kAwg3ez1tW(NR31yl*!MFR!yXa!qu0 zZvJP6&;NIK{jajR|M2_t5)4Dz=K(p862L>h(SkeXUoDb0IL~my--8E7-J*edQf*YVjHPVM8e8AMVbo{ z;_y^z8aWVX)K_`^-hmQz!`+Vusjkv44@|<$&aB&_6j{G-bZ%mtp>>d2$9nUh`jC|x zW9;wDsX|u@y{;r+)Npo}DZPcwt^wNOc?dq1$FDpK>Ne!-F3d?tcfYoae-QSaax|qn zAat)k(9~gnVnQ6emauQ@xvoGth>JZv9s>0opqqyxXJmt(XP6y{}id zp{Md*&+YfSX^_TV!t(T`C%*oo@x0pYS48o{?TZsk7uB)5V z_1kL}iQGDyPq?Rw^NI^k+mG1o!hO|2Y)Jme6w3b^uZj!;T#PqpiW4jd zDBUAHfQ_&4a#J5Dx^GTnCNA9zmm63`ig_gpbe*F0rn9=r%Ano?Why%oTRjv2y|Jr9 za^K{3+$c*K-?r3a_Ib%oa;-@BK(%qv_~mdflanV{wzo)=J6ZcH#fhN64FZh1_ZE?r zQZJbV-M$6es>1h_R0TCbVeMwz)f}7~%_0MLsoQTenCMQyx#ka)!uTxT z;BS@F{Cn@hcbl$Y6Gm)<@A%m~i}1(A(tV+p0l$wZs_Ka!h-`T^e^ATbas|}`v zl++J|?pd}GWtQdU1v1$dxZtgX8(1FCf|^8=3y!KvRt_;({k8c%uT7P?(?hEJ%gt_> z9OGjdgIEY-NrcT>k_>6mR0BTDW)B(jkYO~Td2qq3uEiS(UMSJIbP$jdxVv5jPJ+B|>yj3O84b_5gYV?bX3 zpCHwM-2ZSFDLDkr;C)6eUjg)VO$JcP3kFVilSvjy=>$o}5!c?E* zP}Zli~9 z03{pR1#;kcc2fr@QV=Ik_Du{`F5De#=(HwPzY8&+Aj2+ge4nfT>>ZtHd)Cx`=qA!X zRgfGKrm=;HeW>R5IFEL)P99@bUKwU@(l?^BJ@Ucp`^VUQUm$pn+}NwFm@3N}L>guA z$x)A!EkZL$P03{MK~W$!v@2olOhrlgpJ7o>V~0c6NV*mdUeh{rGK! zTlfLOM8ex@0h$MSf8Uaon}S4&hok1BWyE%^vnhP1v^U2Dg(BiWuk`hI@5+kG6DSno z(Om%|hSwk0W!i5wQq({oTK4*Dx%5F}&%vSW3&}eY;!5@teJB;15l3t_s(M#;oa^wO zD4u-ejpOwr{+b=N*qs`@WYN05ks~D4hbv6y6BXv31$ldssMfVW4|AJ{*R1=-{OwtZ zKV?*E_5H-_>6h%yMWDuW$U07ilkY+-D*}y^Je^hac;@%>2t0l4vW+9qK|RXgyT4XF z4NYzy_h|0mvi2+&bpg=Won!4T_ZBSLWFj-MU{aAj*4E4B>=339^t7 z(V5N)8vCb1(vXkPWAP}8qacc^JPLl3>y21}#K3KUlGrsu$i`}Oup2(C)B2`BoKU{t z={N7HqV}?F7QPypA-z*)M?=J2pbe$P1$T~ag|!B3u(& z<;%%Wo4+M?dEuxsOQ0+LvR_--MeM`U@E?eYgZ z5woh(#2s}|9Bm7Pc z-0v-dz1U+c_0k1}hY|bWP9VCu8}wdzduD@lcL_94&vWd9#eHqWtfE-k&-+&&}qg!=<@4{J)w~F&LHA%?~rg@r=`WQam zUh>3B=&BbV%Tmd|LC{|T3t{@GJc#c$~NYH!bgyMca(Wl6GZ9%RdY zEZxYy?-;torRsRZ5smi4$I>_z21A-qd<<6rla^=!1&pgSAJlz@9%Z0$@1uCH^sCk; zol=V}Ku4>DWf3ZDW+V~nI_UiK(28XmG8qPhxsR8sd7&>EXgGHEGs15aOL4~*g)k$* z7N2Q7lg0+Npd~bEO2YsOOem=NoM%~!M7aDw2xNSFn)=Qr73%&2Oi$a)WU&JOPRbpM1%;qL2Urxl(` zoWFl6cVErr!&&}XIy7;TYAqy87|TEjFKO+vFWYF39XCEjKWnTtqdL^#zP3Ns>!qjL z*=tUjI}L=#c4}+l;T@(Vx$)CbQG3-yq)d&s#N*dAlL<^x-4f=gTn>IbRguEa-&rfvExkN2F!z3jD5pk$DCC}o1RbXLooL~;iP%z9 zQ!Da8s|(PnGZ|3PJlt+Mh}vU5wXOOBS{9?Fi=cp*F_wr9^y8<2R*NlmWanUpbM}jj z^ci<^wZ^Xz_}RtE6v-Cb$n|d;EmG3=E@xpjcd~sREe)B^BDn&ZD@&!B;|qp5i5mOb z-dndl(s1yaV_PO(Rx zVtIbxYP)-a##38G#)Cub_bV@R+>vD+L(-%HQ+$~;)a1uc#xx6PpIHsJWIa9wlR~ zjypMQJFXut5ne?^J?Lbs!f7z$wGcpi*vF1U85(yER*|z@FlV1jIdLkO=zHAwt=A1N z{-;l)&L{z53w*~kZc9+;qv z;qvsUKT=>jUd4W<+bJ{VBk9Xv=C*e2En#VhX9%VK0BvE=;ewvT(~{wue!|A?Y1RoO zZEs&ljwhZf3ieZpvCeBZaVwMl15O7Ympze5XqPr*5kFGQ3TEF^6b9soYc*iz(A{OQ_qqXv};Mh#9&0io1)?fvGARL#XsugGf z<8#LD{k7GSl>+X9Zyq{D_VzOS2CPND-Oar{Cb#JwZeo8X!rP9)FFICGZ!I&nC%Xa9 zK|>pKR-Q<6GARH4Tp=*tuNiY@j)w-dhqzVKcgWtKBbBT-0*rmEW8YFR|mnH-6;?>HWHfrHRGvf zG*dctaF@3COsuiYI7F7 zOL_CJZ>9T$CnrDL>~ex<)2GGeptCH}so-$0k9&ZMe60c6Ay3kY=i{tG0Dgyp#z4V~ zh#W_Y_I>d*rjnn z(#X4U>tpv?Th%|@7@p(aA9wQHwwS|Pj$VqnyKUNlEz%P?aR4Pj7<;9) znHHHbBNB-ZUcBX;r-4>kEw1bDJ3Es&efzxXd54BE+ZV%Phb|*Dt*9FS`4ll9q33Uh z{6L(!=!QVUJL&4z;5@tCE1U&0S4+X~GZ-V;Hu7-H+FLYI^$c%w@q%ls>*k;)E_RmN9eWS+XE$M# zSxWz?TR(>;nK-W>CWI2!K%-MnX$rz~t!d6#z#6KRtvpoCYn4D@$d9kPsg!e8evu7r zUGo||ltsVAnX05Mh`ymPSWXq8sna!ScPwdvhMlWzX#H}LZ_gb7XTag>&EvbL;_5U$ zT{IrLq^P}3`0OFX(E^$bAwmTT#g^GMMN7_VOme5Y6 zCnA!f8JmRe=pyc7d8(CkNWAcllnHd`MpGi>H9lu$tkrK5*Z%c|kI}x8WFfdyd-|1< z&a>6_u7vA+EQ>!7rC{v@$9B>75K%~+EV>TLAehuCpVOk1|ZF~E@sg6Onf$)YjrmB zP_0zVj!}N_gh-kUwrOb-=mq4(*^Nhfx5$5KD7G9vn0?#e2O@#BRgjxNe>+vc$I=MZ z$}dzJ1NrQUfo$V*I4P}C_Yq-{kWs~LE=n?_g|8(%c!ei>ns>k4w8r3Jg^jZt2vm&` zXr{W(A9R1#S`a2L{p@?R^#>sKsy|TV-AAY`5MZ39?Rnu@UX!3S>S)8#t5-mEyr*y= 
zA!x^^u9w0$j!WW@4c#kOT&e2e;YSX{wiRM;7znFdVEr3(%{X#Im+%}OIzM*J zRa^W!fo|vcl7c7rrd0!@aEk%%A6g`4IerhM;9 zR^YhqX9Ln&=1l-bdY~1DG~^XhNIe>XGh^cYBs^V-=m+;YxE*4EwDF|-`v;pWyWNV+ zE+Ehi{KIx#kqA>TA6Ak17(E?8Qw8|ARa;!fpJ0av(G`6d1(QC%F$|8B zysvt>B7o9cW5W?0$D@YWuo&-H0Oo1)2F>_4u~QaWJXid42SqP)D%sQ3TsrmjV$;yU z`MaMQK+3ao%hFI{jrq?*HSdQrA~k0BjdWi0Y}T>d6)Um9ExMImR5sqFJk|XCexah5 zP4=v!V`~h)VyYW~b}S2o|2&mW2P279N@v12u|{!q#0yjJBaalL|r19LMOii+la*SRT!$^1ns4<%hb^f%U6o^6oh>VSEPpvl<+|C8ds{NFN^KsjH$y*f zGTepmu~=B6YhY>hRHMR99IK;;>r2MrBNu~{oo>BL@^Q33%_AAma)nnIjNA!j>EFBw z9RiU|od+sRY5KNAg1Bd$wOpe>*)|nT_IchG*FD61;mi9|t#XULS=~V#8ZCZ?xQi#9 zL2siqH&v_AvguZ|e!aInro=tV&FT6_cPDjaJ6w~CQ9b##Jkp9*`2wh;SJ?l3Dl4p$ zqXTrJU=+@Xei!b7Ir?xHa%@2UOwpb_b0&Vcc*wG(EbYV9h-bTRY<}jZ%(D9U?^4wK zn`v_Yy4e&Vn?od2K=tmKLi-Uk=|&LINo#xKXY-_{6wdj5hs@7OX6AeplY2Y7^-hC_ zGE2FbOQT35&wfcnPR=1dE`P2%e{pRI=UljrP&`5?K!>Ri?rUwW?!6Jyrt-BZ@FvB0 z_hjXR?Dmm`cx~~cG&d5>F$wdqIRK}jlCt&7fWyUT>NS+ZoAUi1uim$=AR6aS+}wHS zlVVF6nW7!pmO2Lo%4XxOX^o;5gSg%vo6$ z+=VgJCK^@F2oj^o>5%!%d3|4WsW=@GEiF$JT9)F;y>$!PS)0Z=i|FXDR3HC=;G2n? zu9oR6VPY|Orc)L(=hAU7kX7n_c7Y3cAmsd0`k`7k>IWc#vJ?W_WqA7a#>2FIVV=Rev1&_*WlOGA4u}m^PEtS2k3t zsn=l~ESfv1rw>GubtoyQkM4T|1$zaF!)@)32s7d9ef=Kk72#6QRX4j%_g zBjS!bwirFXknM;j#e89~lf*LUM`@?G7pAWBINKw~Q9^Sr9-v zo(|?=sTYDK_SIL~zVSD@*3YuN)UmzpbS=rP^cnX>eW9RT26at^0siUw2-}EzX&iVq z8WM!cNadD$ScIq2aZ)>8dBIKoMRfLw>+GM*&VOj($w}NF%Fg|4e^i}6Z=)XzNg-OQ zIHXyEy%VP^fma=L5;X6*QX#Mmbr>|{42d$+1GPmzvwR;U%=gnZJp(VrH@u&!MY3XN zeZ6#TKZTS!q|O06OKODKE!XPy#q0g zXhdA~93AiQ*V;1^J3YFS!T;Pdp-E=>YM|Mhuy2w$TW>Zt$IG#?f!b^Vexfwi@un~J!hGyK)4M`Wp@Kroub^U+X`|^0G_x=BoqEby|O_)wdk)}ejjuwP8 zl_cv_6cQ?l81oTXvM-@1qZCOcd$NpOlC0SoGq&tAmNCrCr{CK--Fxmi-+LeT-0wY) zd+zV|JAdiXV?MLI-mmxT^?Ys5*M&1SgZ*5MYZ2sk`T2$UvCUJfhkb19hJv)$h21@$ z&Rw=rY$Nby6%s3H7#~Gbb>;pBp06G}k#a>*=j=Uh9?1JVg}Aj(=Lzr-CsYuZy$^ae zbQ&-&lUlc)DP*qd-?GOtxDYyTj+i*1cWF&dlt+|G5>mHGQJ%lT%(5haQZG@X-{Qz$ zm!F-Le{7o{^gc3aZF0w2#Ah1WTYXLcILE}8769(YQ*R3%-!i1{(gqqclJpV8**)f*4&$2r^?PF$`@g*sV0I`!i zp)oPB374Ex?b{-{y38sg4z1jDH*9As;ziIwV*~=x&!xo=x<(|fyRnqu&v6SzI2?Wj z^SX}gTe;so+k&~P zKlI4Cc&w^J-Q%@gdeYIXDSiAT++x$nn`CBYX3u!r(P38M=;$n3wQg_H(NKX4#%Hb} zxDjg+)pVk_`g;0SJ-D0>I``+CH;q!*Re4`_TNi}lvSr`>U=GM?;xr?^!J#Au4+ z)o3IBMf2SXqaHQ~8g_oZHW9RCX~VNQ0j{=_ypOH`3fqarbf!O6aE!FF2zfibPUf+6 z&(iwrTB(Ph86WWtiEd&A7e=WZ)Z1pznu${>#Uj*{3cp-*Bl&<(6-7f#z zEC6yZ69R!1no?6W9SDiuG{l+_EtvP)4AT#}DH=NPc3SU3b$+onvG7;RFP!1orzJuB zn@*Phdp+uZ2psUD?P(-RZ$P*+VV9}9f?hyXSIoQI!nUYR?H-z+_UdL=$*ZTrpQ4{5 zT>rRMHr|Rp1rS_3Wp3?{tNw-%#cyWL_iNZ15-Z*lV=knB4a`5;)2Sbx{euXtq!SAbOM}OWA~>K_=A*s5 zX5_96LnoWAR8>+(+B&ZMzCuU@+HpWA6U`!hofv@M+ZH3dGXFF%e|rpE%sQ5_d-njN zkmcZBK?$gEzL&TsH)RYzAwd2X;&bb0?$D^9EamvV&DD~(BD=a?9!w9$ zoo1mE$2VR&A{Wy$DH_F_V#ht`;CXv$YI{!Zy`gy9a|59~rX7zEyf+LH1YdVg1{hiP zxyQ6Ul1YR8-Y}yL-@d(bK0hai-dwRxmbdq)f%i)d?<#~HRG|o!=|f+y)7@Y((4h6> zob~_n|HWkudIY9xRzpsx^=w6EFkP-H{$7i-QoWi{ONQ%saG{GvacIXS+2;M4osu8a zLtaScA_x`#?wky-GF_HRZJ;y+y_uAWTGQWzein4DBRx4#%>T5?%1sg42!aw`tpK#c zqIpJPsS?hU5}899BMI*h7U{X<-un1;U!2U-Am*#?d-N@Y{qA;$<#oErFR`SDEZsFs z+c-K8R(D^9+4cL2Z{p;0?k9h_EiS_luAE*vI2{BrPg_8A<6*Go+C5`&VVN8bbif8G z#XybP*q?aZCeZiDbJxkB-IDeU)Gw$n06hX7H#_wcIozvsKn@4}$PIS^e*&z@3|po) z^^>moh7ij2TDL?i=uZOWbRMF1ys^V4}_XUGJQ==I4jQp)r1!Lzhap? 
Date: Sat, 12 Mar 2022 11:34:43 +0200
Subject: [PATCH 31/55] Move RFC doc here.

From https://github.com/zenithdb/rfcs/pull/17
---
 docs/rfcs/014-storage-lsm.md | 146 +++++++++++++++++++++++++++++++++++
 1 file changed, 146 insertions(+)
 create mode 100644 docs/rfcs/014-storage-lsm.md

diff --git a/docs/rfcs/014-storage-lsm.md b/docs/rfcs/014-storage-lsm.md
new file mode 100644
index 0000000000..f91ccda6c0
--- /dev/null
+++ b/docs/rfcs/014-storage-lsm.md
@@ -0,0 +1,146 @@
+# Why LSM trees?
+
+In general, an LSM tree has the nice property that random updates are
+fast, but the disk writes are sequential. When a new file is created,
+it is immutable. New files are created and old ones are deleted, but
+existing files are never modified. That fits well with storing the
+files on S3.
+
+Currently, we create a lot of small files. That is mostly a problem
+with S3, because each GET/PUT operation is expensive. The files are
+currently "archived" together into larger checkpoint files before
+they're uploaded to S3, but garbage collecting data from the archive
+files would be difficult and we have not implemented it. This proposal
+addresses that problem.
+
+
+# Overview
+
+
+```
+^ LSN
+|
+| Memtable: +-----------------------------+
+|           |                             |
+|           +-----------------------------+
+|
+|
+| L0:  +-----------------------------+
+|      |                             |
+|      +-----------------------------+
+|
+|      +-----------------------------+
+|      |                             |
+|      +-----------------------------+
+|
+|      +-----------------------------+
+|      |                             |
+|      +-----------------------------+
+|
+|      +-----------------------------+
+|      |                             |
+|      +-----------------------------+
+|
+|
+| L1:  +-------+ +-----+ +--+ +-+
+|      |       | |     | |  | | |
+|      |       | |     | |  | | |
+|      +-------+ +-----+ +--+ +-+
+|
+|      +----+ +-----+ +--+ +----+
+|      |    | |     | |  | |    |
+|      |    | |     | |  | |    |
+|      +----+ +-----+ +--+ +----+
+|
++--------------------------------------------------------------> Page ID
+
+
++---+
+|   | Layer file
++---+
+```
+
+
+# Memtable
+
+When new WAL arrives, it is first put into the Memtable. Despite the
+name, the Memtable is not a purely in-memory data structure. It can
+spill to a temporary file on disk if the system is low on memory, and
+is accessed through a buffer cache.
+
+If the page server crashes, the Memtable is lost. It is rebuilt by
+reprocessing the WAL that's newer than the latest layer in L0.
+
+The size of the Memtable is equal to the "checkpoint distance", or the
+amount of WAL that we need to keep in the safekeeper.
+
+# L0
+
+When the Memtable fills up, it is written out to a new file in L0. The
+files are immutable; when a file is created, it is never
+modified. Each file in L0 is roughly 1 GB in size (*). Like the
+Memtable, each file in L0 covers the whole key range.
+
+When enough files have been accumulated in L0, compaction
+starts. Compaction processes all the files in L0 and reshuffles the
+data to create a new set of files in L1.
+
+
+(*) except in corner cases, for example if we want to shut down the
+page server and flush out the memtable to disk even though it's not
+full yet.
+
+
+# L1
+
+L1 consists of ~ 1 GB files like L0. But each file covers only part of
+the overall key space, and a larger range of LSNs. This speeds up
+searches. When you're looking for a given page, you need to check all
+the files in L0, to see if they contain a page version for the requested
+page. But in L1, you only need to check the files whose key range covers
+the requested page.
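+
+As a minimal sketch of that difference (hypothetical types, not the
+actual pageserver code), candidate selection for a read could look
+like this:
+
+```rust
+use std::ops::Range;
+
+struct LayerFile {
+    key_range: Range<u64>, // Page IDs covered by this file
+    lsn_range: Range<u64>, // LSNs covered by this file
+}
+
+/// Return the layer files that may contain `key` at or below `lsn`.
+fn candidate_files<'a>(
+    l0: &'a [LayerFile],
+    l1: &'a [LayerFile],
+    key: u64,
+    lsn: u64,
+) -> Vec<&'a LayerFile> {
+    let mut candidates = Vec::new();
+    // Every L0 file covers the whole key range, so all of them that
+    // start at or below the requested LSN must be checked.
+    candidates.extend(l0.iter().filter(|f| f.lsn_range.start <= lsn));
+    // In L1, only the files whose key range covers the page qualify.
+    candidates.extend(
+        l1.iter()
+            .filter(|f| f.key_range.contains(&key) && f.lsn_range.start <= lsn),
+    );
+    candidates
+}
+```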
+
+Partitioning by key range also helps with garbage collection. If only a
+part of the database is updated, we will accumulate more files for
+the hot part in L1, and old files can be removed without affecting the
+cold part.
+
+
+# Image layers
+
+So far, we've only talked about delta layers. In addition to the delta
+layers, we create image layers when "enough" WAL has been accumulated
+for some part of the database. Each image layer covers a 1 GB range of
+key space. It contains images of the pages at a single LSN, a snapshot
+if you will.
+
+The exact heuristic for what "enough" means is not clear yet. Maybe
+create a new image layer when 10 GB of WAL has been accumulated for a
+1 GB segment.
+
+The image layers limit the number of layers that a search needs to
+check. That puts a cap on read latency, and it also allows garbage
+collecting layers that are older than the GC horizon.
+
+
+# Partitioning scheme
+
+When compaction happens and creates a new set of files in L1, how do
+we partition the data into the files?
+
+- Goal is that each file is ~ 1 GB in size
+- Try to match partition boundaries at relation boundaries. (See [1]
+  for how PebblesDB does this, and for why that's important)
+- Greedy algorithm
+
+# Next steps
+
+- Allow delta layers to cover a range of keys instead of a single segment.
+
+- Implement a two-level LSM tree (or three-level, if you count the
+"memtable"), by adding L0.
+
+# Additional Reading
+
+[1] Paper on PebblesDB and how it does partitioning.
+https://www.cs.utexas.edu/~rak/papers/sosp17-pebblesdb.pdf

From 64cdd6064d3962aa8243506e50dd9797b86d3705 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Sun, 13 Mar 2022 01:14:58 +0200
Subject: [PATCH 32/55] Don't create ClearVisibilityMapFlags records for
 non-existent blocks.

We create a ClearVisibilityMapFlags record for the VM page when a heap
WAL record indicates that the VM bit needs to be cleared. However,
sometimes the VM block does not exist. It seems that PostgreSQL
sometimes sets the clear-VM bit on WAL records, even though the
corresponding VM page hasn't been initialized yet. There's no point in
trying to clear a bit on a non-existent page, so just skip emitting the
record if the VM page doesn't exist.

I'm not entirely sure why we're only seeing this bug with this PR; I
think it existed before. Maybe we were more sloppy and returned an
all-zeros page?
---
 pageserver/src/walingest.rs | 101 ++++++++++++++++++++++++------------
 1 file changed, 69 insertions(+), 32 deletions(-)

diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 3d32410f41..9b2ae4f101 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -391,47 +391,69 @@ impl<'a, R: Repository> WalIngest<'a, R> {
             relnode: decoded.blocks[0].rnode_relnode,
         };

-        let new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);
-        let old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK);
-        if new_vm_blk == old_vm_blk {
-            // An UPDATE record that needs to clear the bits for both old and the
-            // new page, both of which reside on the same VM page.
-            self.put_rel_wal_record(
-                timeline,
-                vm_rel,
-                new_vm_blk.unwrap(),
-                ZenithWalRecord::ClearVisibilityMapFlags {
-                    new_heap_blkno,
-                    old_heap_blkno,
-                    flags: pg_constants::VISIBILITYMAP_VALID_BITS,
-                },
-            )?;
-        } else {
-            // Clear VM bits for one heap page, or for two pages that reside on
-            // different VM pages.
- if let Some(new_vm_blk) = new_vm_blk { + let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); + let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); + + // Sometimes, Postgres seems to create heap WAL records with the + // ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is + // not set. In fact, it's possible that the VM page does not exist at all. + // In that case, we don't want to store a record to clear the VM bit; + // replaying it would fail to find the previous image of the page, because + // it doesn't exist. So check if the VM page(s) exist, and skip the WAL + // record if it doesn't. + let vm_size = self.get_relsize(vm_rel)?; + if let Some(blknum) = new_vm_blk { + if blknum >= vm_size { + new_vm_blk = None; + } + } + if let Some(blknum) = old_vm_blk { + if blknum >= vm_size { + old_vm_blk = None; + } + } + + if new_vm_blk.is_some() || old_vm_blk.is_some() { + if new_vm_blk == old_vm_blk { + // An UPDATE record that needs to clear the bits for both old and the + // new page, both of which reside on the same VM page. self.put_rel_wal_record( timeline, vm_rel, - new_vm_blk, + new_vm_blk.unwrap(), ZenithWalRecord::ClearVisibilityMapFlags { new_heap_blkno, - old_heap_blkno: None, - flags: pg_constants::VISIBILITYMAP_VALID_BITS, - }, - )?; - } - if let Some(old_vm_blk) = old_vm_blk { - self.put_rel_wal_record( - timeline, - vm_rel, - old_vm_blk, - ZenithWalRecord::ClearVisibilityMapFlags { - new_heap_blkno: None, old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, )?; + } else { + // Clear VM bits for one heap page, or for two pages that reside on + // different VM pages. + if let Some(new_vm_blk) = new_vm_blk { + self.put_rel_wal_record( + timeline, + vm_rel, + new_vm_blk, + ZenithWalRecord::ClearVisibilityMapFlags { + new_heap_blkno, + old_heap_blkno: None, + flags: pg_constants::VISIBILITYMAP_VALID_BITS, + }, + )?; + } + if let Some(old_vm_blk) = old_vm_blk { + self.put_rel_wal_record( + timeline, + vm_rel, + old_vm_blk, + ZenithWalRecord::ClearVisibilityMapFlags { + new_heap_blkno: None, + old_heap_blkno, + flags: pg_constants::VISIBILITYMAP_VALID_BITS, + }, + )?; + } } } } @@ -882,6 +904,21 @@ impl<'a, R: Repository> WalIngest<'a, R> { Ok(()) } + fn get_relsize(&mut self, rel: RelTag) -> Result { + if let Some(nblocks) = self.relsize_cache.get(&rel) { + Ok(*nblocks) + } else { + let last_lsn = self.timeline.get_last_record_lsn(); + let nblocks = if !self.timeline.get_rel_exists(rel, last_lsn)? { + 0 + } else { + self.timeline.get_rel_size(rel, last_lsn)? + }; + self.relsize_cache.insert(rel, nblocks); + Ok(nblocks) + } + } + fn handle_rel_extend( &mut self, writer: &mut DatadirTimelineWriter, From f06707badcb29f61ecc0f4bd0d48c430616cca67 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Sun, 13 Mar 2022 01:15:32 +0200 Subject: [PATCH 33/55] Bugfix: a few constant keys were missing from collect_keyspace As a result, you got "could not find data for key" errors. 
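
To illustrate the failure mode with a hypothetical, simplified sketch
(the real key space types live in pgdatadir_mapping.rs): compaction
only preserves the keys that collect_keyspace() reports, so a key that
is written but never collected disappears from the new layers, and the
next read of it fails:

    use std::collections::{BTreeMap, BTreeSet};

    /// Compaction keeps only the keys that collect_keyspace() reported.
    fn compact(data: &mut BTreeMap<u64, Vec<u8>>, keyspace: &BTreeSet<u64>) {
        data.retain(|key, _| keyspace.contains(key));
    }

    fn get(data: &BTreeMap<u64, Vec<u8>>, key: u64) -> Result<&Vec<u8>, String> {
        // A directory key (e.g. TWOPHASEDIR_KEY) that was written but
        // never collected has been dropped by compact() and fails here.
        data.get(&key)
            .ok_or_else(|| format!("could not find data for key {}", key))
    }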
---
 pageserver/src/pgdatadir_mapping.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 43876760a3..3337b2e6d4 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -376,6 +376,7 @@ impl<R: Repository> DatadirTimeline<R> {
             SlruKind::MultiXactOffsets,
         ] {
             let slrudir_key = slru_dir_to_key(kind);
+            result.add_key(slrudir_key);
             let buf = self.tline.get(slrudir_key, lsn)?;
             let dir = SlruSegmentDirectory::des(&buf)?;
             let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
@@ -393,6 +394,7 @@ impl<R: Repository> DatadirTimeline<R> {
         }

         // Then pg_twophase
+        result.add_key(TWOPHASEDIR_KEY);
         let buf = self.tline.get(TWOPHASEDIR_KEY, lsn)?;
         let twophase_dir = TwoPhaseDirectory::des(&buf)?;
         let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();

From 2d8587f67d22ff66e14024eeb6912d1c633df0a2 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Mon, 14 Mar 2022 11:37:22 +0200
Subject: [PATCH 34/55] Separate flushing in-memory layer to disk from
 checkpoints.

When 'checkpoint_distance' is reached, freeze the current in-memory
layer directly in the WAL receiver thread, and launch a separate "layer
flushing thread" to flush the frozen layer to disk. This leaves only
the compaction duty to the checkpoint thread.
---
 pageserver/src/layered_repository.rs          | 243 +++++++++++-------
 .../src/layered_repository/inmemory_layer.rs  |  22 --
 .../src/layered_repository/layer_map.rs       |   3 +-
 pageserver/src/lib.rs                         |   2 +-
 pageserver/src/pgdatadir_mapping.rs           |   2 +-
 pageserver/src/repository.rs                  |  31 ++-
 pageserver/src/tenant_threads.rs              |   2 +-
 pageserver/src/thread_mgr.rs                  |   3 +
 pageserver/src/walreceiver.rs                 |   2 +
 9 files changed, 175 insertions(+), 135 deletions(-)

diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs
index cc8ecd9275..f195288b9a 100644
--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
@@ -29,7 +29,7 @@ use std::io::Write;
 use std::ops::{Bound::Included, Deref, Range};
 use std::path::{Path, PathBuf};
 use std::sync::atomic::{self, AtomicBool};
-use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard};
+use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard, TryLockError};
 use std::time::Instant;

 use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME};
@@ -720,6 +720,8 @@ pub struct LayeredTimeline {

     layers: Mutex<LayerMap>,

+    last_freeze_at: AtomicLsn,
+
     // WAL redo manager
     walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,

@@ -768,6 +770,9 @@ pub struct LayeredTimeline {
     /// to avoid deadlock.
     write_lock: Mutex<()>,

+    /// Used to ensure that only one thread is flushing frozen layers at a time.
+    layer_flush_lock: Mutex<()>,
+
     // Prevent concurrent checkpoints.
     // Checkpoints are normally performed by one thread. But checkpoint can also be manually requested by admin
     // (that's used in tests), and shutdown also forces a checkpoint. These forced checkpoints run in a different thread
@@ -854,15 +859,24 @@ impl Timeline for LayeredTimeline {
     /// metrics collection.
fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()> { match cconf { - CheckpointConfig::Flush => self - .flush_checkpoint_time_histo - .observe_closure_duration(|| self.checkpoint_internal(0, false)), - CheckpointConfig::Forced => self - .forced_checkpoint_time_histo - .observe_closure_duration(|| self.checkpoint_internal(0, true)), - CheckpointConfig::Distance(distance) => self + CheckpointConfig::Flush => { + self.flush_checkpoint_time_histo + .observe_closure_duration(|| { + self.freeze_inmem_layer(false); + self.flush_frozen_layers(true) + }) + } + CheckpointConfig::Forced => { + self.forced_checkpoint_time_histo + .observe_closure_duration(|| { + self.freeze_inmem_layer(false); + self.flush_frozen_layers(true)?; + self.checkpoint_internal() + }) + } + CheckpointConfig::Distance => self .checkpoint_time_histo - .observe_closure_duration(|| self.checkpoint_internal(distance, true)), + .observe_closure_duration(|| self.checkpoint_internal()), } } @@ -969,6 +983,8 @@ impl LayeredTimeline { }), disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0), + last_freeze_at: AtomicLsn::new(0), + ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), @@ -980,6 +996,7 @@ impl LayeredTimeline { upload_relishes: AtomicBool::new(upload_relishes), write_lock: Mutex::new(()), + layer_flush_lock: Mutex::new(()), checkpoint_cs: Mutex::new(()), gc_info: RwLock::new(GcInfo { @@ -1100,7 +1117,7 @@ impl LayeredTimeline { let mut result = ValueReconstructResult::Continue; let mut cont_lsn = Lsn(request_lsn.0 + 1); - loop { + 'outer: loop { // The function should have updated 'state' //info!("CALLED for {} at {}: {:?} with {} records", reconstruct_state.key, reconstruct_state.lsn, result, reconstruct_state.records.len()); match result { @@ -1169,7 +1186,7 @@ impl LayeredTimeline { continue; } } - if let Some(frozen_layer) = &layers.frozen_layer { + for frozen_layer in layers.frozen_layers.iter() { let start_lsn = frozen_layer.get_lsn_range().start; if cont_lsn > start_lsn { //info!("CHECKING for {} at {} on frozen layer {}", key, cont_lsn, frozen_layer.filename().display()); @@ -1180,7 +1197,7 @@ impl LayeredTimeline { )?; cont_lsn = start_lsn; path.push((result, cont_lsn, frozen_layer.clone())); - continue; + continue 'outer; } } @@ -1258,7 +1275,7 @@ impl LayeredTimeline { lsn ); let new_layer = - InMemoryLayer::create(self.conf, self.timelineid, self.tenantid, start_lsn, lsn)?; + InMemoryLayer::create(self.conf, self.timelineid, self.tenantid, start_lsn)?; let layer_rc = Arc::new(new_layer); layers.open_layer = Some(Arc::clone(&layer_rc)); @@ -1273,7 +1290,6 @@ impl LayeredTimeline { //info!("PUT: key {} at {}", key, lsn); let layer = self.get_layer_for_write(lsn)?; layer.put_value(key, lsn, val)?; - Ok(()) } @@ -1284,80 +1300,53 @@ impl LayeredTimeline { Ok(()) } + fn finish_write(&self, new_lsn: Lsn) { + assert!(new_lsn.is_aligned()); + + self.last_record_lsn.advance(new_lsn); + } + + fn freeze_inmem_layer(&self, write_lock_held: bool) { + // Freeze the current open in-memory layer. It will be written to disk on next + // iteration. + let _write_guard = if write_lock_held { + None + } else { + Some(self.write_lock.lock().unwrap()) + }; + let mut layers = self.layers.lock().unwrap(); + if let Some(open_layer) = &layers.open_layer { + let open_layer_rc = Arc::clone(open_layer); + // Does this layer need freezing? + let end_lsn = Lsn(self.get_last_record_lsn().0 + 1); + open_layer.freeze(end_lsn); + + // The layer is no longer open, update the layer map to reflect this. 
+ // We will replace it with on-disk historics below. + layers.frozen_layers.push_back(open_layer_rc); + layers.open_layer = None; + layers.next_open_layer_at = Some(end_lsn); + self.last_freeze_at.store(end_lsn); + } + drop(layers); + } + /// /// Flush to disk all data that was written with the put_* functions /// /// NOTE: This has nothing to do with checkpoint in PostgreSQL. - fn checkpoint_internal(&self, checkpoint_distance: u64, reconstruct_pages: bool) -> Result<()> { + fn checkpoint_internal(&self) -> Result<()> { info!("checkpoint starting"); // Prevent concurrent checkpoints + // FIXME: This does compaction now, not the flushing of layers. + // Is this lock still needed? let _checkpoint_cs = self.checkpoint_cs.lock().unwrap(); - // If the in-memory layer is larger than 'checkpoint_distance', write it - // to a delta file. That's necessary to limit the amount of WAL that - // needs to be kept in the safekeepers, and that needs to be reprocessed - // on page server crash. - // - // TODO: It's not a great policy for keeping memory usage in check, - // though. We should also aim at flushing layers that consume a lot of - // memory and/or aren't receiving much updates anymore. - loop { - // Do we have a frozen in-memory layer that we need to write out? - // If we do, write it out now. Otherwise, check if the current - // in-memory layer is old enough that we should freeze and write it out. - let write_guard = self.write_lock.lock().unwrap(); - let mut layers = self.layers.lock().unwrap(); - if let Some(frozen_layer) = &layers.frozen_layer { - // Write out the frozen in-memory layer to disk, as a delta file - let frozen_layer = Arc::clone(frozen_layer); - drop(write_guard); - drop(layers); - self.flush_frozen_layer(frozen_layer)?; - } else { - // Freeze the current open in-memory layer, if it's larger than - // 'checkpoint_distance'. It will be written to disk on next - // iteration. - if let Some(open_layer) = &layers.open_layer { - // Does this layer need freezing? - let RecordLsn { - last: last_record_lsn, - prev: _prev_record_lsn, - } = self.last_record_lsn.load(); - let oldest_lsn = open_layer.get_oldest_lsn(); - let distance = last_record_lsn.widening_sub(oldest_lsn); - if distance < 0 || distance < checkpoint_distance.into() { - info!( - "the oldest layer is now {} which is {} bytes behind last_record_lsn", - open_layer.filename().display(), - distance - ); - break; - } - let end_lsn = Lsn(self.get_last_record_lsn().0 + 1); - open_layer.freeze(end_lsn); - - // The layer is no longer open, update the layer map to reflect this. - // We will replace it with on-disk historics below. - layers.frozen_layer = Some(Arc::clone(open_layer)); - layers.open_layer = None; - layers.next_open_layer_at = Some(end_lsn); - } else { - break; - } - // We will write the now-frozen layer to disk on next iteration. - // That could take a while, so release the lock while do it - drop(layers); - drop(write_guard); - } - } - // Create new image layers to allow GC and to reduce read latency - if reconstruct_pages { - // TODO: the threshold for how often we create image layers is - // currently hard-coded at 3. It means, write out a new image layer, - // if there are at least three delta layers on top of it. - self.compact(TARGET_FILE_SIZE_BYTES as usize)?; - } + // TODO: the threshold for how often we create image layers is + // currently hard-coded at 3. It means, write out a new image layer, + // if there are at least three delta layers on top of it. 
+        self.compact(TARGET_FILE_SIZE_BYTES as usize)?;

         // TODO: We should also compact existing delta layers here.

@@ -1373,20 +1362,86 @@ impl LayeredTimeline {
         Ok(())
     }

+    pub fn check_checkpoint_distance(self: &Arc<LayeredTimeline>) -> Result<()> {
+        let last_lsn = self.get_last_record_lsn();
+
+        let distance = last_lsn.widening_sub(self.last_freeze_at.load());
+        if distance >= self.conf.checkpoint_distance.into() {
+            self.freeze_inmem_layer(true);
+            self.last_freeze_at.store(last_lsn);
+        }
+        if let Ok(guard) = self.layer_flush_lock.try_lock() {
+            drop(guard);
+            let self_clone = Arc::clone(self);
+            thread_mgr::spawn(
+                thread_mgr::ThreadKind::LayerFlushThread,
+                Some(self.tenantid),
+                Some(self.timelineid),
+                "layer flush thread",
+                move || self_clone.flush_frozen_layers(false),
+            )?;
+        }
+        Ok(())
+    }
+
+    /// Flush all frozen layers to disk.
+    ///
+    /// Only one thread at a time can be doing layer-flushing for a
+    /// given timeline. If 'wait' is true, and another thread is
+    /// currently doing the flushing, this function will wait for it
+    /// to finish. If 'wait' is false, this function will return
+    /// immediately instead.
+    fn flush_frozen_layers(&self, wait: bool) -> Result<()> {
+        let flush_lock_guard = if wait {
+            self.layer_flush_lock.lock().unwrap()
+        } else {
+            match self.layer_flush_lock.try_lock() {
+                Ok(guard) => guard,
+                Err(TryLockError::WouldBlock) => return Ok(()),
+                Err(TryLockError::Poisoned(err)) => panic!("{:?}", err),
+            }
+        };
+
+        loop {
+            let layers = self.layers.lock().unwrap();
+            if let Some(frozen_layer) = layers.frozen_layers.front() {
+                let frozen_layer = Arc::clone(frozen_layer);
+                drop(layers); // to allow concurrent reads and writes
+                self.flush_frozen_layer(frozen_layer)?;
+            } else {
+                // Drop the 'layer_flush_lock' *before* 'layers'. That
+                // way, if you freeze a layer, and then call
+                // flush_frozen_layers(false), it is guaranteed that
+                // if another thread was busy flushing layers and the
+                // call therefore returns immediately, the other
+                // thread will have seen the newly-frozen layer and
+                // will flush that too (assuming no errors).
+                drop(flush_lock_guard);
+                drop(layers);
+                break;
+            }
+        }
+
+        Ok(())
+    }
+
     fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> Result<()> {
         // Do we have a frozen in-memory layer that we need to write out?
         let new_delta = frozen_layer.write_to_disk()?;

         // Finally, replace the frozen in-memory layer with the new on-disk layers
-        let write_guard = self.write_lock.lock().unwrap();
         let mut layers = self.layers.lock().unwrap();
-        layers.frozen_layer = None;
+        let l = layers.frozen_layers.pop_front();
+
+        // Only one thread may call this function at a time (for this
+        // timeline). If two threads tried to flush the same frozen
+        // layer to disk at the same time, that would not work.
+        assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer));

         // Add the new delta layer to the LayerMap
         let mut layer_paths = vec![new_delta.path()];
         layers.insert_historic(Arc::new(new_delta));

-        drop(write_guard);
         drop(layers);

         // Sync layers
@@ -1929,10 +1984,8 @@ impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> {
     ///
     /// Remember the (end of) last valid WAL record remembered in the timeline.
/// - fn advance_last_record_lsn(&self, new_lsn: Lsn) { - assert!(new_lsn.is_aligned()); - - self.tl.last_record_lsn.advance(new_lsn); + fn finish_write(&self, new_lsn: Lsn) { + self.tl.finish_write(new_lsn); } } @@ -2031,7 +2084,7 @@ mod tests { let writer = tline.writer(); writer.put(TEST_KEY, Lsn(0x10), Value::Image(TEST_IMG("foo at 0x10")))?; - writer.advance_last_record_lsn(Lsn(0x10)); + writer.finish_write(Lsn(0x10)); drop(writer); tline.checkpoint(CheckpointConfig::Forced)?; @@ -2039,7 +2092,7 @@ mod tests { let writer = tline.writer(); writer.put(TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?; - writer.advance_last_record_lsn(Lsn(0x20)); + writer.finish_write(Lsn(0x20)); drop(writer); tline.checkpoint(CheckpointConfig::Forced)?; @@ -2047,7 +2100,7 @@ mod tests { let writer = tline.writer(); writer.put(TEST_KEY, Lsn(0x30), Value::Image(TEST_IMG("foo at 0x30")))?; - writer.advance_last_record_lsn(Lsn(0x30)); + writer.finish_write(Lsn(0x30)); drop(writer); tline.checkpoint(CheckpointConfig::Forced)?; @@ -2055,7 +2108,7 @@ mod tests { let writer = tline.writer(); writer.put(TEST_KEY, Lsn(0x40), Value::Image(TEST_IMG("foo at 0x40")))?; - writer.advance_last_record_lsn(Lsn(0x40)); + writer.finish_write(Lsn(0x40)); drop(writer); tline.checkpoint(CheckpointConfig::Forced)?; @@ -2094,7 +2147,7 @@ mod tests { lsn, Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), )?; - writer.advance_last_record_lsn(lsn); + writer.finish_write(lsn); drop(writer); keyspace.add_key(test_key); @@ -2145,7 +2198,7 @@ mod tests { lsn, Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), )?; - writer.advance_last_record_lsn(lsn); + writer.finish_write(lsn); updated[blknum] = lsn; drop(writer); @@ -2167,7 +2220,7 @@ mod tests { Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), )?; println!("updating {} at {}", blknum, lsn); - writer.advance_last_record_lsn(lsn); + writer.finish_write(lsn); drop(writer); updated[blknum] = lsn; } @@ -2219,7 +2272,7 @@ mod tests { lsn, Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), )?; - writer.advance_last_record_lsn(lsn); + writer.finish_write(lsn); updated[blknum] = lsn; drop(writer); @@ -2253,7 +2306,7 @@ mod tests { Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))), )?; println!("updating {} at {}", blknum, lsn); - writer.advance_last_record_lsn(lsn); + writer.finish_write(lsn); drop(writer); updated[blknum] = lsn; } diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index c623630851..577e562115 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -34,21 +34,6 @@ pub struct InMemoryLayer { /// start_lsn: Lsn, - /// - /// LSN of the oldest value stored in this layer. - /// - /// This is different from 'start_lsn' in that we enforce that the 'start_lsn' - /// of a layer always matches the 'end_lsn' of its predecessor, even if there - /// are no page versions until at a later LSN. That way you can detect any - /// missing layer files more easily. 'oldest_lsn' is the first page version - /// actually stored in this layer. In the range between 'start_lsn' and - /// 'oldest_lsn', there are no changes to the segment. - /// 'oldest_lsn' is used to adjust 'disk_consistent_lsn' and that is why it should - /// point to the beginning of WAL record. This is the other difference with 'start_lsn' - /// which points to end of WAL record. This is why 'oldest_lsn' can be smaller than 'start_lsn'. 
- /// - oldest_lsn: Lsn, - /// The above fields never change. The parts that do change are in 'inner', /// and protected by mutex. inner: RwLock, @@ -236,11 +221,6 @@ impl Layer for InMemoryLayer { } impl InMemoryLayer { - /// Return the oldest page version that's stored in this layer - pub fn get_oldest_lsn(&self) -> Lsn { - self.oldest_lsn - } - /// /// Create a new, empty, in-memory layer /// @@ -249,7 +229,6 @@ impl InMemoryLayer { timelineid: ZTimelineId, tenantid: ZTenantId, start_lsn: Lsn, - oldest_lsn: Lsn, ) -> Result { trace!( "initializing new empty InMemoryLayer for writing on timeline {} at {}", @@ -264,7 +243,6 @@ impl InMemoryLayer { timelineid, tenantid, start_lsn, - oldest_lsn, inner: RwLock::new(InMemoryLayerInner { end_lsn: None, index: HashMap::new(), diff --git a/pageserver/src/layered_repository/layer_map.rs b/pageserver/src/layered_repository/layer_map.rs index 27a3eb279a..4b0d950414 100644 --- a/pageserver/src/layered_repository/layer_map.rs +++ b/pageserver/src/layered_repository/layer_map.rs @@ -16,6 +16,7 @@ use crate::layered_repository::InMemoryLayer; use crate::repository::Key; use anyhow::Result; use lazy_static::lazy_static; +use std::collections::VecDeque; use std::ops::Range; use std::sync::Arc; use tracing::*; @@ -47,7 +48,7 @@ pub struct LayerMap { /// layer is during checkpointing, when an InMemoryLayer is being written out /// to disk. /// - pub frozen_layer: Option>, + pub frozen_layers: VecDeque>, /// All the historic layers are kept here diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 904b1d3819..3c557e4e82 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -42,7 +42,7 @@ pub const LOG_FILE_NAME: &str = "pageserver.log"; #[derive(Debug, Clone, Copy)] pub enum CheckpointConfig { // Flush in-memory data that is older than this - Distance(u64), + Distance, // Flush all in-memory data Flush, // Flush all in-memory data and reconstruct all page images diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 3337b2e6d4..9be8e658ca 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -798,7 +798,7 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { writer.delete(key_range.clone(), self.lsn)?; } - writer.advance_last_record_lsn(self.lsn); + writer.finish_write(self.lsn); if last_partitioning == Lsn(0) || self.lsn.0 - last_partitioning.0 > TARGET_FILE_SIZE_BYTES / 8 diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 322465c2a7..03fbff42a8 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -401,11 +401,14 @@ pub trait TimelineWriter<'a> { fn delete(&self, key_range: Range, lsn: Lsn) -> Result<()>; - /// Track end of the latest digested WAL record. + /// Track the end of the latest digested WAL record. /// - /// Advance requires aligned LSN as an argument and would wake wait_lsn() callers. - /// Previous last record LSN is stored alongside the latest and can be read. - fn advance_last_record_lsn(&self, lsn: Lsn); + /// Call this after you have finished writing all the WAL up to 'lsn'. + /// + /// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for + /// the 'lsn' or anything older. The previous last record LSN is stored alongside + /// the latest and can be read. 
+ fn finish_write(&self, lsn: Lsn); } #[cfg(test)] @@ -554,12 +557,12 @@ mod tests { let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), Value::Image(TEST_IMG("foo at 0x10")))?; - writer.advance_last_record_lsn(Lsn(0x10)); + writer.finish_write(Lsn(0x10)); drop(writer); let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?; - writer.advance_last_record_lsn(Lsn(0x20)); + writer.finish_write(Lsn(0x20)); drop(writer); assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); @@ -594,12 +597,12 @@ mod tests { // Insert a value on the timeline writer.put(TEST_KEY_A, Lsn(0x20), test_value("foo at 0x20"))?; writer.put(TEST_KEY_B, Lsn(0x20), test_value("foobar at 0x20"))?; - writer.advance_last_record_lsn(Lsn(0x20)); + writer.finish_write(Lsn(0x20)); writer.put(TEST_KEY_A, Lsn(0x30), test_value("foo at 0x30"))?; - writer.advance_last_record_lsn(Lsn(0x30)); + writer.finish_write(Lsn(0x30)); writer.put(TEST_KEY_A, Lsn(0x40), test_value("foo at 0x40"))?; - writer.advance_last_record_lsn(Lsn(0x40)); + writer.finish_write(Lsn(0x40)); //assert_current_logical_size(&tline, Lsn(0x40)); @@ -611,7 +614,7 @@ mod tests { }; let new_writer = newtline.writer(); new_writer.put(TEST_KEY_A, Lsn(0x40), test_value("bar at 0x40"))?; - new_writer.advance_last_record_lsn(Lsn(0x40)); + new_writer.finish_write(Lsn(0x40)); // Check page contents on both branches assert_eq!( @@ -643,14 +646,14 @@ mod tests { lsn, Value::Image(TEST_IMG(&format!("foo at {}", lsn))), )?; - writer.advance_last_record_lsn(lsn); + writer.finish_write(lsn); lsn += 0x10; writer.put( *TEST_KEY, lsn, Value::Image(TEST_IMG(&format!("foo at {}", lsn))), )?; - writer.advance_last_record_lsn(lsn); + writer.finish_write(lsn); lsn += 0x10; } tline.checkpoint(CheckpointConfig::Forced)?; @@ -661,14 +664,14 @@ mod tests { lsn, Value::Image(TEST_IMG(&format!("foo at {}", lsn))), )?; - writer.advance_last_record_lsn(lsn); + writer.finish_write(lsn); lsn += 0x10; writer.put( *TEST_KEY, lsn, Value::Image(TEST_IMG(&format!("foo at {}", lsn))), )?; - writer.advance_last_record_lsn(lsn); + writer.finish_write(lsn); } tline.checkpoint(CheckpointConfig::Forced) } diff --git a/pageserver/src/tenant_threads.rs b/pageserver/src/tenant_threads.rs index a6711f0542..c7fe625ecf 100644 --- a/pageserver/src/tenant_threads.rs +++ b/pageserver/src/tenant_threads.rs @@ -34,7 +34,7 @@ fn checkpoint_loop_ext(tenantid: ZTenantId, conf: &'static PageServerConf) -> Re // checkpoint timelines that have accumulated more than CHECKPOINT_DISTANCE // bytes of WAL since last checkpoint. let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - repo.checkpoint_iteration(CheckpointConfig::Distance(conf.checkpoint_distance))?; + repo.checkpoint_iteration(CheckpointConfig::Distance)?; } trace!( diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index a51f0909ca..ec3606b41e 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -98,6 +98,9 @@ pub enum ThreadKind { // Thread that handles GC of a tenant GarbageCollector, + // FIXME + LayerFlushThread, + // Thread for synchronizing pageserver relish data with the remote storage. // Shared by all tenants. 
StorageSync, diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs index fd318b9cb7..993768fbac 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/walreceiver.rs @@ -266,6 +266,8 @@ fn walreceiver_main( caught_up = true; } + timeline.tline.check_checkpoint_distance()?; + Some(endlsn) } From 09f2dff5370029ad421b9bd61c47d2ad5df43bd9 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 14 Mar 2022 13:22:46 +0200 Subject: [PATCH 35/55] Refactor the checkpoint and compaction functions. The concept of a "checkpoint" had become quite muddled. This tries to clarify it again. --- pageserver/src/config.rs | 37 +++--- pageserver/src/layered_repository.rs | 171 ++++++++++++++------------- pageserver/src/lib.rs | 2 - pageserver/src/page_service.rs | 7 ++ pageserver/src/pgdatadir_mapping.rs | 7 +- pageserver/src/repository.rs | 11 +- pageserver/src/tenant_mgr.rs | 21 ++-- pageserver/src/tenant_threads.rs | 24 ++-- pageserver/src/thread_mgr.rs | 4 +- pageserver/src/timelines.rs | 2 +- 10 files changed, 158 insertions(+), 128 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index dc85c83c17..0bfc451a24 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -31,7 +31,8 @@ pub mod defaults { // would be more appropriate. But a low value forces the code to be exercised more, // which is good for now to trigger bugs. pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; - pub const DEFAULT_CHECKPOINT_PERIOD: &str = "1 s"; + + pub const DEFAULT_COMPACTION_PERIOD: &str = "1 s"; pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; pub const DEFAULT_GC_PERIOD: &str = "100 s"; @@ -57,7 +58,7 @@ pub mod defaults { #listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}' #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes -#checkpoint_period = '{DEFAULT_CHECKPOINT_PERIOD}' +#compaction_period = '{DEFAULT_COMPACTION_PERIOD}' #gc_period = '{DEFAULT_GC_PERIOD}' #gc_horizon = {DEFAULT_GC_HORIZON} @@ -91,7 +92,9 @@ pub struct PageServerConf { // This puts a backstop on how much WAL needs to be re-digested if the // page server crashes. pub checkpoint_distance: u64, - pub checkpoint_period: Duration, + + // How often to check if there's compaction work to be done. 
+ pub compaction_period: Duration, pub gc_horizon: u64, pub gc_period: Duration, @@ -145,7 +148,8 @@ struct PageServerConfigBuilder { listen_http_addr: BuilderValue, checkpoint_distance: BuilderValue, - checkpoint_period: BuilderValue, + + compaction_period: BuilderValue, gc_horizon: BuilderValue, gc_period: BuilderValue, @@ -179,8 +183,8 @@ impl Default for PageServerConfigBuilder { listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()), listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()), checkpoint_distance: Set(DEFAULT_CHECKPOINT_DISTANCE), - checkpoint_period: Set(humantime::parse_duration(DEFAULT_CHECKPOINT_PERIOD) - .expect("cannot parse default checkpoint period")), + compaction_period: Set(humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) + .expect("cannot parse default compaction period")), gc_horizon: Set(DEFAULT_GC_HORIZON), gc_period: Set(humantime::parse_duration(DEFAULT_GC_PERIOD) .expect("cannot parse default gc period")), @@ -216,8 +220,8 @@ impl PageServerConfigBuilder { self.checkpoint_distance = BuilderValue::Set(checkpoint_distance) } - pub fn checkpoint_period(&mut self, checkpoint_period: Duration) { - self.checkpoint_period = BuilderValue::Set(checkpoint_period) + pub fn compaction_period(&mut self, compaction_period: Duration) { + self.compaction_period = BuilderValue::Set(compaction_period) } pub fn gc_horizon(&mut self, gc_horizon: u64) { @@ -286,9 +290,9 @@ impl PageServerConfigBuilder { checkpoint_distance: self .checkpoint_distance .ok_or(anyhow::anyhow!("missing checkpoint_distance"))?, - checkpoint_period: self - .checkpoint_period - .ok_or(anyhow::anyhow!("missing checkpoint_period"))?, + compaction_period: self + .compaction_period + .ok_or(anyhow::anyhow!("missing compaction_period"))?, gc_horizon: self .gc_horizon .ok_or(anyhow::anyhow!("missing gc_horizon"))?, @@ -425,7 +429,7 @@ impl PageServerConf { "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?), "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?), "checkpoint_distance" => builder.checkpoint_distance(parse_toml_u64(key, item)?), - "checkpoint_period" => builder.checkpoint_period(parse_toml_duration(key, item)?), + "compaction_period" => builder.compaction_period(parse_toml_duration(key, item)?), "gc_horizon" => builder.gc_horizon(parse_toml_u64(key, item)?), "gc_period" => builder.gc_period(parse_toml_duration(key, item)?), "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?), @@ -561,7 +565,7 @@ impl PageServerConf { PageServerConf { id: ZNodeId(0), checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, - checkpoint_period: Duration::from_secs(10), + compaction_period: Duration::from_secs(10), gc_horizon: defaults::DEFAULT_GC_HORIZON, gc_period: Duration::from_secs(10), wait_lsn_timeout: Duration::from_secs(60), @@ -631,7 +635,8 @@ listen_pg_addr = '127.0.0.1:64000' listen_http_addr = '127.0.0.1:9898' checkpoint_distance = 111 # in bytes -checkpoint_period = '111 s' + +compaction_period = '111 s' gc_period = '222 s' gc_horizon = 222 @@ -668,7 +673,7 @@ id = 10 listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE, - checkpoint_period: humantime::parse_duration(defaults::DEFAULT_CHECKPOINT_PERIOD)?, + compaction_period: humantime::parse_duration(defaults::DEFAULT_COMPACTION_PERIOD)?, gc_horizon: defaults::DEFAULT_GC_HORIZON, gc_period: 
humantime::parse_duration(defaults::DEFAULT_GC_PERIOD)?, wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?, @@ -712,7 +717,7 @@ id = 10 listen_pg_addr: "127.0.0.1:64000".to_string(), listen_http_addr: "127.0.0.1:9898".to_string(), checkpoint_distance: 111, - checkpoint_period: Duration::from_secs(111), + compaction_period: Duration::from_secs(111), gc_horizon: 222, gc_period: Duration::from_secs(222), wait_lsn_timeout: Duration::from_secs(111), diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index f195288b9a..2c48295ad0 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -72,8 +72,6 @@ use layer_map::LayerMap; use layer_map::SearchResult; use storage_layer::{Layer, ValueReconstructResult, ValueReconstructState}; -use crate::keyspace::TARGET_FILE_SIZE_BYTES; - // re-export this function so that page_cache.rs can use it. pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file; @@ -104,7 +102,7 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines"; /// Repository consists of multiple timelines. Keep them in a hash table. /// pub struct LayeredRepository { - conf: &'static PageServerConf, + pub conf: &'static PageServerConf, tenantid: ZTenantId, timelines: Mutex>, // This mutex prevents creation of new timelines during GC. @@ -246,23 +244,58 @@ impl Repository for LayeredRepository { }) } - fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()> { + fn compaction_iteration(&self) -> Result<()> { // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the - // checkpoints. We don't want to block everything else while the - // checkpoint runs. + // compactions. We don't want to block everything else while the + // compaction runs. let timelines = self.timelines.lock().unwrap(); - let timelines_to_checkpoint = timelines + let timelines_to_compact = timelines .iter() .map(|(timelineid, timeline)| (*timelineid, timeline.clone())) .collect::>(); drop(timelines); - for (timelineid, timeline) in &timelines_to_checkpoint { + for (timelineid, timeline) in &timelines_to_compact { + let _entered = + info_span!("compact", timeline = %timelineid, tenant = %self.tenantid).entered(); + match timeline { + LayeredTimelineEntry::Local(timeline) => { + timeline.compact()?; + } + LayeredTimelineEntry::Remote { .. } => { + debug!("Cannot compact remote timeline {}", timelineid) + } + } + } + + Ok(()) + } + + /// + /// Flush all in-memory data to disk. + /// + /// Used at shutdown. + /// + fn checkpoint(&self) -> Result<()> { + // Scan through the hashmap and collect a list of all the timelines, + // while holding the lock. Then drop the lock and actually perform the + // checkpoints. We don't want to block everything else while the + // checkpoint runs. + let timelines = self.timelines.lock().unwrap(); + let timelines_to_compact = timelines + .iter() + .map(|(timelineid, timeline)| (*timelineid, timeline.clone())) + .collect::>(); + drop(timelines); + + for (timelineid, timeline) in &timelines_to_compact { let _entered = info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenantid).entered(); match timeline { - LayeredTimelineEntry::Local(timeline) => timeline.checkpoint(cconf)?, + LayeredTimelineEntry::Local(timeline) => { + timeline.checkpoint(CheckpointConfig::Flush)?; + } LayeredTimelineEntry::Remote { .. 
} => debug!( "Cannot run the checkpoint for remote timeline {}", timelineid @@ -756,9 +789,9 @@ pub struct LayeredTimeline { // Metrics histograms reconstruct_time_histo: Histogram, - checkpoint_time_histo: Histogram, - flush_checkpoint_time_histo: Histogram, - forced_checkpoint_time_histo: Histogram, + flush_time_histo: Histogram, + compact_time_histo: Histogram, + create_images_time_histo: Histogram, /// If `true`, will backup its files that appear after each checkpointing to the remote storage. upload_relishes: AtomicBool, @@ -860,23 +893,14 @@ impl Timeline for LayeredTimeline { fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()> { match cconf { CheckpointConfig::Flush => { - self.flush_checkpoint_time_histo - .observe_closure_duration(|| { - self.freeze_inmem_layer(false); - self.flush_frozen_layers(true) - }) + self.freeze_inmem_layer(false); + self.flush_frozen_layers(true) } CheckpointConfig::Forced => { - self.forced_checkpoint_time_histo - .observe_closure_duration(|| { - self.freeze_inmem_layer(false); - self.flush_frozen_layers(true)?; - self.checkpoint_internal() - }) + self.freeze_inmem_layer(false); + self.flush_frozen_layers(true)?; + self.compact() } - CheckpointConfig::Distance => self - .checkpoint_time_histo - .observe_closure_duration(|| self.checkpoint_internal()), } } @@ -946,23 +970,23 @@ impl LayeredTimeline { let reconstruct_time_histo = RECONSTRUCT_TIME .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()]) .unwrap(); - let checkpoint_time_histo = STORAGE_TIME + let flush_time_histo = STORAGE_TIME .get_metric_with_label_values(&[ - "checkpoint", + "layer flush", &tenantid.to_string(), &timelineid.to_string(), ]) .unwrap(); - let flush_checkpoint_time_histo = STORAGE_TIME + let compact_time_histo = STORAGE_TIME .get_metric_with_label_values(&[ - "flush checkpoint", + "compact", &tenantid.to_string(), &timelineid.to_string(), ]) .unwrap(); - let forced_checkpoint_time_histo = STORAGE_TIME + let create_images_time_histo = STORAGE_TIME .get_metric_with_label_values(&[ - "forced checkpoint", + "create images", &tenantid.to_string(), &timelineid.to_string(), ]) @@ -989,9 +1013,9 @@ impl LayeredTimeline { ancestor_lsn: metadata.ancestor_lsn(), reconstruct_time_histo, - checkpoint_time_histo, - flush_checkpoint_time_histo, - forced_checkpoint_time_histo, + flush_time_histo, + compact_time_histo, + create_images_time_histo, upload_relishes: AtomicBool::new(upload_relishes), @@ -1331,37 +1355,6 @@ impl LayeredTimeline { drop(layers); } - /// - /// Flush to disk all data that was written with the put_* functions - /// - /// NOTE: This has nothing to do with checkpoint in PostgreSQL. - fn checkpoint_internal(&self) -> Result<()> { - info!("checkpoint starting"); - // Prevent concurrent checkpoints - // FIXME: This does compaction now, not the flushing of layers. - // Is this lock still needed? - let _checkpoint_cs = self.checkpoint_cs.lock().unwrap(); - - // Create new image layers to allow GC and to reduce read latency - // TODO: the threshold for how often we create image layers is - // currently hard-coded at 3. It means, write out a new image layer, - // if there are at least three delta layers on top of it. - self.compact(TARGET_FILE_SIZE_BYTES as usize)?; - - // TODO: We should also compact existing delta layers here. - - // Call unload() on all frozen layers, to release memory. - // This shouldn't be much memory, as only metadata is slurped - // into memory. 
- let layers = self.layers.lock().unwrap(); - for layer in layers.iter_historic_layers() { - layer.unload()?; - } - drop(layers); - - Ok(()) - } - pub fn check_checkpoint_distance(self: &Arc) -> Result<()> { let last_lsn = self.get_last_record_lsn(); @@ -1402,6 +1395,8 @@ impl LayeredTimeline { } }; + let timer = self.flush_time_histo.start_timer(); + loop { let layers = self.layers.lock().unwrap(); if let Some(frozen_layer) = layers.frozen_layers.front() { @@ -1422,6 +1417,8 @@ impl LayeredTimeline { } } + timer.stop_and_record(); + Ok(()) } @@ -1525,13 +1522,13 @@ impl LayeredTimeline { Ok(()) } - fn compact(&self, target_file_size: usize) -> Result<()> { + pub fn compact(&self) -> Result<()> { // // High level strategy for compaction / image creation: // // 1. First, calculate the desired "partitioning" of the // currently in-use key space. The goal is to partition the - // key space into TARGET_FILE_SIZE chunks, but also take into + // key space into roughly fixed-size chunks, but also take into // account any existing image layers, and try to align the // chunk boundaries with the existing image layers to avoid // too much churn. Also try to align chunk boundaries with @@ -1561,10 +1558,13 @@ impl LayeredTimeline { // but they are a bit ad hoc and don't quite work like it's explained // above. Rewrite it. + let target_file_size = self.conf.checkpoint_distance; + // 1. The partitioning was already done by the code in // pgdatadir_mapping.rs. We just use it here. let partitioning_guard = self.partitioning.read().unwrap(); if let Some((partitioning, lsn)) = partitioning_guard.as_ref() { + let timer = self.create_images_time_histo.start_timer(); // Make a copy of the partitioning, so that we can release // the lock. Otherwise we could block the WAL receiver. let lsn = *lsn; @@ -1578,12 +1578,25 @@ impl LayeredTimeline { self.create_image_layer(partition, lsn)?; } } + timer.stop_and_record(); // 3. Compact + let timer = self.compact_time_histo.start_timer(); self.compact_level0(target_file_size)?; + timer.stop_and_record(); } else { info!("Could not compact because no partitioning specified yet"); } + + // Call unload() on all frozen layers, to release memory. + // This shouldn't be much memory, as only metadata is slurped + // into memory. + let layers = self.layers.lock().unwrap(); + for layer in layers.iter_historic_layers() { + layer.unload()?; + } + drop(layers); + Ok(()) } @@ -1643,7 +1656,7 @@ impl LayeredTimeline { Ok(()) } - fn compact_level0(&self, target_file_size: usize) -> Result<()> { + fn compact_level0(&self, target_file_size: u64) -> Result<()> { let layers = self.layers.lock().unwrap(); // We compact or "shuffle" the level-0 delta layers when 10 have @@ -1698,7 +1711,7 @@ impl LayeredTimeline { if let Some(prev_key) = prev_key { if key != prev_key && writer.is_some() { let size = writer.as_mut().unwrap().size(); - if size > target_file_size as u64 { + if size > target_file_size { new_layers.push(writer.take().unwrap().finish(prev_key.next())?); writer = None; } @@ -2032,7 +2045,7 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> { /// file format and directory layout. The test here are more low level. /// #[cfg(test)] -mod tests { +pub mod tests { use super::*; use crate::keyspace::KeySpaceAccum; use crate::repository::repo_harness::*; @@ -2072,7 +2085,7 @@ mod tests { // file size is much larger, maybe 1 GB. But a small size makes it // much faster to exercise all the logic for creating the files, // garbage collection, compaction etc. 
- const TEST_FILE_SIZE: usize = 4 * 1024 * 1024; + pub const TEST_FILE_SIZE: u64 = 4 * 1024 * 1024; #[test] fn test_images() -> Result<()> { @@ -2088,7 +2101,7 @@ mod tests { drop(writer); tline.checkpoint(CheckpointConfig::Forced)?; - tline.compact(TEST_FILE_SIZE)?; + tline.compact()?; let writer = tline.writer(); writer.put(TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?; @@ -2096,7 +2109,7 @@ mod tests { drop(writer); tline.checkpoint(CheckpointConfig::Forced)?; - tline.compact(TEST_FILE_SIZE)?; + tline.compact()?; let writer = tline.writer(); writer.put(TEST_KEY, Lsn(0x30), Value::Image(TEST_IMG("foo at 0x30")))?; @@ -2104,7 +2117,7 @@ mod tests { drop(writer); tline.checkpoint(CheckpointConfig::Forced)?; - tline.compact(TEST_FILE_SIZE)?; + tline.compact()?; let writer = tline.writer(); writer.put(TEST_KEY, Lsn(0x40), Value::Image(TEST_IMG("foo at 0x40")))?; @@ -2112,7 +2125,7 @@ mod tests { drop(writer); tline.checkpoint(CheckpointConfig::Forced)?; - tline.compact(TEST_FILE_SIZE)?; + tline.compact()?; assert_eq!(tline.get(TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10")); assert_eq!(tline.get(TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10")); @@ -2165,7 +2178,7 @@ mod tests { tline.update_gc_info(Vec::new(), cutoff); tline.checkpoint(CheckpointConfig::Forced)?; - tline.compact(TEST_FILE_SIZE)?; + tline.compact()?; tline.gc()?; } @@ -2239,7 +2252,7 @@ mod tests { let cutoff = tline.get_last_record_lsn(); tline.update_gc_info(Vec::new(), cutoff); tline.checkpoint(CheckpointConfig::Forced)?; - tline.compact(TEST_FILE_SIZE)?; + tline.compact()?; tline.gc()?; } @@ -2325,7 +2338,7 @@ mod tests { let cutoff = tline.get_last_record_lsn(); tline.update_gc_info(Vec::new(), cutoff); tline.checkpoint(CheckpointConfig::Forced)?; - tline.compact(TEST_FILE_SIZE)?; + tline.compact()?; tline.gc()?; } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 3c557e4e82..fd38ba9d70 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -41,8 +41,6 @@ pub const LOG_FILE_NAME: &str = "pageserver.log"; /// Config for the Repository checkpointer #[derive(Debug, Clone, Copy)] pub enum CheckpointConfig { - // Flush in-memory data that is older than this - Distance, // Flush all in-memory data Flush, // Flush all in-memory data and reconstruct all page images diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 69ef4b9868..947a2d72ab 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -729,6 +729,13 @@ impl postgres_backend::Handler for PageServerHandler { .context("Failed to fetch local timeline for checkpoint request")?; timeline.tline.checkpoint(CheckpointConfig::Forced)?; + + // Also compact it. + // + // FIXME: This probably shouldn't be part of a "checkpoint" command, but a + // separate operation. Update the tests if you change this. + timeline.tline.compact()?; + pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? 
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } else { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 9be8e658ca..c4661ad2d6 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -35,6 +35,8 @@ where pub tline: Arc, pub last_partitioning: AtomicLsn, pub current_logical_size: AtomicIsize, + + pub repartition_threshold: u64, } #[derive(Debug, Serialize, Deserialize)] @@ -71,11 +73,12 @@ pub struct SlruSegmentDirectory { static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); impl DatadirTimeline { - pub fn new(tline: Arc) -> Self { + pub fn new(tline: Arc, repartition_threshold: u64) -> Self { DatadirTimeline { tline, last_partitioning: AtomicLsn::new(0), current_logical_size: AtomicIsize::new(0), + repartition_threshold, } } @@ -1178,7 +1181,7 @@ pub fn create_test_timeline( timeline_id: zenith_utils::zid::ZTimelineId, ) -> Result>> { let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?; - let tline = DatadirTimeline::new(tline); + let tline = DatadirTimeline::new(tline, crate::layered_repository::tests::TEST_FILE_SIZE / 10); let mut writer = tline.begin_record(Lsn(8)); writer.init_empty()?; diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 03fbff42a8..56bd5208ca 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -194,6 +194,11 @@ pub trait Repository: Send + Sync { /// Branch a timeline fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>; + /// Flush all data to disk. + /// + /// this is used at graceful shutdown. + fn checkpoint(&self) -> Result<()>; + /// perform one garbage collection iteration, removing old data files from disk. /// this function is periodically called by gc thread. /// also it can be explicitly requested through page server api 'do_gc' command. @@ -210,9 +215,9 @@ pub trait Repository: Send + Sync { checkpoint_before_gc: bool, ) -> Result; - /// perform one checkpoint iteration, flushing in-memory data on disk. - /// this function is periodically called by checkponter thread. - fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()>; + /// perform one compaction iteration. + /// this function is periodically called by compactor thread. + fn compaction_iteration(&self) -> Result<()>; } /// A timeline, that belongs to the current repository. diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index c0350fa288..38ac1a8bc4 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -9,7 +9,6 @@ use crate::thread_mgr; use crate::thread_mgr::ThreadKind; use crate::timelines; use crate::walredo::PostgresRedoManager; -use crate::CheckpointConfig; use crate::{DatadirTimelineImpl, RepositoryImpl}; use anyhow::{Context, Result}; use lazy_static::lazy_static; @@ -152,7 +151,7 @@ pub fn shutdown_all_tenants() { thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiver), None, None); thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), None, None); - thread_mgr::shutdown_threads(Some(ThreadKind::Checkpointer), None, None); + thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), None, None); // Ok, no background threads running anymore. Flush any remaining data in // memory to disk. 
@@ -166,7 +165,7 @@ pub fn shutdown_all_tenants() { debug!("shutdown tenant {}", tenantid); match get_repository_for_tenant(tenantid) { Ok(repo) => { - if let Err(err) = repo.checkpoint_iteration(CheckpointConfig::Flush) { + if let Err(err) = repo.checkpoint() { error!( "Could not checkpoint tenant {} during shutdown: {:?}", tenantid, err @@ -212,7 +211,7 @@ pub fn get_tenant_state(tenantid: ZTenantId) -> Option { } /// -/// Change the state of a tenant to Active and launch its checkpointer and GC +/// Change the state of a tenant to Active and launch its compactor and GC /// threads. If the tenant was already in Active state or Stopping, does nothing. /// pub fn activate_tenant(conf: &'static PageServerConf, tenantid: ZTenantId) -> Result<()> { @@ -227,18 +226,18 @@ pub fn activate_tenant(conf: &'static PageServerConf, tenantid: ZTenantId) -> Re // If the tenant is already active, nothing to do. TenantState::Active => {} - // If it's Idle, launch the checkpointer and GC threads + // If it's Idle, launch the compactor and GC threads TenantState::Idle => { thread_mgr::spawn( - ThreadKind::Checkpointer, + ThreadKind::Compactor, Some(tenantid), None, - "Checkpointer thread", - move || crate::tenant_threads::checkpoint_loop(tenantid, conf), + "Compactor thread", + move || crate::tenant_threads::compact_loop(tenantid, conf), )?; // FIXME: if we fail to launch the GC thread, but already launched the - // checkpointer, we're in a strange state. + // compactor, we're in a strange state. thread_mgr::spawn( ThreadKind::GarbageCollector, @@ -286,7 +285,9 @@ pub fn get_timeline_for_tenant( .local_timeline() .with_context(|| format!("cannot fetch timeline {}", timelineid))?; - let page_tline = Arc::new(DatadirTimelineImpl::new(tline)); + let repartition_distance = tenant.repo.conf.checkpoint_distance / 10; + + let page_tline = Arc::new(DatadirTimelineImpl::new(tline, repartition_distance)); page_tline.init_logical_size()?; tenant.timelines.insert(timelineid, Arc::clone(&page_tline)); Ok(page_tline) diff --git a/pageserver/src/tenant_threads.rs b/pageserver/src/tenant_threads.rs index c7fe625ecf..6c2ba479db 100644 --- a/pageserver/src/tenant_threads.rs +++ b/pageserver/src/tenant_threads.rs @@ -1,44 +1,42 @@ //! This module contains functions to serve per-tenant background processes, -//! such as checkpointer and GC +//! 
such as checkpointer and GC
+//! such as compaction and GC
 
 use crate::config::PageServerConf;
 use crate::repository::Repository;
 use crate::tenant_mgr;
 use crate::tenant_mgr::TenantState;
-use crate::CheckpointConfig;
 use anyhow::Result;
 use std::time::Duration;
 use tracing::*;
 use zenith_utils::zid::ZTenantId;
 
 ///
-/// Checkpointer thread's main loop
+/// Compaction thread's main loop
 ///
-pub fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
-    if let Err(err) = checkpoint_loop_ext(tenantid, conf) {
-        error!("checkpoint loop terminated with error: {:?}", err);
+pub fn compact_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
+    if let Err(err) = compact_loop_ext(tenantid, conf) {
+        error!("compact loop terminated with error: {:?}", err);
         Err(err)
     } else {
         Ok(())
     }
 }
 
-fn checkpoint_loop_ext(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
+fn compact_loop_ext(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
     loop {
         if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
             break;
         }
 
-        std::thread::sleep(conf.checkpoint_period);
-        trace!("checkpointer thread for tenant {} waking up", tenantid);
+        std::thread::sleep(conf.compaction_period);
+        trace!("compaction thread for tenant {} waking up", tenantid);
 
-        // checkpoint timelines that have accumulated more than CHECKPOINT_DISTANCE
-        // bytes of WAL since last checkpoint.
+        // Compact timelines
         let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
-        repo.checkpoint_iteration(CheckpointConfig::Distance)?;
+        repo.compaction_iteration()?;
     }
 
     trace!(
-        "checkpointer thread stopped for tenant {} state is {:?}",
+        "compaction thread stopped for tenant {} state is {:?}",
         tenantid,
         tenant_mgr::get_tenant_state(tenantid)
     );
diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs
index ec3606b41e..46aa391241 100644
--- a/pageserver/src/thread_mgr.rs
+++ b/pageserver/src/thread_mgr.rs
@@ -92,8 +92,8 @@ pub enum ThreadKind {
     // Thread that connects to a safekeeper to fetch WAL for one timeline.
     WalReceiver,
 
-    // Thread that handles checkpointing of all timelines for a tenant.
-    Checkpointer,
+    // Thread that handles compaction of all timelines for a tenant.
+    Compactor,
 
     // Thread that handles GC of a tenant
     GarbageCollector,
diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs
index 7969df47b9..8ecb5f8e69 100644
--- a/pageserver/src/timelines.rs
+++ b/pageserver/src/timelines.rs
@@ -259,7 +259,7 @@ fn bootstrap_timeline(
     // Initdb lsn will be equal to last_record_lsn which will be set after import.
     // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline.
     let timeline = repo.create_empty_timeline(tli, lsn)?;
-    let mut page_tline: DatadirTimeline<R> = DatadirTimeline::new(timeline);
+    let mut page_tline: DatadirTimeline<R> = DatadirTimeline::new(timeline, u64::MAX);
     import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?;
 
     page_tline.tline.checkpoint(CheckpointConfig::Forced)?;

From 89690d7349d251547001fc3224fef825b98179ef Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Mon, 14 Mar 2022 14:22:04 +0200
Subject: [PATCH 36/55] Prevent compaction from running at the same time as GC.

For the same reasons as we prohibited concurrent checkpointing and GC
previously.
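
As a minimal sketch of the pattern (field and method names are the ones
from the diff below; everything else is elided):

    use std::sync::Mutex;

    struct LayeredTimeline {
        // Whoever holds this is the only compaction-or-GC pass
        // running on the timeline.
        compaction_cs: Mutex<()>,
    }

    impl LayeredTimeline {
        fn compact(&self) {
            let _guard = self.compaction_cs.lock().unwrap();
            // ... create image layers, compact level-0 delta layers ...
        }

        fn gc(&self) {
            let _guard = self.compaction_cs.lock().unwrap();
            // ... remove layer files that are no longer needed ...
        }
    }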
---
 pageserver/src/layered_repository.rs | 30 +++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs
index 2c48295ad0..526d0c952f 100644
--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
@@ -806,11 +806,11 @@ pub struct LayeredTimeline {
     /// Used to ensure that there is only one thread
     layer_flush_lock: Mutex<()>,
 
-    // Prevent concurrent checkpoints.
-    // Checkpoints are normally performed by one thread. But checkpoint can also be manually requested by admin
-    // (that's used in tests), and shutdown also forces a checkpoint. These forced checkpoints run in a different thread
-    // and could be triggered at the same time as a normal checkpoint.
-    checkpoint_cs: Mutex<()>,
+    // Prevent concurrent compactions.
+    // Compactions are normally performed by one thread. But compaction can also be manually
+    // requested by admin (that's used in tests). These forced compactions run in a different
+    // thread and could be triggered at the same time as a normal, timed compaction.
+    compaction_cs: Mutex<()>,
 
     // Needed to ensure that we can't create a branch at a point that was already garbage collected
     latest_gc_cutoff_lsn: RwLock<Lsn>,
@@ -1021,7 +1021,7 @@ impl LayeredTimeline {
 
             write_lock: Mutex::new(()),
             layer_flush_lock: Mutex::new(()),
-            checkpoint_cs: Mutex::new(()),
+            compaction_cs: Mutex::new(()),
 
             gc_info: RwLock::new(GcInfo {
                 retain_lsns: Vec::new(),
@@ -1557,6 +1557,7 @@ impl LayeredTimeline {
         // Below are functions compact_level0() and create_image_layers()
         // but they are a bit ad hoc and don't quite work like it's explained
         // above. Rewrite it.
+        let _compaction_cs = self.compaction_cs.lock().unwrap();
 
         let target_file_size = self.conf.checkpoint_distance;
 
@@ -1735,6 +1736,20 @@ impl LayeredTimeline {
             new_layers.push(writer.finish(prev_key.unwrap().next())?);
         }
 
+        // Sync layers
+        if !new_layers.is_empty() {
+            let mut layer_paths: Vec<PathBuf> = new_layers.iter().map(|l| l.path()).collect();
+
+            // also sync the directory
+            layer_paths.push(self.conf.timeline_path(&self.timelineid, &self.tenantid));
+
+            // Fsync all the layer files and directory using multiple threads to
+            // minimize latency.
+            par_fsync::par_fsync(&layer_paths)?;
+
+            layer_paths.pop().unwrap();
+        }
+
         let mut layers = self.layers.lock().unwrap();
         for l in new_layers {
             layers.insert_historic(Arc::new(l));
@@ -1783,7 +1798,8 @@ impl LayeredTimeline {
         let now = Instant::now();
         let mut result: GcResult = Default::default();
         let disk_consistent_lsn = self.get_disk_consistent_lsn();
-        let _checkpoint_cs = self.checkpoint_cs.lock().unwrap();
+
+        let _compaction_cs = self.compaction_cs.lock().unwrap();
 
         let gc_info = self.gc_info.read().unwrap();
         let retain_lsns = &gc_info.retain_lsns;

From d5a96d3d50fbc1480d29a3acbd6fe5f33da47153 Mon Sep 17 00:00:00 2001
From: Arseny Sher
Date: Mon, 14 Mar 2022 17:55:30 +0300
Subject: [PATCH 37/55] Fix finding end of WAL on safekeepers after f86cf93435133ee11.

That commit dropped wal_start_lsn; now we look from commit_lsn, which is
the real end of WAL if no records follow it.
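
A minimal sketch of the invariant this restores (illustrative only; the
function name is made up, and the real code parses WAL records rather than
taking a list of offsets):

    /// `start_offset` is derived from commit_lsn. If no valid record is
    /// found at or past it, `start_offset` itself is the end of WAL.
    fn end_of_wal(valid_record_ends: &[usize], start_offset: usize) -> usize {
        valid_record_ends
            .iter()
            .copied()
            .filter(|&off| off >= start_offset)
            .max()
            .unwrap_or(start_offset)
    }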
ref #1351
---
 postgres_ffi/src/xlog_utils.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/postgres_ffi/src/xlog_utils.rs b/postgres_ffi/src/xlog_utils.rs
index caf1940a9c..d2b2b5c122 100644
--- a/postgres_ffi/src/xlog_utils.rs
+++ b/postgres_ffi/src/xlog_utils.rs
@@ -132,6 +132,8 @@ pub fn get_current_timestamp() -> TimestampTz {
     }
 }
 
+/// Return the offset of the last valid record in segment `segno`, starting
+/// the search at `start_offset`. Returns `start_offset` if no records are found.
 fn find_end_of_wal_segment(
     data_dir: &Path,
     segno: XLogSegNo,
@@ -147,7 +149,7 @@
     let mut rec_offs: usize = 0;
     let mut buf = [0u8; XLOG_BLCKSZ];
     let file_name = XLogFileName(tli, segno, wal_seg_size);
-    let mut last_valid_rec_pos: usize = 0;
+    let mut last_valid_rec_pos: usize = start_offset; // assume a new record begins at start_offset
     let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap();
     file.seek(SeekFrom::Start(offs as u64))?;
     let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS];

From 60ed6b3710d6c2f7641e2ecab4936766fbc851d5 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Mon, 14 Mar 2022 19:53:00 +0200
Subject: [PATCH 38/55] Shave some CPU cycles from reading blobs from files.

This shows up in the 'perf' profile when running in debug mode. Not so
significant in release mode, but still.

---
 pageserver/src/layered_repository/inmemory_layer.rs | 4 +++-
 pageserver/src/layered_repository/utils.rs | 13 +++++++++----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs
index 577e562115..dc1177e76c 100644
--- a/pageserver/src/layered_repository/inmemory_layer.rs
+++ b/pageserver/src/layered_repository/inmemory_layer.rs
@@ -335,11 +335,13 @@ impl InMemoryLayer {
             self.start_lsn..inner.end_lsn.unwrap(),
         )?;
 
+        let mut buf = Vec::new();
         let mut do_steps = || -> Result<()> {
             for (key, vec_map) in inner.index.iter() {
                 // Write all page versions
                 for (lsn, pos) in vec_map.as_slice() {
-                    let val = Value::des(&utils::read_blob(&inner.file, *pos)?)?;
+                    let len = utils::read_blob_buf(&inner.file, *pos, &mut buf)?;
+                    let val = Value::des(&buf[0..len])?;
                     delta_layer_writer.put_value(*key, *lsn, val)?;
                 }
             }

diff --git a/pageserver/src/layered_repository/utils.rs b/pageserver/src/layered_repository/utils.rs
index de6303ce35..b3aa8c7ef4 100644
--- a/pageserver/src/layered_repository/utils.rs
+++ b/pageserver/src/layered_repository/utils.rs
@@ -4,17 +4,22 @@ use std::os::unix::fs::FileExt;
 
 use bookfile::BoundedReader;
 
-pub fn read_blob<F: FileExt>(file: &F, off: u64) -> Result<Vec<u8>, Error> {
+pub fn read_blob_buf<F: FileExt>(file: &F, off: u64, buf: &mut Vec<u8>) -> Result<usize, Error> {
     // read length
     let mut len_buf = [0u8; 4];
     file.read_exact_at(&mut len_buf, off)?;
-    let len = u32::from_ne_bytes(len_buf);
+    let len = u32::from_ne_bytes(len_buf) as usize;
 
-    let mut buf: Vec<u8> = Vec::new();
-    buf.resize(len as usize, 0);
+    buf.resize(len, 0);
 
     file.read_exact_at(&mut buf.as_mut_slice(), off + 4)?;
 
+    Ok(len)
+}
+
+pub fn read_blob<F: FileExt>(file: &F, off: u64) -> Result<Vec<u8>, Error> {
+    let mut buf: Vec<u8> = Vec::new();
+    read_blob_buf(file, off, &mut buf)?;
     Ok(buf)
 }

From 6a264aaca315f0ba452808eca070e1deabf82933 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Mon, 14 Mar 2022 19:54:38 +0200
Subject: [PATCH 39/55] Stopgap "fix" for test_parallel_copy failure in debug mode.
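
The gist of the stopgap, as a sketch (the actual change is duplicated in
delta_layer.rs and image_layer.rs below; `should_unload` is a name made up
for illustration):

    use rand::RngCore;

    /// Let roughly one unload() call in ten proceed, so that debug-mode
    /// runs don't spend all their time reloading layer indexes.
    fn should_unload() -> bool {
        rand::thread_rng().next_u32() <= u32::MAX / 10
    }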
---
 .../src/layered_repository/delta_layer.rs | 10 ++++++++++
 .../src/layered_repository/image_layer.rs | 17 +++++++++++++++--
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs
index 8a9c6dc34d..82dd516990 100644
--- a/pageserver/src/layered_repository/delta_layer.rs
+++ b/pageserver/src/layered_repository/delta_layer.rs
@@ -219,6 +219,16 @@ impl Layer for DeltaLayer {
     /// it will need to be loaded back.
     ///
     fn unload(&self) -> Result<()> {
+        // FIXME: In debug mode, loading and unloading the index slows
+        // things down so much that you get timeout errors. At least
+        // with the test_parallel_copy test. So as an even more ad hoc
+        // stopgap fix for that, only unload, on average, once every 10
+        // checkpoint cycles.
+        use rand::RngCore;
+        if rand::thread_rng().next_u32() > (u32::MAX / 10) {
+            return Ok(());
+        }
+
         if let Ok(mut inner) = self.inner.try_write() {
             inner.index = HashMap::default();
             inner.loaded = false;

diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs
index 550bcda8f7..2fa6bb8eee 100644
--- a/pageserver/src/layered_repository/image_layer.rs
+++ b/pageserver/src/layered_repository/image_layer.rs
@@ -170,8 +170,21 @@ impl Layer for ImageLayer {
     }
 
     fn unload(&self) -> Result<()> {
-        // TODO: unload 'segs'. Or even better, don't hold it in memory but
-        // access it directly from the file (using the buffer cache)
+        // Unload the index.
+        //
+        // TODO: we should access the index directly from pages on the disk,
+        // using the buffer cache. This load/unload mechanism is really ad hoc.
+
+        // FIXME: In debug mode, loading and unloading the index slows
+        // things down so much that you get timeout errors. At least
+        // with the test_parallel_copy test. So as an even more ad hoc
+        // stopgap fix for that, only unload, on average, once every 10
+        // checkpoint cycles.
+        use rand::RngCore;
+        if rand::thread_rng().next_u32() > (u32::MAX / 10) {
+            return Ok(());
+        }
+
         let mut inner = self.inner.lock().unwrap();
         inner.index = HashMap::default();
         inner.loaded = false;

From 9c1a9a1d9f315adac161e5490b314dde63e3e292 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas
Date: Mon, 14 Mar 2022 20:06:25 +0200
Subject: [PATCH 40/55] Update Cargo.lock for new dependencies (#1354)

Commit b2ad8342d2 added a dependency on 'criterion', which pulled along
some other crates.
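
For reference, criterion is a benchmarking harness; a minimal benchmark
looks roughly like this (illustrative only, the actual benchmarks were
added in b2ad8342d2 and are not part of this patch):

    use criterion::{criterion_group, criterion_main, Criterion};

    fn bench_example(c: &mut Criterion) {
        c.bench_function("example", |b| b.iter(|| 2 + 2));
    }

    criterion_group!(benches, bench_example);
    criterion_main!(benches);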
--- Cargo.lock | 183 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index ad38a41d91..b1ebe6c07a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -260,6 +260,18 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426" +[[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + [[package]] name = "bumpalo" version = "3.9.1" @@ -281,6 +293,15 @@ dependencies = [ "serde", ] +[[package]] +name = "cast" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a" +dependencies = [ + "rustc_version", +] + [[package]] name = "cc" version = "1.0.72" @@ -447,6 +468,76 @@ dependencies = [ "rustc_version", ] +[[package]] +name = "criterion" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10" +dependencies = [ + "atty", + "cast", + "clap 2.34.0", + "criterion-plot", + "csv", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_cbor", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e54ea8bc3fb1ee042f5aace6e3c6e025d3874866da222930f70ce62aceba0bfa" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c00d6d2ea26e8b151d99093005cb442fb9a37aeaca582a03ec70946f49ab5ed9" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "lazy_static", + "memoffset", + "scopeguard", +] + [[package]] name = "crossbeam-utils" version = "0.8.7" @@ -477,6 +568,28 @@ dependencies = [ "subtle", ] +[[package]] +name = "csv" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +dependencies = [ + "bstr", + "csv-core", + "itoa 0.4.8", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + [[package]] name = "daemonize" version = "0.4.1" @@ -1260,6 +1373,12 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" +[[package]] +name = "oorandom" 
+version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + [[package]] name = "opaque-debug" version = "0.3.0" @@ -1444,6 +1563,34 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "plotters" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c" + +[[package]] +name = "plotters-svg" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9" +dependencies = [ + "plotters-backend", +] + [[package]] name = "postgres" version = "0.19.1" @@ -1664,6 +1811,31 @@ dependencies = [ "rand_core", ] +[[package]] +name = "rayon" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +dependencies = [ + "autocfg", + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "lazy_static", + "num_cpus", +] + [[package]] name = "rcgen" version = "0.8.14" @@ -2233,6 +2405,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "tinyvec" version = "1.5.1" @@ -2855,6 +3037,7 @@ dependencies = [ "bincode", "byteorder", "bytes", + "criterion", "git-version", "hex", "hex-literal", From 7560854370854ffce33667576782b580c7875ac4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 16 Mar 2022 19:29:07 +0200 Subject: [PATCH 41/55] Rename things in KeyPartition, per Bojan's suggestions. --- pageserver/src/keyspace.rs | 25 +++++++++++++++---------- pageserver/src/layered_repository.rs | 21 +++++++++++---------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/pageserver/src/keyspace.rs b/pageserver/src/keyspace.rs index 274c858338..aeaceb37fc 100644 --- a/pageserver/src/keyspace.rs +++ b/pageserver/src/keyspace.rs @@ -8,10 +8,11 @@ pub const TARGET_FILE_SIZE_BYTES: u64 = 128 * 1024 * 1024; // 128 MB /// /// Represents a set of Keys, in a compact form. /// +#[derive(Clone, Debug)] pub struct KeySpace { // Contiguous ranges of keys that belong to the key space. In key order, and // with no overlap. - ranges: Vec>, + pub ranges: Vec>, } impl KeySpace { @@ -23,7 +24,7 @@ impl KeySpace { // Assume that each value is 8k in size. 
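+        // (Each key corresponds to one Postgres block of BLCKSZ = 8192 bytes,
+        // so the byte budget is converted into a block count below.)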
let target_nblocks = (target_size / pg_constants::BLCKSZ as u64) as usize; - let mut partitions = Vec::new(); + let mut parts = Vec::new(); let mut current_part = Vec::new(); let mut current_part_size: usize = 0; for range in &self.ranges { @@ -31,7 +32,9 @@ impl KeySpace { // partition would cause it to be too large, start a new partition. let this_size = key_range_size(range) as usize; if current_part_size + this_size > target_nblocks && !current_part.is_empty() { - partitions.push(current_part); + parts.push(KeySpace { + ranges: current_part, + }); current_part = Vec::new(); current_part_size = 0; } @@ -42,7 +45,9 @@ impl KeySpace { let mut start = range.start; while remain_size > target_nblocks { let next = start.add(target_nblocks as u32); - partitions.push(vec![start..next]); + parts.push(KeySpace { + ranges: vec![start..next], + }); start = next; remain_size -= target_nblocks } @@ -52,10 +57,12 @@ impl KeySpace { // add last partition that wasn't full yet. if !current_part.is_empty() { - partitions.push(current_part); + parts.push(KeySpace { + ranges: current_part, + }); } - KeyPartitioning { partitions } + KeyPartitioning { parts } } } @@ -68,14 +75,12 @@ impl KeySpace { /// #[derive(Clone, Debug, Default)] pub struct KeyPartitioning { - pub partitions: Vec>>, + pub parts: Vec, } impl KeyPartitioning { pub fn new() -> Self { - KeyPartitioning { - partitions: Vec::new(), - } + KeyPartitioning { parts: Vec::new() } } } diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 526d0c952f..99671c350a 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -34,7 +34,7 @@ use std::time::Instant; use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}; use crate::config::PageServerConf; -use crate::keyspace::KeyPartitioning; +use crate::keyspace::{KeyPartitioning, KeySpace}; use crate::remote_storage::{schedule_timeline_checkpoint_upload, schedule_timeline_download}; use crate::repository::{ GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncState, TimelineWriter, @@ -1569,14 +1569,14 @@ impl LayeredTimeline { // Make a copy of the partitioning, so that we can release // the lock. Otherwise we could block the WAL receiver. let lsn = *lsn; - let partitions = partitioning.partitions.clone(); + let parts = partitioning.parts.clone(); drop(partitioning_guard); // 2. Create new image layers for partitions that have been modified // "enough". - for partition in partitions.iter() { - if self.time_for_new_image_layer(partition, lsn, 3)? { - self.create_image_layer(partition, lsn)?; + for part in parts.iter() { + if self.time_for_new_image_layer(part, lsn, 3)? { + self.create_image_layer(part, lsn)?; } } timer.stop_and_record(); @@ -1604,13 +1604,13 @@ impl LayeredTimeline { // Is it time to create a new image layer for the given partition? 
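+    // (Current heuristic: create a new image once at least `threshold` delta
+    // layers have accumulated on top of the newest image layer; the caller
+    // passes threshold = 3.)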
fn time_for_new_image_layer( &self, - partition: &[Range], + partition: &KeySpace, lsn: Lsn, threshold: usize, ) -> Result { let layers = self.layers.lock().unwrap(); - for part_range in partition { + for part_range in &partition.ranges { let image_coverage = layers.image_coverage(part_range, lsn)?; for (img_range, last_img) in image_coverage { let img_lsn = if let Some(ref last_img) = last_img { @@ -1634,12 +1634,13 @@ impl LayeredTimeline { Ok(false) } - fn create_image_layer(&self, partition: &[Range], lsn: Lsn) -> Result<()> { - let img_range = partition.first().unwrap().start..partition.last().unwrap().end; + fn create_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> Result<()> { + let img_range = + partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end; let mut image_layer_writer = ImageLayerWriter::new(self.conf, self.timelineid, self.tenantid, &img_range, lsn)?; - for range in partition { + for range in &partition.ranges { let mut key = range.start; while key < range.end { let img = self.get(key, lsn)?; From 705f51db2777228e3e61db77573625cc9929585c Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Wed, 16 Mar 2022 21:20:04 +0300 Subject: [PATCH 42/55] [proxy] Propagate some errors to user (#1329) * [proxy] Propagate most errors to user This change enables propagation of most errors to the user (e.g. auth and connectivity errors). Some of them will be stripped of sensitive information. As a side effect, most occurrences of `anyhow::Error` were replaced with concrete error types. * [proxy] Box weighty errors --- Cargo.lock | 2 + proxy/Cargo.toml | 2 + proxy/src/auth.rs | 124 ++++++++++++++++++++------- proxy/src/cancellation.rs | 2 +- proxy/src/compute.rs | 61 +++++++++++-- proxy/src/config.rs | 6 +- proxy/src/cplane_api.rs | 111 ++++++++++++++++++------ proxy/src/error.rs | 17 ++++ proxy/src/http.rs | 2 +- proxy/src/main.rs | 3 +- proxy/src/mgmt.rs | 20 +++-- proxy/src/proxy.rs | 176 +++++++++++++++++++++++--------------- proxy/src/stream.rs | 84 +++++++++++++++--- proxy/src/waiters.rs | 37 ++++++-- 14 files changed, 481 insertions(+), 166 deletions(-) create mode 100644 proxy/src/error.rs diff --git a/Cargo.lock b/Cargo.lock index b1ebe6c07a..750ac0edc2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1739,6 +1739,7 @@ dependencies = [ "anyhow", "bytes", "clap 3.0.14", + "fail", "futures", "hashbrown 0.11.2", "hex", @@ -1754,6 +1755,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", + "thiserror", "tokio", "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", "tokio-postgres-rustls", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index d8d5cbe5bf..dda018a1d8 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" anyhow = "1.0" bytes = { version = "1.0.1", features = ['serde'] } clap = "3.0" +fail = "0.5.0" futures = "0.3.13" hashbrown = "0.11.2" hex = "0.4.3" @@ -21,6 +22,7 @@ rustls = "0.19.1" scopeguard = "1.1.0" serde = "1" serde_json = "1" +thiserror = "1.0" tokio = { version = "1.11", features = ["macros"] } tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" } tokio-rustls = "0.22.0" diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index a5bdaeaeca..5e6357fe80 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -1,11 +1,79 @@ use crate::compute::DatabaseInfo; use crate::config::ProxyConfig; use crate::cplane_api::{self, CPlaneApi}; +use crate::error::UserFacingError; 
use crate::stream::PqStream; -use anyhow::{anyhow, bail, Context}; +use crate::waiters; use std::collections::HashMap; +use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use zenith_utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage, FeMessage as Fe}; +use zenith_utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage}; + +/// Common authentication error. +#[derive(Debug, Error)] +pub enum AuthErrorImpl { + /// Authentication error reported by the console. + #[error(transparent)] + Console(#[from] cplane_api::AuthError), + + /// For passwords that couldn't be processed by [`parse_password`]. + #[error("Malformed password message")] + MalformedPassword, + + /// Errors produced by [`PqStream`]. + #[error(transparent)] + Io(#[from] std::io::Error), +} + +impl AuthErrorImpl { + pub fn auth_failed(msg: impl Into) -> Self { + AuthErrorImpl::Console(cplane_api::AuthError::auth_failed(msg)) + } +} + +impl From for AuthErrorImpl { + fn from(e: waiters::RegisterError) -> Self { + AuthErrorImpl::Console(cplane_api::AuthError::from(e)) + } +} + +impl From for AuthErrorImpl { + fn from(e: waiters::WaitError) -> Self { + AuthErrorImpl::Console(cplane_api::AuthError::from(e)) + } +} + +#[derive(Debug, Error)] +#[error(transparent)] +pub struct AuthError(Box); + +impl From for AuthError +where + AuthErrorImpl: From, +{ + fn from(e: T) -> Self { + AuthError(Box::new(e.into())) + } +} + +impl UserFacingError for AuthError { + fn to_string_client(&self) -> String { + use AuthErrorImpl::*; + match self.0.as_ref() { + Console(e) => e.to_string_client(), + MalformedPassword => self.to_string(), + _ => "Internal error".to_string(), + } + } +} + +#[derive(Debug, Error)] +pub enum ClientCredsParseError { + #[error("Parameter `{0}` is missing in startup packet")] + MissingKey(&'static str), +} + +impl UserFacingError for ClientCredsParseError {} /// Various client credentials which we use for authentication. #[derive(Debug, PartialEq, Eq)] @@ -15,13 +83,13 @@ pub struct ClientCredentials { } impl TryFrom> for ClientCredentials { - type Error = anyhow::Error; + type Error = ClientCredsParseError; fn try_from(mut value: HashMap) -> Result { let mut get_param = |key| { value .remove(key) - .with_context(|| format!("{} is missing in startup packet", key)) + .ok_or(ClientCredsParseError::MissingKey(key)) }; let user = get_param("user")?; @@ -37,10 +105,14 @@ impl ClientCredentials { self, config: &ProxyConfig, client: &mut PqStream, - ) -> anyhow::Result { + ) -> Result { + fail::fail_point!("proxy-authenticate", |_| { + Err(AuthError::auth_failed("failpoint triggered")) + }); + use crate::config::ClientAuthMethod::*; use crate::config::RouterConfig::*; - let db_info = match &config.router_config { + match &config.router_config { Static { host, port } => handle_static(host.clone(), *port, client, self).await, Dynamic(Mixed) => { if self.user.ends_with("@zenith") { @@ -51,9 +123,7 @@ impl ClientCredentials { } Dynamic(Password) => handle_existing_user(config, client, self).await, Dynamic(Link) => handle_new_user(config, client).await, - }; - - db_info.context("failed to authenticate client") + } } } @@ -66,18 +136,14 @@ async fn handle_static( port: u16, client: &mut PqStream, creds: ClientCredentials, -) -> anyhow::Result { +) -> Result { client .write_message(&Be::AuthenticationCleartextPassword) .await?; // Read client's password bytes - let msg = match client.read_message().await? 
{ - Fe::PasswordMessage(msg) => msg, - bad => bail!("unexpected message type: {:?}", bad), - }; - - let cleartext_password = std::str::from_utf8(&msg)?.split('\0').next().unwrap(); + let msg = client.read_password_message().await?; + let cleartext_password = parse_password(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; let db_info = DatabaseInfo { host, @@ -98,7 +164,7 @@ async fn handle_existing_user( config: &ProxyConfig, client: &mut PqStream, creds: ClientCredentials, -) -> anyhow::Result { +) -> Result { let psql_session_id = new_psql_session_id(); let md5_salt = rand::random(); @@ -107,18 +173,12 @@ async fn handle_existing_user( .await?; // Read client's password hash - let msg = match client.read_message().await? { - Fe::PasswordMessage(msg) => msg, - bad => bail!("unexpected message type: {:?}", bad), - }; + let msg = client.read_password_message().await?; + let md5_response = parse_password(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; - let (_trailing_null, md5_response) = msg - .split_last() - .ok_or_else(|| anyhow!("unexpected password message"))?; - - let cplane = CPlaneApi::new(&config.auth_endpoint); + let cplane = CPlaneApi::new(config.auth_endpoint.clone()); let db_info = cplane - .authenticate_proxy_request(creds, md5_response, &md5_salt, &psql_session_id) + .authenticate_proxy_client(creds, md5_response, &md5_salt, &psql_session_id) .await?; client @@ -131,7 +191,7 @@ async fn handle_existing_user( async fn handle_new_user( config: &ProxyConfig, client: &mut PqStream, -) -> anyhow::Result { +) -> Result { let psql_session_id = new_psql_session_id(); let greeting = hello_message(&config.redirect_uri, &psql_session_id); @@ -143,8 +203,8 @@ async fn handle_new_user( .write_message(&Be::NoticeResponse(greeting)) .await?; - // Wait for web console response - waiter.await?.map_err(|e| anyhow!(e)) + // Wait for web console response (see `mgmt`) + waiter.await?.map_err(AuthErrorImpl::auth_failed) }) .await?; @@ -153,6 +213,10 @@ async fn handle_new_user( Ok(db_info) } +fn parse_password(bytes: &[u8]) -> Option<&str> { + std::str::from_utf8(bytes).ok()?.strip_suffix('\0') +} + fn hello_message(redirect_uri: &str, session_id: &str) -> String { format!( concat![ diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index c1a7e81be9..07d3bcc71a 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -6,7 +6,7 @@ use tokio::net::TcpStream; use tokio_postgres::{CancelToken, NoTls}; use zenith_utils::pq_proto::CancelKeyData; -/// Enables serving CancelRequests. +/// Enables serving `CancelRequest`s. #[derive(Default)] pub struct CancelMap(Mutex>>); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 7c294bd488..64ce5d0a5a 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,6 +1,27 @@ -use anyhow::Context; +use crate::cancellation::CancelClosure; +use crate::error::UserFacingError; use serde::{Deserialize, Serialize}; -use std::net::{SocketAddr, ToSocketAddrs}; +use std::io; +use std::net::SocketAddr; +use thiserror::Error; +use tokio::net::TcpStream; +use tokio_postgres::NoTls; + +#[derive(Debug, Error)] +pub enum ConnectionError { + /// This error doesn't seem to reveal any secrets; for instance, + /// [`tokio_postgres::error::Kind`] doesn't contain ip addresses and such. 
+ #[error("Failed to connect to the compute node: {0}")] + Postgres(#[from] tokio_postgres::Error), + + #[error("Failed to connect to the compute node")] + FailedToConnectToCompute, + + #[error("Failed to fetch compute node version")] + FailedToFetchPgVersion, +} + +impl UserFacingError for ConnectionError {} /// Compute node connection params. #[derive(Serialize, Deserialize, Debug, Default)] @@ -12,14 +33,38 @@ pub struct DatabaseInfo { pub password: Option, } +/// PostgreSQL version as [`String`]. +pub type Version = String; + impl DatabaseInfo { - pub fn socket_addr(&self) -> anyhow::Result { + async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> { let host_port = format!("{}:{}", self.host, self.port); - host_port - .to_socket_addrs() - .with_context(|| format!("cannot resolve {} to SocketAddr", host_port))? - .next() - .context("cannot resolve at least one SocketAddr") + let socket = TcpStream::connect(host_port).await?; + let socket_addr = socket.peer_addr()?; + + Ok((socket_addr, socket)) + } + + /// Connect to a corresponding compute node. + pub async fn connect(self) -> Result<(TcpStream, Version, CancelClosure), ConnectionError> { + let (socket_addr, mut socket) = self + .connect_raw() + .await + .map_err(|_| ConnectionError::FailedToConnectToCompute)?; + + // TODO: establish a secure connection to the DB + let (client, conn) = tokio_postgres::Config::from(self) + .connect_raw(&mut socket, NoTls) + .await?; + + let version = conn + .parameter("server_version") + .ok_or(ConnectionError::FailedToFetchPgVersion)? + .into(); + + let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); + + Ok((socket, version, cancel_closure)) } } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 9ab64db795..077ff02898 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, ensure, Context}; +use anyhow::{anyhow, bail, ensure, Context}; use rustls::{internal::pemfile, NoClientAuth, ProtocolVersion, ServerConfig}; use std::net::SocketAddr; use std::str::FromStr; @@ -29,7 +29,7 @@ impl FromStr for ClientAuthMethod { "password" => Ok(Password), "link" => Ok(Link), "mixed" => Ok(Mixed), - _ => Err(anyhow::anyhow!("Invlid option for router")), + _ => bail!("Invalid option for router: `{}`", s), } } } @@ -53,7 +53,7 @@ pub struct ProxyConfig { pub redirect_uri: String, /// control plane address where we would check auth. - pub auth_endpoint: String, + pub auth_endpoint: reqwest::Url, pub tls_config: Option, } diff --git a/proxy/src/cplane_api.rs b/proxy/src/cplane_api.rs index 187809717f..21fce79df3 100644 --- a/proxy/src/cplane_api.rs +++ b/proxy/src/cplane_api.rs @@ -1,52 +1,113 @@ use crate::auth::ClientCredentials; use crate::compute::DatabaseInfo; -use crate::waiters::{Waiter, Waiters}; -use anyhow::{anyhow, bail}; +use crate::error::UserFacingError; +use crate::mgmt; +use crate::waiters::{self, Waiter, Waiters}; use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; +use thiserror::Error; lazy_static! { - static ref CPLANE_WAITERS: Waiters> = Default::default(); + static ref CPLANE_WAITERS: Waiters = Default::default(); } /// Give caller an opportunity to wait for cplane's reply. 
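+/// (A waiter registered here is woken by `notify()` below; the `mgmt`
+/// module calls `notify()` once the web console reports the compute
+/// node as ready.)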
-pub async fn with_waiter(psql_session_id: impl Into, f: F) -> anyhow::Result +pub async fn with_waiter( + psql_session_id: impl Into, + action: impl FnOnce(Waiter<'static, mgmt::ComputeReady>) -> R, +) -> Result where - F: FnOnce(Waiter<'static, Result>) -> R, - R: std::future::Future>, + R: std::future::Future>, + E: From, { let waiter = CPLANE_WAITERS.register(psql_session_id.into())?; - f(waiter).await + action(waiter).await } -pub fn notify(psql_session_id: &str, msg: Result) -> anyhow::Result<()> { +pub fn notify( + psql_session_id: &str, + msg: Result, +) -> Result<(), waiters::NotifyError> { CPLANE_WAITERS.notify(psql_session_id, msg) } /// Zenith console API wrapper. -pub struct CPlaneApi<'a> { - auth_endpoint: &'a str, +pub struct CPlaneApi { + auth_endpoint: reqwest::Url, } -impl<'a> CPlaneApi<'a> { - pub fn new(auth_endpoint: &'a str) -> Self { +impl CPlaneApi { + pub fn new(auth_endpoint: reqwest::Url) -> Self { Self { auth_endpoint } } } -impl CPlaneApi<'_> { - pub async fn authenticate_proxy_request( +#[derive(Debug, Error)] +pub enum AuthErrorImpl { + /// Authentication error reported by the console. + #[error("Authentication failed: {0}")] + AuthFailed(String), + + /// HTTP status (other than 200) returned by the console. + #[error("Console responded with an HTTP status: {0}")] + HttpStatus(reqwest::StatusCode), + + #[error("Console responded with a malformed JSON: {0}")] + MalformedResponse(#[from] serde_json::Error), + + #[error(transparent)] + Transport(#[from] reqwest::Error), + + #[error(transparent)] + WaiterRegister(#[from] waiters::RegisterError), + + #[error(transparent)] + WaiterWait(#[from] waiters::WaitError), +} + +#[derive(Debug, Error)] +#[error(transparent)] +pub struct AuthError(Box); + +impl AuthError { + /// Smart constructor for authentication error reported by `mgmt`. + pub fn auth_failed(msg: impl Into) -> Self { + AuthError(Box::new(AuthErrorImpl::AuthFailed(msg.into()))) + } +} + +impl From for AuthError +where + AuthErrorImpl: From, +{ + fn from(e: T) -> Self { + AuthError(Box::new(e.into())) + } +} + +impl UserFacingError for AuthError { + fn to_string_client(&self) -> String { + use AuthErrorImpl::*; + match self.0.as_ref() { + AuthFailed(_) | HttpStatus(_) => self.to_string(), + _ => "Internal error".to_string(), + } + } +} + +impl CPlaneApi { + pub async fn authenticate_proxy_client( &self, creds: ClientCredentials, - md5_response: &[u8], + md5_response: &str, salt: &[u8; 4], psql_session_id: &str, - ) -> anyhow::Result { - let mut url = reqwest::Url::parse(self.auth_endpoint)?; + ) -> Result { + let mut url = self.auth_endpoint.clone(); url.query_pairs_mut() .append_pair("login", &creds.user) .append_pair("database", &creds.dbname) - .append_pair("md5response", std::str::from_utf8(md5_response)?) + .append_pair("md5response", md5_response) .append_pair("salt", &hex::encode(salt)) .append_pair("psql_session_id", psql_session_id); @@ -55,18 +116,20 @@ impl CPlaneApi<'_> { // TODO: leverage `reqwest::Client` to reuse connections let resp = reqwest::get(url).await?; if !resp.status().is_success() { - bail!("Auth failed: {}", resp.status()) + return Err(AuthErrorImpl::HttpStatus(resp.status()).into()); } let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text().await?.as_str())?; println!("got auth info: #{:?}", auth_info); use ProxyAuthResponse::*; - match auth_info { - Ready { conn_info } => Ok(conn_info), - Error { error } => bail!(error), - NotReady { .. 
} => waiter.await?.map_err(|e| anyhow!(e)), - } + let db_info = match auth_info { + Ready { conn_info } => conn_info, + Error { error } => return Err(AuthErrorImpl::AuthFailed(error).into()), + NotReady { .. } => waiter.await?.map_err(AuthErrorImpl::AuthFailed)?, + }; + + Ok(db_info) }) .await } diff --git a/proxy/src/error.rs b/proxy/src/error.rs new file mode 100644 index 0000000000..e98e553f83 --- /dev/null +++ b/proxy/src/error.rs @@ -0,0 +1,17 @@ +/// Marks errors that may be safely shown to a client. +/// This trait can be seen as a specialized version of [`ToString`]. +/// +/// NOTE: This trait should not be implemented for [`anyhow::Error`], since it +/// is way too convenient and tends to proliferate all across the codebase, +/// ultimately leading to accidental leaks of sensitive data. +pub trait UserFacingError: ToString { + /// Format the error for client, stripping all sensitive info. + /// + /// Although this might be a no-op for many types, it's highly + /// recommended to override the default impl in case error type + /// contains anything sensitive: various IDs, IP addresses etc. + #[inline(always)] + fn to_string_client(&self) -> String { + self.to_string() + } +} diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 0b693d88dd..33d134678f 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -7,7 +7,7 @@ use zenith_utils::http::json::json_response; use zenith_utils::http::{RouterBuilder, RouterService}; async fn status_handler(_: Request) -> Result, ApiError> { - Ok(json_response(StatusCode::OK, "")?) + json_response(StatusCode::OK, "") } fn make_router() -> RouterBuilder { diff --git a/proxy/src/main.rs b/proxy/src/main.rs index de618ccde9..bd99d0a639 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -20,13 +20,14 @@ mod cancellation; mod compute; mod config; mod cplane_api; +mod error; mod http; mod mgmt; mod proxy; mod stream; mod waiters; -/// Flattens Result> into Result. +/// Flattens `Result>` into `Result`. async fn flatten_err( f: impl Future, JoinError>>, ) -> anyhow::Result<()> { diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index 55b49b441f..e53542dfd2 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -79,6 +79,18 @@ enum PsqlSessionResult { Failure(String), } +/// A message received by `mgmt` when a compute node is ready. +pub type ComputeReady = Result; + +impl PsqlSessionResult { + fn into_compute_ready(self) -> ComputeReady { + match self { + Self::Success(db_info) => Ok(db_info), + Self::Failure(message) => Err(message), + } + } +} + impl postgres_backend::Handler for MgmtHandler { fn process_query( &mut self, @@ -99,13 +111,7 @@ fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::R let resp: PsqlSessionResponse = serde_json::from_str(query_string)?; - use PsqlSessionResult::*; - let msg = match resp.result { - Success(db_info) => Ok(db_info), - Failure(message) => Err(message), - }; - - match cplane_api::notify(&resp.session_id, msg) { + match cplane_api::notify(&resp.session_id, resp.result.into_compute_ready()) { Ok(()) => { pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))? 
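
For orientation, and not part of the patch: the two hunks above are the two
halves of one handshake. `Waiters`, shown further below in proxy/src/waiters.rs,
is a keyed registry of `tokio::sync::oneshot` channels, and `ComputeReady` is
`Result<DatabaseInfo, String>`. A minimal self-contained analog, with `String`
standing in for `DatabaseInfo`:

    use tokio::sync::oneshot;

    type ComputeReady = Result<String, String>;

    #[tokio::main]
    async fn main() {
        let (tx, rx) = oneshot::channel::<ComputeReady>();

        // mgmt side: the console reports the compute node as ready,
        // cf. `cplane_api::notify` in try_process_query above.
        tx.send(Ok("host=127.0.0.1 port=5432".into())).unwrap();

        // proxy side: `handle_new_user` awaits the registered waiter.
        match rx.await {
            Ok(Ok(db_info)) => println!("compute ready: {}", db_info),
            Ok(Err(msg)) => println!("auth failed: {}", msg),
            Err(_) => println!("waiter channel hangup"),
        }
    }
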
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 1dc301b792..3c7f59bc26 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -1,17 +1,18 @@ use crate::auth; -use crate::cancellation::{self, CancelClosure, CancelMap}; -use crate::compute::DatabaseInfo; +use crate::cancellation::{self, CancelMap}; use crate::config::{ProxyConfig, TlsConfig}; use crate::stream::{MetricsStream, PqStream, Stream}; use anyhow::{bail, Context}; +use futures::TryFutureExt; use lazy_static::lazy_static; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::net::TcpStream; -use tokio_postgres::NoTls; use zenith_metrics::{new_common_metric_name, register_int_counter, IntCounter}; use zenith_utils::pq_proto::{BeMessage as Be, *}; +const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; +const ERR_PROTO_VIOLATION: &str = "protocol violation"; + lazy_static! { static ref NUM_CONNECTIONS_ACCEPTED_COUNTER: IntCounter = register_int_counter!( new_common_metric_name("num_connections_accepted"), @@ -30,6 +31,7 @@ lazy_static! { .unwrap(); } +/// A small combinator for pluggable error logging. async fn log_error(future: F) -> F::Output where F: std::future::Future>, @@ -76,20 +78,21 @@ async fn handle_client( } let tls = config.tls_config.clone(); - if let Some((client, creds)) = handshake(stream, tls, cancel_map).await? { - cancel_map - .with_session(|session| async { - connect_client_to_db(config, session, client, creds).await - }) - .await?; - } + let (stream, creds) = match handshake(stream, tls, cancel_map).await? { + Some(x) => x, + None => return Ok(()), // it's a cancellation request + }; - Ok(()) + let client = Client::new(stream, creds); + cancel_map + .with_session(|session| client.connect_to_db(config, session)) + .await } -/// Handle a connection from one client. -/// For better testing experience, `stream` can be -/// any object satisfying the traits. +/// Establish a (most probably, secure) connection with the client. +/// For better testing experience, `stream` can be any object satisfying the traits. +/// It's easier to work with owned `stream` here as we need to updgrade it to TLS; +/// we also take an extra care of propagating only the select handshake errors to client. async fn handshake( stream: S, mut tls: Option, @@ -119,7 +122,7 @@ async fn handshake( stream = PqStream::new(stream.into_inner().upgrade(tls).await?); } } - _ => bail!("protocol violation"), + _ => bail!(ERR_PROTO_VIOLATION), }, GssEncRequest => match stream.get_ref() { Stream::Raw { .. } if !tried_gss => { @@ -128,18 +131,21 @@ async fn handshake( // Currently, we don't support GSSAPI stream.write_message(&Be::EncryptionResponse(false)).await?; } - _ => bail!("protocol violation"), + _ => bail!(ERR_PROTO_VIOLATION), }, StartupMessage { params, .. } => { // Check that the config has been consumed during upgrade // OR we didn't provide it at all (for dev purposes). 
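                // (A leftover `tls` config at this point means the client sent
                // StartupMessage without ever upgrading via SSLRequest, so the
                // connection is still plaintext.)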
if tls.is_some() { - let msg = "connection is insecure (try using `sslmode=require`)"; - stream.write_message(&Be::ErrorResponse(msg)).await?; - bail!(msg); + stream.throw_error_str(ERR_INSECURE_CONNECTION).await?; } - break Ok(Some((stream, params.try_into()?))); + // Here and forth: `or_else` demands that we use a future here + let creds = async { params.try_into() } + .or_else(|e| stream.throw_error(e)) + .await?; + + break Ok(Some((stream, creds))); } CancelRequest(cancel_key_data) => { cancel_map.cancel_session(cancel_key_data).await?; @@ -150,58 +156,60 @@ async fn handshake( } } -async fn connect_client_to_db( - config: &ProxyConfig, - session: cancellation::Session<'_>, - mut client: PqStream, +/// Thin connection context. +struct Client { + /// The underlying libpq protocol stream. + stream: PqStream, + /// Client credentials that we care about. creds: auth::ClientCredentials, -) -> anyhow::Result<()> { - let db_info = creds.authenticate(config, &mut client).await?; - let (db, version, cancel_closure) = connect_to_db(db_info).await?; - let cancel_key_data = session.enable_cancellation(cancel_closure); - - client - .write_message_noflush(&BeMessage::ParameterStatus( - BeParameterStatusMessage::ServerVersion(&version), - ))? - .write_message_noflush(&Be::BackendKeyData(cancel_key_data))? - .write_message(&BeMessage::ReadyForQuery) - .await?; - - // This function will be called for writes to either direction. - fn inc_proxied(cnt: usize) { - // Consider inventing something more sophisticated - // if this ever becomes a bottleneck (cacheline bouncing). - NUM_BYTES_PROXIED_COUNTER.inc_by(cnt as u64); - } - - let mut db = MetricsStream::new(db, inc_proxied); - let mut client = MetricsStream::new(client.into_inner(), inc_proxied); - let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?; - - Ok(()) } -/// Connect to a corresponding compute node. -async fn connect_to_db( - db_info: DatabaseInfo, -) -> anyhow::Result<(TcpStream, String, CancelClosure)> { - // TODO: establish a secure connection to the DB - let socket_addr = db_info.socket_addr()?; - let mut socket = TcpStream::connect(socket_addr).await?; +impl Client { + /// Construct a new connection context. + fn new(stream: PqStream, creds: auth::ClientCredentials) -> Self { + Self { stream, creds } + } +} - let (client, conn) = tokio_postgres::Config::from(db_info) - .connect_raw(&mut socket, NoTls) - .await?; +impl Client { + /// Let the client authenticate and connect to the designated compute node. + async fn connect_to_db( + self, + config: &ProxyConfig, + session: cancellation::Session<'_>, + ) -> anyhow::Result<()> { + let Self { mut stream, creds } = self; - let version = conn - .parameter("server_version") - .context("failed to fetch postgres server version")? - .into(); + // Authenticate and connect to a compute node. + let auth = creds.authenticate(config, &mut stream).await; + let db_info = async { auth }.or_else(|e| stream.throw_error(e)).await?; - let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); + let (db, version, cancel_closure) = + db_info.connect().or_else(|e| stream.throw_error(e)).await?; + let cancel_key_data = session.enable_cancellation(cancel_closure); - Ok((socket, version, cancel_closure)) + stream + .write_message_noflush(&BeMessage::ParameterStatus( + BeParameterStatusMessage::ServerVersion(&version), + ))? + .write_message_noflush(&Be::BackendKeyData(cancel_key_data))? 
+ .write_message(&BeMessage::ReadyForQuery) + .await?; + + /// This function will be called for writes to either direction. + fn inc_proxied(cnt: usize) { + // Consider inventing something more sophisticated + // if this ever becomes a bottleneck (cacheline bouncing). + NUM_BYTES_PROXIED_COUNTER.inc_by(cnt as u64); + } + + // Starting from here we only proxy the client's traffic. + let mut db = MetricsStream::new(db, inc_proxied); + let mut client = MetricsStream::new(stream.into_inner(), inc_proxied); + let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?; + + Ok(()) + } } #[cfg(test)] @@ -210,7 +218,7 @@ mod tests { use tokio::io::DuplexStream; use tokio_postgres::config::SslMode; - use tokio_postgres::tls::MakeTlsConnect; + use tokio_postgres::tls::{MakeTlsConnect, NoTls}; use tokio_postgres_rustls::MakeRustlsConnect; async fn dummy_proxy( @@ -264,7 +272,7 @@ mod tests { let proxy = tokio::spawn(dummy_proxy(client, Some(server_config.into()))); - tokio_postgres::Config::new() + let client_err = tokio_postgres::Config::new() .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Disable) @@ -273,11 +281,15 @@ mod tests { .err() // -> Option .context("client shouldn't be able to connect")?; - proxy + assert!(client_err.to_string().contains(ERR_INSECURE_CONNECTION)); + + let server_err = proxy .await? .err() // -> Option .context("server shouldn't accept client")?; + assert!(client_err.to_string().contains(&server_err.to_string())); + Ok(()) } @@ -329,4 +341,30 @@ mod tests { proxy.await? } + + #[tokio::test] + async fn give_user_an_error_for_bad_creds() -> anyhow::Result<()> { + let (client, server) = tokio::io::duplex(1024); + + let proxy = tokio::spawn(dummy_proxy(client, None)); + + let client_err = tokio_postgres::Config::new() + .ssl_mode(SslMode::Disable) + .connect_raw(server, NoTls) + .await + .err() // -> Option + .context("client shouldn't be able to connect")?; + + // TODO: this is ugly, but `format!` won't allow us to extract fmt string + assert!(client_err.to_string().contains("missing in startup packet")); + + let server_err = proxy + .await? + .err() // -> Option + .context("server shouldn't accept client")?; + + assert!(client_err.to_string().contains(&server_err.to_string())); + + Ok(()) + } } diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 8fd5bef388..fb0be84584 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,10 +1,12 @@ -use anyhow::Context; +use crate::error::UserFacingError; +use anyhow::bail; use bytes::BytesMut; use pin_project_lite::pin_project; use rustls::ServerConfig; use std::pin::Pin; use std::sync::Arc; use std::{io, task}; +use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, ReadBuf}; use tokio_rustls::server::TlsStream; use zenith_utils::pq_proto::{BeMessage, FeMessage, FeStartupPacket}; @@ -35,38 +37,63 @@ impl PqStream { self.stream } - /// Get a reference to the underlying stream. + /// Get a shared reference to the underlying stream. pub fn get_ref(&self) -> &S { &self.stream } } +fn err_connection() -> io::Error { + io::Error::new(io::ErrorKind::ConnectionAborted, "connection is lost") +} + +// TODO: change error type of `FeMessage::read_fut` +fn from_anyhow(e: anyhow::Error) -> io::Error { + io::Error::new(io::ErrorKind::Other, e.to_string()) +} + impl PqStream { /// Receive [`FeStartupPacket`], which is a first packet sent by a client. - pub async fn read_startup_packet(&mut self) -> anyhow::Result { - match FeStartupPacket::read_fut(&mut self.stream).await? 
{ - Some(FeMessage::StartupPacket(packet)) => Ok(packet), - None => anyhow::bail!("connection is lost"), - other => anyhow::bail!("bad message type: {:?}", other), + pub async fn read_startup_packet(&mut self) -> io::Result { + // TODO: `FeStartupPacket::read_fut` should return `FeStartupPacket` + let msg = FeStartupPacket::read_fut(&mut self.stream) + .await + .map_err(from_anyhow)? + .ok_or_else(err_connection)?; + + match msg { + FeMessage::StartupPacket(packet) => Ok(packet), + _ => panic!("unreachable state"), } } - pub async fn read_message(&mut self) -> anyhow::Result { + pub async fn read_password_message(&mut self) -> io::Result { + match self.read_message().await? { + FeMessage::PasswordMessage(msg) => Ok(msg), + bad => Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("unexpected message type: {:?}", bad), + )), + } + } + + async fn read_message(&mut self) -> io::Result { FeMessage::read_fut(&mut self.stream) - .await? - .context("connection is lost") + .await + .map_err(from_anyhow)? + .ok_or_else(err_connection) } } impl PqStream { /// Write the message into an internal buffer, but don't flush the underlying stream. - pub fn write_message_noflush<'a>(&mut self, message: &BeMessage<'a>) -> io::Result<&mut Self> { + pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { BeMessage::write(&mut self.buffer, message)?; Ok(self) } /// Write the message into an internal buffer and flush it. - pub async fn write_message<'a>(&mut self, message: &BeMessage<'a>) -> io::Result<&mut Self> { + pub async fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { self.write_message_noflush(message)?; self.flush().await?; Ok(self) @@ -79,6 +106,25 @@ impl PqStream { self.stream.flush().await?; Ok(self) } + + /// Write the error message using [`Self::write_message`], then re-throw it. + /// Allowing string literals is safe under the assumption they might not contain any runtime info. + pub async fn throw_error_str(&mut self, error: &'static str) -> anyhow::Result { + // This method exists due to `&str` not implementing `Into` + self.write_message(&BeMessage::ErrorResponse(error)).await?; + bail!(error) + } + + /// Write the error message using [`Self::write_message`], then re-throw it. + /// Trait [`UserFacingError`] acts as an allowlist for error types. + pub async fn throw_error(&mut self, error: E) -> anyhow::Result + where + E: UserFacingError + Into, + { + let msg = error.to_string_client(); + self.write_message(&BeMessage::ErrorResponse(&msg)).await?; + bail!(error) + } } pin_project! { @@ -101,15 +147,25 @@ impl Stream { } } +#[derive(Debug, Error)] +#[error("Can't upgrade TLS stream")] +pub enum StreamUpgradeError { + #[error("Bad state reached: can't upgrade TLS stream")] + AlreadyTls, + + #[error("Can't upgrade stream: IO error: {0}")] + Io(#[from] io::Error), +} + impl Stream { /// If possible, upgrade raw stream into a secure TLS-based stream. - pub async fn upgrade(self, cfg: Arc) -> anyhow::Result { + pub async fn upgrade(self, cfg: Arc) -> Result { match self { Stream::Raw { raw } => { let tls = Box::new(tokio_rustls::TlsAcceptor::from(cfg).accept(raw).await?); Ok(Stream::Tls { tls }) } - Stream::Tls { .. } => anyhow::bail!("can't upgrade TLS stream"), + Stream::Tls { .. 
} => Err(StreamUpgradeError::AlreadyTls), } } } diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index 9fda3ed94f..799d45a165 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -1,11 +1,32 @@ -use anyhow::{anyhow, Context}; use hashbrown::HashMap; use parking_lot::Mutex; use pin_project_lite::pin_project; use std::pin::Pin; use std::task; +use thiserror::Error; use tokio::sync::oneshot; +#[derive(Debug, Error)] +pub enum RegisterError { + #[error("Waiter `{0}` already registered")] + Occupied(String), +} + +#[derive(Debug, Error)] +pub enum NotifyError { + #[error("Notify failed: waiter `{0}` not registered")] + NotFound(String), + + #[error("Notify failed: channel hangup")] + Hangup, +} + +#[derive(Debug, Error)] +pub enum WaitError { + #[error("Wait failed: channel hangup")] + Hangup, +} + pub struct Waiters(pub(self) Mutex>>); impl Default for Waiters { @@ -15,13 +36,13 @@ impl Default for Waiters { } impl Waiters { - pub fn register(&self, key: String) -> anyhow::Result> { + pub fn register(&self, key: String) -> Result, RegisterError> { let (tx, rx) = oneshot::channel(); self.0 .lock() .try_insert(key.clone(), tx) - .map_err(|_| anyhow!("waiter already registered"))?; + .map_err(|e| RegisterError::Occupied(e.entry.key().clone()))?; Ok(Waiter { receiver: rx, @@ -32,7 +53,7 @@ impl Waiters { }) } - pub fn notify(&self, key: &str, value: T) -> anyhow::Result<()> + pub fn notify(&self, key: &str, value: T) -> Result<(), NotifyError> where T: Send + Sync, { @@ -40,9 +61,9 @@ impl Waiters { .0 .lock() .remove(key) - .with_context(|| format!("key {} not found", key))?; + .ok_or_else(|| NotifyError::NotFound(key.to_string()))?; - tx.send(value).map_err(|_| anyhow!("waiter channel hangup")) + tx.send(value).map_err(|_| NotifyError::Hangup) } } @@ -66,13 +87,13 @@ pin_project! 
{ } impl std::future::Future for Waiter<'_, T> { - type Output = anyhow::Result; + type Output = Result; fn poll(self: Pin<&mut Self>, cx: &mut task::Context<'_>) -> task::Poll { self.project() .receiver .poll(cx) - .map_err(|_| anyhow!("channel hangup")) + .map_err(|_| WaitError::Hangup) } } From 15a2a2bf0446653e6a737b932b3c11f616ec20ec Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Wed, 16 Mar 2022 23:00:01 +0300 Subject: [PATCH 43/55] release 2202-03-16 (#1373) production deploy --- .circleci/config.yml | 30 +- .github/workflows/benchmarking.yml | 2 +- Cargo.lock | 186 ++++++ Dockerfile | 86 +-- Dockerfile.build | 33 +- README.md | 18 +- compute_tools/src/pg_helpers.rs | 2 +- compute_tools/src/spec.rs | 2 +- control_plane/Cargo.toml | 1 + control_plane/safekeepers.conf | 6 +- control_plane/simple.conf | 2 +- control_plane/src/compute.rs | 64 +- control_plane/src/local_env.rs | 191 ++++-- control_plane/src/safekeeper.rs | 38 +- control_plane/src/storage.rs | 123 ++-- docker-entrypoint.sh | 2 +- docs/docker.md | 24 +- docs/rfcs/002-storage.md | 186 ++++++ docs/rfcs/003-laptop-cli.md | 267 ++++++++ docs/rfcs/004-durability.md | 218 +++++++ docs/rfcs/005-zenith_local.md | 103 +++ docs/rfcs/006-laptop-cli-v2-CLI.md | 64 ++ .../006-laptop-cli-v2-repository-structure.md | 140 ++++ docs/rfcs/007-serverless-on-laptop.md | 93 +++ docs/rfcs/008-push-pull.md | 66 ++ docs/rfcs/009-snapshot-first-storage-cli.md | 56 ++ docs/rfcs/009-snapshot-first-storage-pitr.md | 227 +++++++ docs/rfcs/009-snapshot-first-storage.md | 148 +++++ docs/rfcs/010-storage_details.md | 144 ++++ docs/rfcs/011-retention-policy.md | 91 +++ docs/rfcs/012-background-tasks.md | 38 ++ docs/rfcs/013-term-history.md | 147 +++++ docs/rfcs/README.md | 95 +++ docs/rfcs/images/storage.jpeg | Bin 0 -> 431075 bytes pageserver/src/bin/pageserver.rs | 44 +- pageserver/src/branches.rs | 428 ------------ pageserver/src/config.rs | 301 +++++++-- pageserver/src/http/models.rs | 125 +++- pageserver/src/http/openapi_spec.yml | 221 ++----- pageserver/src/http/routes.rs | 237 +++---- pageserver/src/layered_repository.rs | 147 +++-- .../src/layered_repository/inmemory_layer.rs | 9 +- pageserver/src/lib.rs | 2 +- pageserver/src/page_service.rs | 10 +- pageserver/src/remote_storage/README.md | 8 - pageserver/src/remote_storage/storage_sync.rs | 62 +- .../remote_storage/storage_sync/download.rs | 100 +-- .../src/remote_storage/storage_sync/index.rs | 37 +- .../src/remote_storage/storage_sync/upload.rs | 94 +-- pageserver/src/repository.rs | 23 +- pageserver/src/tenant_mgr.rs | 39 +- pageserver/src/timelines.rs | 408 ++++++++++++ pageserver/src/walrecord.rs | 12 +- postgres_ffi/src/xlog_utils.rs | 4 +- proxy/Cargo.toml | 2 + proxy/src/auth.rs | 124 +++- proxy/src/cancellation.rs | 2 +- proxy/src/compute.rs | 61 +- proxy/src/config.rs | 6 +- proxy/src/cplane_api.rs | 111 +++- proxy/src/error.rs | 17 + proxy/src/http.rs | 2 +- proxy/src/main.rs | 5 +- proxy/src/mgmt.rs | 20 +- proxy/src/proxy.rs | 176 +++-- proxy/src/stream.rs | 84 ++- proxy/src/waiters.rs | 37 +- test_runner/README.md | 2 +- test_runner/batch_others/test_auth.py | 27 +- test_runner/batch_others/test_backpressure.py | 4 +- .../batch_others/test_branch_behind.py | 33 +- .../batch_others/test_clog_truncate.py | 8 +- test_runner/batch_others/test_createdropdb.py | 16 +- test_runner/batch_others/test_createuser.py | 6 +- test_runner/batch_others/test_multixact.py | 4 +- test_runner/batch_others/test_next_xid.py | 2 +- .../batch_others/test_pageserver_api.py | 43 +- 
.../batch_others/test_pageserver_catchup.py | 4 +- .../batch_others/test_pageserver_restart.py | 4 +- .../batch_others/test_parallel_copy.py | 2 - test_runner/batch_others/test_proxy.py | 13 + .../batch_others/test_readonly_node.py | 23 +- .../batch_others/test_remote_storage.py | 8 +- .../batch_others/test_restart_compute.py | 5 +- .../batch_others/test_tenant_relocation.py | 14 +- test_runner/batch_others/test_tenants.py | 26 +- .../batch_others/test_timeline_size.py | 17 +- test_runner/batch_others/test_twophase.py | 1 - test_runner/batch_others/test_wal_acceptor.py | 116 ++-- .../batch_others/test_wal_acceptor_async.py | 4 +- test_runner/batch_others/test_zenith_cli.py | 65 +- test_runner/fixtures/compare_fixtures.py | 5 +- test_runner/fixtures/zenith_fixtures.py | 434 ++++++++----- .../performance/test_bulk_tenant_create.py | 15 +- vendor/postgres | 2 +- walkeeper/src/bin/safekeeper.rs | 88 ++- walkeeper/src/control_file.rs | 104 ++- walkeeper/src/control_file_upgrade.rs | 82 ++- walkeeper/src/handler.rs | 38 +- walkeeper/src/http/mod.rs | 1 + walkeeper/src/http/models.rs | 9 + walkeeper/src/http/routes.rs | 44 +- walkeeper/src/lib.rs | 4 +- walkeeper/src/safekeeper.rs | 168 +++-- walkeeper/src/timeline.rs | 128 ++-- walkeeper/src/wal_storage.rs | 23 +- zenith/src/main.rs | 614 ++++++++++++------ zenith_utils/Cargo.toml | 5 + zenith_utils/benches/benchmarks.rs | 22 + zenith_utils/src/auth.rs | 44 +- zenith_utils/src/zid.rs | 229 +++++-- 111 files changed, 5846 insertions(+), 2511 deletions(-) create mode 100644 docs/rfcs/002-storage.md create mode 100644 docs/rfcs/003-laptop-cli.md create mode 100644 docs/rfcs/004-durability.md create mode 100644 docs/rfcs/005-zenith_local.md create mode 100644 docs/rfcs/006-laptop-cli-v2-CLI.md create mode 100644 docs/rfcs/006-laptop-cli-v2-repository-structure.md create mode 100644 docs/rfcs/007-serverless-on-laptop.md create mode 100644 docs/rfcs/008-push-pull.md create mode 100644 docs/rfcs/009-snapshot-first-storage-cli.md create mode 100644 docs/rfcs/009-snapshot-first-storage-pitr.md create mode 100644 docs/rfcs/009-snapshot-first-storage.md create mode 100644 docs/rfcs/010-storage_details.md create mode 100644 docs/rfcs/011-retention-policy.md create mode 100644 docs/rfcs/012-background-tasks.md create mode 100644 docs/rfcs/013-term-history.md create mode 100644 docs/rfcs/README.md create mode 100644 docs/rfcs/images/storage.jpeg delete mode 100644 pageserver/src/branches.rs create mode 100644 pageserver/src/timelines.rs create mode 100644 proxy/src/error.rs create mode 100644 walkeeper/src/http/models.rs create mode 100644 zenith_utils/benches/benchmarks.rs diff --git a/.circleci/config.yml b/.circleci/config.yml index db9fc31334..d342e7c9f4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -440,8 +440,14 @@ jobs: command: | echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin DOCKER_TAG=$(git log --oneline|wc -l) - docker build --build-arg GIT_VERSION=$CIRCLE_SHA1 -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest - docker tag zenithdb/zenith:latest zenithdb/zenith:${DOCKER_TAG} && docker push zenithdb/zenith:${DOCKER_TAG} + docker build \ + --pull \ + --build-arg GIT_VERSION=${CIRCLE_SHA1} \ + --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ + --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ + --tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:latest . 
+ docker push zenithdb/zenith:${DOCKER_TAG} + docker push zenithdb/zenith:latest # Build zenithdb/compute-node:latest image and push it to Docker hub docker-image-compute: @@ -468,8 +474,9 @@ jobs: command: | echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin DOCKER_TAG=$(git log --oneline|wc -l) - docker build -t zenithdb/compute-node:latest vendor/postgres && docker push zenithdb/compute-node:latest - docker tag zenithdb/compute-node:latest zenithdb/compute-node:${DOCKER_TAG} && docker push zenithdb/compute-node:${DOCKER_TAG} + docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:latest vendor/postgres + docker push zenithdb/compute-node:${DOCKER_TAG} + docker push zenithdb/compute-node:latest # Build production zenithdb/zenith:release image and push it to Docker hub docker-image-release: @@ -487,8 +494,14 @@ jobs: command: | echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin DOCKER_TAG="release-$(git log --oneline|wc -l)" - docker build --build-arg GIT_VERSION=$CIRCLE_SHA1 -t zenithdb/zenith:release . && docker push zenithdb/zenith:release - docker tag zenithdb/zenith:release zenithdb/zenith:${DOCKER_TAG} && docker push zenithdb/zenith:${DOCKER_TAG} + docker build \ + --pull \ + --build-arg GIT_VERSION=${CIRCLE_SHA1} \ + --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ + --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ + --tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:release . + docker push zenithdb/zenith:${DOCKER_TAG} + docker push zenithdb/zenith:release # Build production zenithdb/compute-node:release image and push it to Docker hub docker-image-compute-release: @@ -515,8 +528,9 @@ jobs: command: | echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin DOCKER_TAG="release-$(git log --oneline|wc -l)" - docker build -t zenithdb/compute-node:release vendor/postgres && docker push zenithdb/compute-node:release - docker tag zenithdb/compute-node:release zenithdb/compute-node:${DOCKER_TAG} && docker push zenithdb/compute-node:${DOCKER_TAG} + docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:release vendor/postgres + docker push zenithdb/compute-node:${DOCKER_TAG} + docker push zenithdb/compute-node:release deploy-staging: docker: diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index dd23440afb..36df35297d 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -48,7 +48,7 @@ jobs: echo Python python3 --version poetry run python3 --version - echo Pipenv + echo Poetry poetry --version echo Pgbench $PG_BIN/pgbench --version diff --git a/Cargo.lock b/Cargo.lock index ba3c6729d6..750ac0edc2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -260,6 +260,18 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5988cb1d626264ac94100be357308f29ff7cbdd3b36bda27f450a4ee3f713426" +[[package]] +name = "bstr" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" +dependencies = [ + "lazy_static", + "memchr", + "regex-automata", + "serde", +] + [[package]] name = "bumpalo" version = "3.9.1" @@ -281,6 +293,15 @@ dependencies = [ "serde", ] +[[package]] +name = "cast" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a" 
+dependencies = [ + "rustc_version", +] + [[package]] name = "cc" version = "1.0.72" @@ -424,6 +445,7 @@ dependencies = [ "thiserror", "toml", "url", + "walkeeper", "workspace_hack", "zenith_utils", ] @@ -446,6 +468,76 @@ dependencies = [ "rustc_version", ] +[[package]] +name = "criterion" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1604dafd25fba2fe2d5895a9da139f8dc9b319a5fe5354ca137cbbce4e178d10" +dependencies = [ + "atty", + "cast", + "clap 2.34.0", + "criterion-plot", + "csv", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_cbor", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e54ea8bc3fb1ee042f5aace6e3c6e025d3874866da222930f70ce62aceba0bfa" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c00d6d2ea26e8b151d99093005cb442fb9a37aeaca582a03ec70946f49ab5ed9" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "lazy_static", + "memoffset", + "scopeguard", +] + [[package]] name = "crossbeam-utils" version = "0.8.7" @@ -476,6 +568,28 @@ dependencies = [ "subtle", ] +[[package]] +name = "csv" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" +dependencies = [ + "bstr", + "csv-core", + "itoa 0.4.8", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" +dependencies = [ + "memchr", +] + [[package]] name = "daemonize" version = "0.4.1" @@ -1259,6 +1373,12 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da32515d9f6e6e489d7bc9d84c71b060db7247dc035bbe44eac88cf87486d8d5" +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + [[package]] name = "opaque-debug" version = "0.3.0" @@ -1443,6 +1563,34 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "plotters" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c" + +[[package]] +name = "plotters-svg" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9" +dependencies = [ + "plotters-backend", +] + [[package]] name = "postgres" version = "0.19.1" @@ -1591,6 +1739,7 @@ dependencies = [ "anyhow", "bytes", "clap 3.0.14", + "fail", "futures", "hashbrown 0.11.2", "hex", @@ -1606,6 +1755,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", + "thiserror", "tokio", "tokio-postgres 0.7.1 (git+https://github.com/zenithdb/rust-postgres.git?rev=2949d98df52587d562986aad155dd4e889e408b7)", "tokio-postgres-rustls", @@ -1663,6 +1813,31 @@ dependencies = [ "rand_core", ] +[[package]] +name = "rayon" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" +dependencies = [ + "autocfg", + "crossbeam-deque", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "lazy_static", + "num_cpus", +] + [[package]] name = "rcgen" version = "0.8.14" @@ -2232,6 +2407,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "tinyvec" version = "1.5.1" @@ -2854,6 +3039,7 @@ dependencies = [ "bincode", "byteorder", "bytes", + "criterion", "git-version", "hex", "hex-literal", diff --git a/Dockerfile b/Dockerfile index dd0dba60ca..9ee6abaa8a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,62 +1,62 @@ +# Build Postgres # -# Docker image for console integration testing. -# +#FROM zimg/rust:1.56 AS pg-build +FROM zenithdb/build:buster-20220309 AS pg-build +WORKDIR /pg + +USER root + +COPY vendor/postgres vendor/postgres +COPY Makefile Makefile -# -# Build Postgres separately --- this layer will be rebuilt only if one of -# mentioned paths will get any changes. -# -FROM zenithdb/build:buster AS pg-build -WORKDIR /zenith -COPY ./vendor/postgres vendor/postgres -COPY ./Makefile Makefile ENV BUILD_TYPE release -RUN make -j $(getconf _NPROCESSORS_ONLN) -s postgres -RUN rm -rf postgres_install/build +RUN set -e \ + && make -j $(nproc) -s postgres \ + && rm -rf tmp_install/build \ + && tar -C tmp_install -czf /postgres_install.tar.gz . -# # Build zenith binaries # -# TODO: build cargo deps as separate layer. We used cargo-chef before but that was -# net time waste in a lot of cases. Copying Cargo.lock with empty lib.rs should do the work. 
-# -FROM zenithdb/build:buster AS build +#FROM zimg/rust:1.56 AS build +FROM zenithdb/build:buster-20220309 AS build +ARG GIT_VERSION=local -ARG GIT_VERSION -RUN if [ -z "$GIT_VERSION" ]; then echo "GIT_VERSION is reqired, use build_arg to pass it"; exit 1; fi - -WORKDIR /zenith -COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server +ARG CACHEPOT_BUCKET=zenith-rust-cachepot +ARG AWS_ACCESS_KEY_ID +ARG AWS_SECRET_ACCESS_KEY +#ENV RUSTC_WRAPPER cachepot +ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot +COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/include/postgresql/server COPY . . -RUN GIT_VERSION=$GIT_VERSION cargo build --release +RUN cargo build --release + +# Build final image # -# Copy binaries to resulting image. -# -FROM debian:buster-slim +FROM debian:bullseye-slim WORKDIR /data -RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl ca-certificates && \ - mkdir zenith_install +RUN set -e \ + && apt-get update \ + && apt-get install -y \ + libreadline-dev \ + libseccomp-dev \ + openssl \ + ca-certificates \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ + && useradd -d /data zenith \ + && chown -R zenith:zenith /data + +COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/pageserver /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/safekeeper /usr/local/bin +COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/proxy /usr/local/bin + +COPY --from=pg-build /pg/tmp_install/ /usr/local/ +COPY --from=pg-build /postgres_install.tar.gz /data/ -COPY --from=build /zenith/target/release/pageserver /usr/local/bin -COPY --from=build /zenith/target/release/safekeeper /usr/local/bin -COPY --from=build /zenith/target/release/proxy /usr/local/bin -COPY --from=pg-build /zenith/tmp_install postgres_install COPY docker-entrypoint.sh /docker-entrypoint.sh -# Remove build artifacts (~ 500 MB) -RUN rm -rf postgres_install/build && \ - # 'Install' Postgres binaries locally - cp -r postgres_install/* /usr/local/ && \ - # Prepare an archive of Postgres binaries (should be around 11 MB) - # and keep it inside container for an ease of deploy pipeline. - cd postgres_install && tar -czf /data/postgres_install.tar.gz . && cd .. && \ - rm -rf postgres_install - -RUN useradd -d /data zenith && chown -R zenith:zenith /data - VOLUME ["/data"] USER zenith EXPOSE 6400 diff --git a/Dockerfile.build b/Dockerfile.build index a9fd2cb0af..44a2aaafb9 100644 --- a/Dockerfile.build +++ b/Dockerfile.build @@ -1,16 +1,23 @@ -# -# Image with all the required dependencies to build https://github.com/zenithdb/zenith -# and Postgres from https://github.com/zenithdb/postgres -# Also includes some rust development and build tools. 
-# NB: keep in sync with rust image version in .circle/config.yml -# FROM rust:1.56.1-slim-buster -WORKDIR /zenith +WORKDIR /home/circleci/project -# Install postgres and zenith build dependencies -# clang is for rocksdb -RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libseccomp-dev pkg-config libssl-dev clang +RUN set -e \ + && apt-get update \ + && apt-get -yq install \ + automake \ + libtool \ + build-essential \ + bison \ + flex \ + libreadline-dev \ + zlib1g-dev \ + libxml2-dev \ + libseccomp-dev \ + pkg-config \ + libssl-dev \ + clang -# Install rust tools -RUN rustup component add clippy && cargo install cargo-audit +RUN set -e \ + && rustup component add clippy \ + && cargo install cargo-audit \ + && cargo install --git https://github.com/paritytech/cachepot diff --git a/README.md b/README.md index 8dd407f41a..c8acf526b9 100644 --- a/README.md +++ b/README.md @@ -57,12 +57,12 @@ pageserver init succeeded Starting pageserver at 'localhost:64000' in '.zenith' Pageserver started initializing for single for 7676 -Starting safekeeper at 'localhost:5454' in '.zenith/safekeepers/single' +Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/single' Safekeeper started # start postgres compute node > ./target/debug/zenith pg start main -Starting new postgres main on main... +Starting new postgres main on timeline 5b014a9e41b4b63ce1a1febc04503636 ... Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432 Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres' waiting for server to start.... done @@ -70,8 +70,8 @@ server started # check list of running postgres instances > ./target/debug/zenith pg list -BRANCH ADDRESS LSN STATUS -main 127.0.0.1:55432 0/1609610 running +NODE ADDRESS TIMELINES BRANCH NAME LSN STATUS +main 127.0.0.1:55432 5b014a9e41b4b63ce1a1febc04503636 main 0/1609610 running ``` 4. Now it is possible to connect to postgres and run some queries: @@ -91,13 +91,13 @@ postgres=# select * from t; 5. And create branches and run postgres on them: ```sh # create branch named migration_check -> ./target/debug/zenith branch migration_check main -Created branch 'migration_check' at 0/1609610 +> ./target/debug/zenith timeline branch --branch-name migration_check +Created timeline '0e9331cad6efbafe6a88dd73ae21a5c9' at Lsn 0/16F5830 for tenant: c03ba6b7ad4c5e9cf556f059ade44229. Ancestor timeline: 'main' # check branches tree -> ./target/debug/zenith branch - main - ┗━ @0/1609610: migration_check +> ./target/debug/zenith timeline list + main [5b014a9e41b4b63ce1a1febc04503636] + ┗━ @0/1609610: migration_check [0e9331cad6efbafe6a88dd73ae21a5c9] # start postgres on that branch > ./target/debug/zenith pg start migration_check diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 8b6dc04069..6a22b865fa 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -171,7 +171,7 @@ impl PgQuote for PgIdent { /// always quotes provided string with `""` and escapes every `"`. Not idempotent, /// i.e. if string is already escaped it will be escaped again. 
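    /// For example, `my "name"` becomes `"my ""name"""`.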
fn quote(&self) -> String { - let result = format!("\"{}\"", self.replace("\"", "\"\"")); + let result = format!("\"{}\"", self.replace('"', "\"\"")); result } } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 41e4174bf0..1dd7c0044e 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -215,7 +215,7 @@ pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> { if let Some(r) = pg_db { // XXX: db owner name is returned as quoted string from Postgres, // when quoting is needed. - let new_owner = if r.owner.starts_with('\"') { + let new_owner = if r.owner.starts_with('"') { db.owner.quote() } else { db.owner.clone() diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 5e972200c2..eff6b3ef2d 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -17,5 +17,6 @@ url = "2.2.2" reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } pageserver = { path = "../pageserver" } +walkeeper = { path = "../walkeeper" } zenith_utils = { path = "../zenith_utils" } workspace_hack = { path = "../workspace_hack" } diff --git a/control_plane/safekeepers.conf b/control_plane/safekeepers.conf index 828d5a5a1e..df7dd2adca 100644 --- a/control_plane/safekeepers.conf +++ b/control_plane/safekeepers.conf @@ -5,16 +5,16 @@ listen_http_addr = '127.0.0.1:9898' auth_type = 'Trust' [[safekeepers]] -name = 'sk1' +id = 1 pg_port = 5454 http_port = 7676 [[safekeepers]] -name = 'sk2' +id = 2 pg_port = 5455 http_port = 7677 [[safekeepers]] -name = 'sk3' +id = 3 pg_port = 5456 http_port = 7678 diff --git a/control_plane/simple.conf b/control_plane/simple.conf index 796c6adbd9..2243a0a5f8 100644 --- a/control_plane/simple.conf +++ b/control_plane/simple.conf @@ -6,6 +6,6 @@ listen_http_addr = '127.0.0.1:9898' auth_type = 'Trust' [[safekeepers]] -name = 'single' +id = 1 pg_port = 5454 http_port = 7676 diff --git a/control_plane/src/compute.rs b/control_plane/src/compute.rs index a61191e7a4..64cd46fef6 100644 --- a/control_plane/src/compute.rs +++ b/control_plane/src/compute.rs @@ -37,7 +37,7 @@ impl ComputeControlPlane { // pgdatadirs // |- tenants // | |- - // | | |- + // | | |- pub fn load(env: LocalEnv) -> Result { let pageserver = Arc::new(PageServerNode::from_env(&env)); @@ -52,7 +52,7 @@ impl ComputeControlPlane { .with_context(|| format!("failed to list {}", tenant_dir.path().display()))? { let node = PostgresNode::from_dir_entry(timeline_dir?, &env, &pageserver)?; - nodes.insert((node.tenantid, node.name.clone()), Arc::new(node)); + nodes.insert((node.tenant_id, node.name.clone()), Arc::new(node)); } } @@ -73,40 +73,14 @@ impl ComputeControlPlane { .unwrap_or(self.base_port) } - // FIXME: see also parse_point_in_time in branches.rs. - fn parse_point_in_time( - &self, - tenantid: ZTenantId, - s: &str, - ) -> Result<(ZTimelineId, Option)> { - let mut strings = s.split('@'); - let name = strings.next().unwrap(); - - let lsn = strings - .next() - .map(Lsn::from_str) - .transpose() - .context("invalid LSN in point-in-time specification")?; - - // Resolve the timeline ID, given the human-readable branch name - let timeline_id = self - .pageserver - .branch_get_by_name(&tenantid, name)? 
- .timeline_id; - - Ok((timeline_id, lsn)) - } - pub fn new_node( &mut self, - tenantid: ZTenantId, + tenant_id: ZTenantId, name: &str, - timeline_spec: &str, + timeline_id: ZTimelineId, + lsn: Option, port: Option, ) -> Result> { - // Resolve the human-readable timeline spec into timeline ID and LSN - let (timelineid, lsn) = self.parse_point_in_time(tenantid, timeline_spec)?; - let port = port.unwrap_or_else(|| self.get_port()); let node = Arc::new(PostgresNode { name: name.to_owned(), @@ -114,9 +88,9 @@ impl ComputeControlPlane { env: self.env.clone(), pageserver: Arc::clone(&self.pageserver), is_test: false, - timelineid, + timeline_id, lsn, - tenantid, + tenant_id, uses_wal_proposer: false, }); @@ -124,7 +98,7 @@ impl ComputeControlPlane { node.setup_pg_conf(self.env.pageserver.auth_type)?; self.nodes - .insert((tenantid, node.name.clone()), Arc::clone(&node)); + .insert((tenant_id, node.name.clone()), Arc::clone(&node)); Ok(node) } @@ -139,9 +113,9 @@ pub struct PostgresNode { pub env: LocalEnv, pageserver: Arc, is_test: bool, - pub timelineid: ZTimelineId, + pub timeline_id: ZTimelineId, pub lsn: Option, // if it's a read-only node. None for primary - pub tenantid: ZTenantId, + pub tenant_id: ZTenantId, uses_wal_proposer: bool, } @@ -173,8 +147,8 @@ impl PostgresNode { // Read a few options from the config file let context = format!("in config file {}", cfg_path_str); let port: u16 = conf.parse_field("port", &context)?; - let timelineid: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?; - let tenantid: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?; + let timeline_id: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?; + let tenant_id: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?; let uses_wal_proposer = conf.get("wal_acceptors").is_some(); // parse recovery_target_lsn, if any @@ -188,9 +162,9 @@ impl PostgresNode { env: env.clone(), pageserver: Arc::clone(pageserver), is_test: false, - timelineid, + timeline_id, lsn: recovery_target_lsn, - tenantid, + tenant_id, uses_wal_proposer, }) } @@ -241,9 +215,9 @@ impl PostgresNode { ); let sql = if let Some(lsn) = lsn { - format!("basebackup {} {} {}", self.tenantid, self.timelineid, lsn) + format!("basebackup {} {} {}", self.tenant_id, self.timeline_id, lsn) } else { - format!("basebackup {} {}", self.tenantid, self.timelineid) + format!("basebackup {} {}", self.tenant_id, self.timeline_id) }; let mut client = self @@ -329,8 +303,8 @@ impl PostgresNode { conf.append("shared_preload_libraries", "zenith"); conf.append_line(""); conf.append("zenith.page_server_connstring", &pageserver_connstr); - conf.append("zenith.zenith_tenant", &self.tenantid.to_string()); - conf.append("zenith.zenith_timeline", &self.timelineid.to_string()); + conf.append("zenith.zenith_tenant", &self.tenant_id.to_string()); + conf.append("zenith.zenith_timeline", &self.timeline_id.to_string()); if let Some(lsn) = self.lsn { conf.append("recovery_target_lsn", &lsn.to_string()); } @@ -408,7 +382,7 @@ impl PostgresNode { } pub fn pgdata(&self) -> PathBuf { - self.env.pg_data_dir(&self.tenantid, &self.name) + self.env.pg_data_dir(&self.tenant_id, &self.name) } pub fn status(&self) -> &str { diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index b80e137cb9..2a1d51fe08 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -3,16 +3,20 @@ //! Now it also provides init method which acts like a stub for proper installation //! 
script which will use local paths. -use anyhow::{bail, Context}; +use anyhow::{bail, ensure, Context}; use serde::{Deserialize, Serialize}; +use std::collections::HashMap; use std::env; -use std::fmt::Write; use std::fs; use std::path::{Path, PathBuf}; use std::process::{Command, Stdio}; use zenith_utils::auth::{encode_from_key_file, Claims, Scope}; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{opt_display_serde, ZTenantId}; +use zenith_utils::zid::{ + HexZTenantId, HexZTimelineId, ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId, +}; + +use crate::safekeeper::SafekeeperNode; // // This data structures represents zenith CLI config @@ -21,7 +25,7 @@ use zenith_utils::zid::{opt_display_serde, ZTenantId}; // to 'zenith init --config=' option. See control_plane/simple.conf for // an example. // -#[derive(Serialize, Deserialize, Clone, Debug)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] pub struct LocalEnv { // Base directory for all the nodes (the pageserver, safekeepers and // compute nodes). @@ -45,9 +49,8 @@ pub struct LocalEnv { // Default tenant ID to use with the 'zenith' command line utility, when // --tenantid is not explicitly specified. - #[serde(with = "opt_display_serde")] #[serde(default)] - pub default_tenantid: Option, + pub default_tenant_id: Option, // used to issue tokens during e.g pg start #[serde(default)] @@ -57,11 +60,20 @@ pub struct LocalEnv { #[serde(default)] pub safekeepers: Vec, + + /// Keep human-readable aliases in memory (and persist them to config), to hide ZId hex strings from the user. + #[serde(default)] + // A `HashMap>` would be more appropriate here, + // but deserialization into a generic toml object as `toml::Value::try_from` fails with an error. + // https://toml.io/en/v1.0.0 does not contain a concept of "a table inside another table". 
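+    // Hence the flat list of `(tenant, timeline)` pairs per branch name below.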
+ branch_name_mappings: HashMap>, } -#[derive(Serialize, Deserialize, Clone, Debug)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct PageServerConf { + // node id + pub id: ZNodeId, // Pageserver connection settings pub listen_pg_addr: String, pub listen_http_addr: String, @@ -76,6 +88,7 @@ pub struct PageServerConf { impl Default for PageServerConf { fn default() -> Self { Self { + id: ZNodeId(0), listen_pg_addr: String::new(), listen_http_addr: String::new(), auth_type: AuthType::Trust, @@ -84,10 +97,10 @@ impl Default for PageServerConf { } } -#[derive(Serialize, Deserialize, Clone, Debug)] +#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] pub struct SafekeeperConf { - pub name: String, + pub id: ZNodeId, pub pg_port: u16, pub http_port: u16, pub sync: bool, @@ -96,7 +109,7 @@ pub struct SafekeeperConf { impl Default for SafekeeperConf { fn default() -> Self { Self { - name: String::new(), + id: ZNodeId(0), pg_port: 0, http_port: 0, sync: true, @@ -136,8 +149,74 @@ impl LocalEnv { self.base_data_dir.clone() } - pub fn safekeeper_data_dir(&self, node_name: &str) -> PathBuf { - self.base_data_dir.join("safekeepers").join(node_name) + pub fn safekeeper_data_dir(&self, data_dir_name: &str) -> PathBuf { + self.base_data_dir.join("safekeepers").join(data_dir_name) + } + + pub fn register_branch_mapping( + &mut self, + branch_name: String, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + ) -> anyhow::Result<()> { + let existing_values = self + .branch_name_mappings + .entry(branch_name.clone()) + .or_default(); + + let tenant_id = HexZTenantId::from(tenant_id); + let timeline_id = HexZTimelineId::from(timeline_id); + + let existing_ids = existing_values + .iter() + .find(|(existing_tenant_id, _)| existing_tenant_id == &tenant_id); + + if let Some((_, old_timeline_id)) = existing_ids { + if old_timeline_id == &timeline_id { + Ok(()) + } else { + bail!( + "branch '{}' is already mapped to timeline {}, cannot map to another timeline {}", + branch_name, + old_timeline_id, + timeline_id + ); + } + } else { + existing_values.push((tenant_id, timeline_id)); + Ok(()) + } + } + + pub fn get_branch_timeline_id( + &self, + branch_name: &str, + tenant_id: ZTenantId, + ) -> Option { + let tenant_id = HexZTenantId::from(tenant_id); + self.branch_name_mappings + .get(branch_name)? + .iter() + .find(|(mapped_tenant_id, _)| mapped_tenant_id == &tenant_id) + .map(|&(_, timeline_id)| timeline_id) + .map(ZTimelineId::from) + } + + pub fn timeline_name_mappings(&self) -> HashMap { + self.branch_name_mappings + .iter() + .flat_map(|(name, tenant_timelines)| { + tenant_timelines.iter().map(|&(tenant_id, timeline_id)| { + ( + ZTenantTimelineId::new( + ZTenantId::from(tenant_id), + ZTimelineId::from(timeline_id), + ), + name.clone(), + ) + }) + }) + .collect() } /// Create a LocalEnv from a config file. @@ -179,8 +258,8 @@ impl LocalEnv { } // If no initial tenant ID was given, generate it. - if env.default_tenantid.is_none() { - env.default_tenantid = Some(ZTenantId::generate()); + if env.default_tenant_id.is_none() { + env.default_tenant_id = Some(HexZTenantId::from(ZTenantId::generate())); } env.base_data_dir = base_path(); @@ -210,6 +289,39 @@ impl LocalEnv { Ok(env) } + pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> { + // Currently, the user first passes a config file with 'zenith init --config=' + // We read that in, in `create_config`, and fill any missing defaults. Then it's saved + // to .zenith/config. 
TODO: We lose any formatting and comments along the way, which is
+        // a bit sad.
+        let mut conf_content = r#"# This file describes a local deployment of the page server
+# and safekeeper node. It is read by the 'zenith' command-line
+# utility.
+"#
+        .to_string();
+
+        // Convert the LocalEnv to a toml file.
+        //
+        // This could be as simple as this:
+        //
+        //     conf_content += &toml::to_string_pretty(env)?;
+        //
+        // But it results in a "values must be emitted before tables" error. I'm not sure
+        // why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
+        // Maybe Rust reorders the fields to avoid padding or something?
+        // In any case, converting to toml::Value first, and serializing that, works.
+        // See https://github.com/alexcrichton/toml-rs/issues/142
+        conf_content += &toml::to_string_pretty(&toml::Value::try_from(self)?)?;
+
+        let target_config_path = base_path.join("config");
+        fs::write(&target_config_path, conf_content).with_context(|| {
+            format!(
+                "Failed to write config file into path '{}'",
+                target_config_path.display()
+            )
+        })
+    }
+
     // this function is used only for testing purposes in CLI e g generate tokens during init
     pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
         let private_key_path = if self.private_key_path.is_absolute() {
@@ -228,15 +340,15 @@ impl LocalEnv {
     pub fn init(&mut self) -> anyhow::Result<()> {
         // check if config already exists
         let base_path = &self.base_data_dir;
-        if base_path == Path::new("") {
-            bail!("repository base path is missing");
-        }
-        if base_path.exists() {
-            bail!(
-                "directory '{}' already exists. Perhaps already initialized?",
-                base_path.to_str().unwrap()
-            );
-        }
+        ensure!(
+            base_path != Path::new(""),
+            "repository base path is missing"
+        );
+        ensure!(
+            !base_path.exists(),
+            "directory '{}' already exists. Perhaps already initialized?",
+            base_path.display()
+        );
 
         fs::create_dir(&base_path)?;
 
@@ -285,39 +397,10 @@ impl LocalEnv {
         fs::create_dir_all(self.pg_data_dirs_path())?;
 
         for safekeeper in &self.safekeepers {
-            fs::create_dir_all(self.safekeeper_data_dir(&safekeeper.name))?;
+            fs::create_dir_all(SafekeeperNode::datadir_path_by_id(self, safekeeper.id))?;
         }
 
-        let mut conf_content = String::new();
-
-        // Currently, the user first passes a config file with 'zenith init --config='
-        // We read that in, in `create_config`, and fill any missing defaults. Then it's saved
-        // to .zenith/config. TODO: We lose any formatting and comments along the way, which is
-        // a bit sad.
-        write!(
-            &mut conf_content,
-            r#"# This file describes a locale deployment of the page server
-# and safekeeeper node. It is read by the 'zenith' command-line
-# utility.
-"#
-        )?;
-
-        // Convert the LocalEnv to a toml file.
-        //
-        // This could be as simple as this:
-        //
-        //     conf_content += &toml::to_string_pretty(env)?;
-        //
-        // But it results in a "values must be emitted before tables". I'm not sure
-        // why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
-        // Maybe rust reorders the fields to squeeze avoid padding or something?
-        // In any case, converting to toml::Value first, and serializing that, works.
- // See https://github.com/alexcrichton/toml-rs/issues/142 - conf_content += &toml::to_string_pretty(&toml::Value::try_from(&self)?)?; - - fs::write(base_path.join("config"), conf_content)?; - - Ok(()) + self.persist_config(base_path) } } diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index f5478b5922..969e2cd531 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -14,7 +14,9 @@ use postgres::Config; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; use thiserror::Error; +use walkeeper::http::models::TimelineCreateRequest; use zenith_utils::http::error::HttpErrorBody; +use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; use crate::local_env::{LocalEnv, SafekeeperConf}; use crate::storage::PageServerNode; @@ -61,7 +63,7 @@ impl ResponseErrorMessageExt for Response { // #[derive(Debug)] pub struct SafekeeperNode { - pub name: String, + pub id: ZNodeId, pub conf: SafekeeperConf, @@ -77,10 +79,10 @@ impl SafekeeperNode { pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode { let pageserver = Arc::new(PageServerNode::from_env(env)); - println!("initializing for {} for {}", conf.name, conf.http_port); + println!("initializing for sk {} for {}", conf.id, conf.http_port); SafekeeperNode { - name: conf.name.clone(), + id: conf.id, conf: conf.clone(), pg_connection_config: Self::safekeeper_connection_config(conf.pg_port), env: env.clone(), @@ -98,8 +100,12 @@ impl SafekeeperNode { .unwrap() } + pub fn datadir_path_by_id(env: &LocalEnv, sk_id: ZNodeId) -> PathBuf { + env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref()) + } + pub fn datadir_path(&self) -> PathBuf { - self.env.safekeeper_data_dir(&self.name) + SafekeeperNode::datadir_path_by_id(&self.env, self.id) } pub fn pid_file(&self) -> PathBuf { @@ -120,6 +126,7 @@ impl SafekeeperNode { let mut cmd = Command::new(self.env.safekeeper_bin()?); fill_rust_env_vars( cmd.args(&["-D", self.datadir_path().to_str().unwrap()]) + .args(&["--id", self.id.to_string().as_ref()]) .args(&["--listen-pg", &listen_pg]) .args(&["--listen-http", &listen_http]) .args(&["--recall", "1 second"]) @@ -183,7 +190,7 @@ impl SafekeeperNode { pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { let pid_file = self.pid_file(); if !pid_file.exists() { - println!("Safekeeper {} is already stopped", self.name); + println!("Safekeeper {} is already stopped", self.id); return Ok(()); } let pid = read_pidfile(&pid_file)?; @@ -255,4 +262,25 @@ impl SafekeeperNode { .error_from_body()?; Ok(()) } + + pub fn timeline_create( + &self, + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + peer_ids: Vec, + ) -> Result<()> { + Ok(self + .http_request( + Method::POST, + format!("{}/{}", self.http_base_url, "timeline"), + ) + .json(&TimelineCreateRequest { + tenant_id, + timeline_id, + peer_ids, + }) + .send()? + .error_from_body()? + .json()?) 
+ } } diff --git a/control_plane/src/storage.rs b/control_plane/src/storage.rs index be594889ab..f6b7173067 100644 --- a/control_plane/src/storage.rs +++ b/control_plane/src/storage.rs @@ -1,3 +1,4 @@ +use std::convert::TryFrom; use std::io::Write; use std::net::TcpStream; use std::path::PathBuf; @@ -5,22 +6,23 @@ use std::process::Command; use std::time::Duration; use std::{io, result, thread}; -use anyhow::bail; +use anyhow::{bail, Context}; use nix::errno::Errno; use nix::sys::signal::{kill, Signal}; use nix::unistd::Pid; -use pageserver::http::models::{BranchCreateRequest, TenantCreateRequest}; +use pageserver::http::models::{TenantCreateRequest, TimelineCreateRequest, TimelineInfoResponse}; +use pageserver::timelines::TimelineInfo; use postgres::{Config, NoTls}; use reqwest::blocking::{Client, RequestBuilder, Response}; use reqwest::{IntoUrl, Method}; use thiserror::Error; use zenith_utils::http::error::HttpErrorBody; +use zenith_utils::lsn::Lsn; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::ZTenantId; +use zenith_utils::zid::{HexZTenantId, HexZTimelineId, ZTenantId, ZTimelineId}; use crate::local_env::LocalEnv; use crate::{fill_rust_env_vars, read_pidfile}; -use pageserver::branches::BranchInfo; use pageserver::tenant_mgr::TenantInfo; use zenith_utils::connstring::connection_address; @@ -98,11 +100,14 @@ impl PageServerNode { pub fn init( &self, - create_tenant: Option<&str>, + create_tenant: Option, + initial_timeline_id: Option, config_overrides: &[&str], - ) -> anyhow::Result<()> { + ) -> anyhow::Result { let mut cmd = Command::new(self.env.pageserver_bin()?); + let id = format!("id={}", self.env.pageserver.id); + // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc. let base_data_dir_param = self.env.base_data_dir.display().to_string(); let pg_distrib_dir_param = @@ -122,6 +127,7 @@ impl PageServerNode { args.extend(["-c", &authg_type_param]); args.extend(["-c", &listen_http_addr_param]); args.extend(["-c", &listen_pg_addr_param]); + args.extend(["-c", &id]); for config_override in config_overrides { args.extend(["-c", config_override]); @@ -134,19 +140,24 @@ impl PageServerNode { ]); } - if let Some(tenantid) = create_tenant { - args.extend(["--create-tenant", tenantid]) + let create_tenant = create_tenant.map(|id| id.to_string()); + if let Some(tenant_id) = create_tenant.as_deref() { + args.extend(["--create-tenant", tenant_id]) } - let status = fill_rust_env_vars(cmd.args(args)) - .status() - .expect("pageserver init failed"); + let initial_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate); + let initial_timeline_id_string = initial_timeline_id.to_string(); + args.extend(["--initial-timeline-id", &initial_timeline_id_string]); - if !status.success() { + let init_output = fill_rust_env_vars(cmd.args(args)) + .output() + .context("pageserver init failed")?; + + if !init_output.status.success() { bail!("pageserver init failed"); } - Ok(()) + Ok(initial_timeline_id) } pub fn repo_path(&self) -> PathBuf { @@ -307,7 +318,7 @@ impl PageServerNode { } pub fn check_status(&self) -> Result<()> { - self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status")) + self.http_request(Method::GET, format!("{}/status", self.http_base_url)) .send()? 
.error_from_body()?;
         Ok(())
     }
 
@@ -315,64 +326,76 @@ impl PageServerNode {
     pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
         Ok(self
-            .http_request(Method::GET, format!("{}/{}", self.http_base_url, "tenant"))
+            .http_request(Method::GET, format!("{}/tenant", self.http_base_url))
             .send()?
             .error_from_body()?
             .json()?)
     }
 
-    pub fn tenant_create(&self, tenantid: ZTenantId) -> Result<()> {
-        Ok(self
-            .http_request(Method::POST, format!("{}/{}", self.http_base_url, "tenant"))
+    pub fn tenant_create(
+        &self,
+        new_tenant_id: Option<ZTenantId>,
+    ) -> anyhow::Result<Option<ZTenantId>> {
+        let tenant_id_string = self
+            .http_request(Method::POST, format!("{}/tenant", self.http_base_url))
             .json(&TenantCreateRequest {
-                tenant_id: tenantid,
+                new_tenant_id: new_tenant_id.map(HexZTenantId::from),
             })
             .send()?
             .error_from_body()?
-            .json()?)
+            .json::<Option<String>>()?;
+
+        tenant_id_string
+            .map(|id| {
+                id.parse().with_context(|| {
+                    format!(
+                        "Failed to parse tenant creation response as tenant id: {}",
+                        id
+                    )
+                })
+            })
+            .transpose()
     }
 
-    pub fn branch_list(&self, tenantid: &ZTenantId) -> Result<Vec<BranchInfo>> {
-        Ok(self
+    pub fn timeline_list(&self, tenant_id: &ZTenantId) -> anyhow::Result<Vec<TimelineInfo>> {
+        let timeline_infos: Vec<TimelineInfoResponse> = self
             .http_request(
                 Method::GET,
-                format!("{}/branch/{}", self.http_base_url, tenantid),
+                format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
             )
             .send()?
             .error_from_body()?
-            .json()?)
+            .json()?;
+
+        timeline_infos
+            .into_iter()
+            .map(TimelineInfo::try_from)
+            .collect()
     }
 
-    pub fn branch_create(
+    pub fn timeline_create(
         &self,
-        branch_name: &str,
-        startpoint: &str,
-        tenantid: &ZTenantId,
-    ) -> Result<BranchInfo> {
-        Ok(self
-            .http_request(Method::POST, format!("{}/branch", self.http_base_url))
-            .json(&BranchCreateRequest {
-                tenant_id: tenantid.to_owned(),
-                name: branch_name.to_owned(),
-                start_point: startpoint.to_owned(),
+        tenant_id: ZTenantId,
+        new_timeline_id: Option<ZTimelineId>,
+        ancestor_start_lsn: Option<Lsn>,
+        ancestor_timeline_id: Option<ZTimelineId>,
+    ) -> anyhow::Result<Option<TimelineInfo>> {
+        let timeline_info_response = self
+            .http_request(
+                Method::POST,
+                format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
+            )
+            .json(&TimelineCreateRequest {
+                new_timeline_id: new_timeline_id.map(HexZTimelineId::from),
+                ancestor_start_lsn,
+                ancestor_timeline_id: ancestor_timeline_id.map(HexZTimelineId::from),
             })
             .send()?
             .error_from_body()?
-            .json()?)
-    }
+            .json::<Option<TimelineInfoResponse>>()?;
 
-    pub fn branch_get_by_name(
-        &self,
-        tenantid: &ZTenantId,
-        branch_name: &str,
-    ) -> Result<BranchInfo> {
-        Ok(self
-            .http_request(
-                Method::GET,
-                format!("{}/branch/{}/{}", self.http_base_url, tenantid, branch_name),
-            )
-            .send()?
-            .error_for_status()?
-            .json()?)
+        timeline_info_response
+            .map(TimelineInfo::try_from)
+            .transpose()
     }
 }
diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh
index 45c41b4c19..93bb5f9cd7 100755
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -4,7 +4,7 @@ set -eux
 
 if [ "$1" = 'pageserver' ]; then
-d "/data/tenants" ]; then echo "Initializing pageserver data directory" - pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" + pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10" fi echo "Staring pageserver at 0.0.0.0:6400" pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /data diff --git a/docs/docker.md b/docs/docker.md index 14ba2146cb..cc54d012dd 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -7,32 +7,14 @@ Currently we build two main images: - [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). - [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres). -And two intermediate images used either to reduce build time or to deliver some additional binary tools from other repos: +And additional intermediate images: -- [zenithdb/build](https://hub.docker.com/repository/docker/zenithdb/build) — image with all the dependencies required to build Zenith and compute node images. This image is based on `rust:slim-buster`, so it also has a proper `rust` environment. Built from [/Dockerfile.build](/Dockerfile.build). - [zenithdb/compute-tools](https://hub.docker.com/repository/docker/zenithdb/compute-tools) — compute node configuration management tools. ## Building pipeline 1. Image `zenithdb/compute-tools` is re-built automatically. -2. Image `zenithdb/build` is built manually. If you want to introduce any new compile time dependencies to Zenith or compute node you have to update this image as well, build it and push to Docker Hub. +2. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo. -Build: -```sh -docker build -t zenithdb/build:buster -f Dockerfile.build . -``` - -Login: -```sh -docker login -``` - -Push to Docker Hub: -```sh -docker push zenithdb/build:buster -``` - -3. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo. - -4. Image `zenithdb/zenith` is built in this repo after a successful `release` tests run and pushed to Docker Hub automatically. +3. Image `zenithdb/zenith` is built in this repo after a successful `release` tests run and pushed to Docker Hub automatically. diff --git a/docs/rfcs/002-storage.md b/docs/rfcs/002-storage.md new file mode 100644 index 0000000000..5cac377272 --- /dev/null +++ b/docs/rfcs/002-storage.md @@ -0,0 +1,186 @@ +# Zenith storage node — alternative + +## **Design considerations** + +Simplify storage operations for people => Gain adoption/installs on laptops and small private installation => Attract customers to DBaaS by seamless integration between our tooling and cloud. + +Proposed architecture addresses: + +- High availability -- tolerates n/2 - 1 failures +- Multi-tenancy -- one storage for all databases +- Elasticity -- increase storage size on the go by adding nodes +- Snapshots / backups / PITR with S3 offload +- Compression + +Minuses are: + +- Quite a lot of work +- Single page access may touch few disk pages +- Some bloat in data — may slowdown sequential scans + +## **Summary** + +Storage cluster is sharded key-value store with ordered keys. 
+A key (**page_key**) is a tuple of `(pg_id, db_id, timeline_id, rel_id, forkno, segno, pageno, lsn)`. A value is either a page or a page diff/wal. Each chunk (chunk == shard) stores approx 50-100GB ~~and automatically splits in half when it grows bigger than a soft 100GB limit~~ by having a fixed range of pageno's it is responsible for. Chunk placement on storage nodes is stored in a separate metadata service, so a chunk can be freely moved around the cluster if needed. A chunk itself is a filesystem directory with the following subdirectories:
+
+```
+
+|-chunk_42/
+    |-store/ -- contains lsm with pages/pagediffs ranging from
+    |           page_key_lo to page_key_hi
+    |-wal/
+    |  |- db_1234/ db-specific wal files with pages from page_key_lo
+    |              to page_key_hi
+    |
+    |-chunk.meta -- small file with snapshot references
+                    (page_key_prefix+lsn+name)
+                    and PITR regions (page_key_start, page_key_end)
+```
+
+## **Chunk**
+
+A chunk is responsible for storing pages, potentially from different databases and relations. Each page is addressed by a lexicographically ordered tuple (**page_key**) with the following fields:
+
+- `pg_id` -- unique id of a given postgres instance (or postgres cluster as it is called in postgres docs)
+- `db_id` -- database that was created by 'CREATE DATABASE' in a given postgres instance
+- `db_timeline` -- used to create Copy-on-Write instances from snapshots, described later
+- `rel_id` -- tuple of (relation_id, 0) for tables and (indexed_relation_id, rel_id) for indices. Done this way so that table indices are closer to the table itself in our global key space.
+- `(forkno, segno, pageno)` -- page coordinates in postgres data files
+- `lsn_timeline` -- postgres feature, increments when PITR was done.
+- `lsn` -- lsn of the current page version.
+
+A chunk stores pages and page diffs ranging from page_key_lo to page_key_hi. The processing node looks at the page in a wal record and sends the record to the chunk responsible for this page range. When a wal record arrives at a chunk it is initially stored in `chunk_id/wal/db_id/wal_segno.wal`. Then a background process moves records from those wal files to the lsm tree in `chunk_id/store`. Or, more precisely, wal records are materialized into the lsm memtable, and when that memtable is flushed to an SSTable on disk we may trim the wal. That way some pages that are not durably committed (in the distributed sense) may enter the tree -- here we rely on processing node behavior: a page request from the processing node should contain proper lsm horizons so that the storage node may respond with the proper page version.
+
+The LSM here is a usual LSM for variable-length values: at first, data is stored in memory (we hold the incoming wal records to be able to regenerate it after restart) in some balanced tree. When this tree grows big enough we dump it into a disk file (SSTable), sorting records by key. Then SSTables are merge-sorted in the background into different files. All file operations are sequential and do not require WAL for durability.
+
+The content of an SSTable can be the following:
+
+```jsx
+(pg_id, db_id, ... , pageno=42, lsn=100) (full 8k page data)
+(pg_id, db_id, ... , pageno=42, lsn=150) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=180) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=200) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=220) (full 8k page data)
+(pg_id, db_id, ... , pageno=42, lsn=250) (per-page diff)
+(pg_id, db_id, ... , pageno=42, lsn=270) (per-page diff)
+(pg_id, db_id, ... , pageno=5000, lsn=100) (full 8k page data)
+```
+
+So a query for `pageno=42 up to lsn=260` would need to find the closest entry less than this key, iterate back to the latest full page, and iterate forward to apply the diffs. How often a page is materialized in the lsn-version sequence is up to us -- let's say every 5th version should be a full page.
+
+### **Page deletion**
+
+To delete old pages we insert a blind deletion marker `(pg_id, db_id, #trim_lsn < 150)` into the lsm tree. During merges such a marker would indicate that all pages with a smaller lsn should be discarded. The delete marker will travel down the tree level hierarchy until it reaches the last level. In a non-PITR scenario where old page versions are not needed at all, such a deletion marker would (on average) prevent old page versions from propagating down the tree -- so all bloat would concentrate at the higher tree layers without affecting the bigger bottom layers.
+
+### **Recovery**
+
+Upon storage node restart, recent WAL files are applied to the appropriate pages and the resulting pages are stored in the lsm memtable. So this should be fast, since we are not writing anything to disk.
+
+### **Checkpointing**
+
+No such mechanism is needed. Or we may look at the storage node as a kind of continuous checkpointer.
+
+### **Full page writes (torn page protection)**
+
+The storage node never updates individual pages, only merges SSTables, so torn pages are not an issue.
+
+### **Snapshot**
+
+That is the part that I like about this design -- snapshot creation is an instant and cheap operation that can have a flexible granularity level: whole instance, database, table. Snapshot creation inserts a record in the `chunk.meta` file with the lsn of this snapshot and a key prefix `(pg_id, db_id, db_timeline, rel_id, *)` that prohibits page deletion within this range. The storage node may not know anything about page internals, but by changing the number of fields in our prefix we may change the snapshot granularity.
+
+It is again useful to remap `rel_id` to `(indexed_relation_id, rel_id)` so that a snapshot of a relation would include its indices. Also, table snapshots would interact trickily with the catalog. Probably all table snapshots should also hold a catalog snapshot. And when a node is started with such a snapshot it should check that only tables from the snapshot are queried. I assume here that for snapshot reading one needs to start a new postgres instance.
+
+Storage consumed by a snapshot is proportional to the amount of data changed. We may have some heuristic (calculated based on the cost of different storages) about when to offload an old snapshot to s3. For example, if the current database has more than 40% of changed pages with respect to the previous snapshot then we may offload that snapshot to s3, and release this space.
+
+**Starting db from snapshot**
+
+When we are starting a database from a snapshot it can be done in two ways. First, we may create a new db_id, move all the data from the snapshot to a new db and start a database. The second option is to create a Copy-on-Write (CoW) instance out of the snapshot, reading old pages from the old snapshot and storing new pages separately. That is why there is a `db_timeline` key field near `db_id` -- a CoW (🐮) database should create a new `db_timeline` and remember the old `db_timeline`. Such a database can have a hashmap of the pages that it has changed, to query pages from the proper snapshot on the first try. `db_timeline` is located near `db_id` so that new page versions generated by the new instance would not bloat the data of the initial snapshot.
+It is not clear whether it is possible to efficiently support "stacked" CoW snapshots, so we may disallow them. (Well, one way to support them is to move `db_timeline` close to `lsn` -- so we may scan neighboring pages and find the right one. But again, that way we bloat the snapshot with unrelated data and may slow down full scans that are happening in a different database.)
+
+**Snapshot export/import**
+
+Once we can start CoW instances it is easy to run an auxiliary postgres instance on this snapshot and run `COPY FROM (...) TO stdout` or `pg_dump` to export data from the snapshot to some portable format. Also, we may start postgres on a new empty database and run `COPY FROM stdin`. This way we can initialize new non-CoW databases and transfer snapshots via the network.
+
+### **PITR area**
+
+In the described scheme, PITR is just a prohibition on deleting any versions within some key prefix, be it a database or a table key prefix. So PITR may have different settings for different tables, databases, etc.
+
+PITR is quite bloaty, so we may aggressively offload it to s3 -- we may push the same (or bigger) SSTables to s3 and maintain the lsm structure there.
+
+### **Compression**
+
+Since we are storing page diffs of variable sizes there is no structural dependency on the page size and we may compress them. Again, that could be enabled only on pages with some key prefixes, so we may have this with db/table granularity.
+
+### **Chunk metadata**
+
+Chunk metadata is a file that lies in the chunk directory and stores info about current snapshots and PITR regions. The chunk should always consult this data when merging SSTables and applying delete markers.
+
+### **Chunk splitting**
+
+*(NB: the following paragraph is about how to avoid page splitting)*
+
+When a chunk hits some soft storage limit (let's say 100GB) it should be split in half and the global metadata about chunk boundaries should be updated. Here I assume that a chunk split is a local operation happening on a single node. The process of chunk splitting should look like the following:
+
+1. Find the separation key and spawn two new chunks with [lo, mid) and [mid, hi) boundaries.
+
+2. Prohibit WAL deletion and old SSTable deletion on the original chunk.
+
+3. On each lsm layer we would need to split only one SSTable; all others would fit within either the left or the right range. Symlink/split those files to the new chunks.
+
+4. Start WAL replay on the new chunks.
+
+5. Update the global metadata about the new chunk boundaries.
+
+6. Eventually (the metadata update should be pushed to the processing node by the metadata service) the storage node will start sending WAL and page requests to the new nodes.
+
+7. A new chunk may start serving read queries when the following conditions are met:
+
+a) it has received at least one WAL record from the processing node
+
+b) it has replayed all WAL up to the newly received one
+
+c) it has checked by downlinks that there were no WAL gaps.
+
+A chunk split as described here is a quite fast operation when it happens on a local disk -- the vast majority of files will just be moved without copying anything. I suggest keeping splits always local and not mixing them with moving chunks around the cluster. So if we want to split some chunk but there is only a small amount of free space left on the device, we should first move some chunks away from the node and then proceed with splitting.
+
+### Fixed chunks
+
+An alternative strategy is not to split at all and have pageno-fixed chunk boundaries. When a table is created, we first materialize its chunk by storing only the first new pages, so the chunk is small.
+Then the chunk grows while the table is filled, but it can't grow substantially bigger than the allowed pageno range, so at most it would be 1GB or whatever limit we want, plus some bloat due to snapshots and old page versions.
+
+### **Chunk lsm internals**
+
+So how to implement the chunk's lsm?
+
+- Write it from scratch and use RocksDB to prototype/benchmark, then switch to our own lsm implementation. RocksDB can provide a sanity check for the performance of the home-brewed implementation, and it would be easier to prototype.
+- Use postgres as a lego constructor. We may model the memtable with a postgres B-tree referencing some in-memory log of incoming records. SSTable merging may reuse the postgres external merging algorithm, etc. One thing that would definitely not fit (or I didn't come up with an idea of how to fit it) is multi-tenancy. If we are storing pages from different databases we can't use the postgres buffer pool, since there is no db_id in the page header. We could add a new field there, but IMO it would be a no-go for committing that to vanilla.
+
+Another possibility is not to try to fit several databases in one storage node. But that way it is a no-go for a multi-tenant cloud installation: we would need to run a lot of storage node instances on one physical storage node, each with its own local page cache. So that would be much closer to an ordinary managed RDS.
+
+Multi-tenant storage makes sense even on a laptop, when you work with different databases, run tests with a temp database, etc. And when an installation grows bigger it starts to make more and more sense, so it seems important.
+
+# **Storage fleet**
+
+- When databases are smaller than the chunk size we can naturally store several of them in one chunk (since their page_key would fit in some chunk's [lo, hi) range).
+
+Screenshot_2021-02-22_at_16 49 17
+
+A few databases are stored in one chunk, replicated three times
+
+- When a database can't fit into one storage node it can occupy lots of chunks that were split while the database was growing. Chunk placement on nodes is controlled by us with some automation, but we may always manually move chunks around the cluster.
+
+Screenshot_2021-02-22_at_16 49 10
+
+Here one big database occupies two sets of nodes. Also, some chunks were moved around to restore the replication factor after a disk failure. In this case we also have "sharded" storage for a big database and issue wal writes to different chunks in parallel.
+
+## **Chunk placement strategies**
+
+There are a few scenarios where we may want to move chunks around the cluster:
+
+- disk usage on some node is big
+- some disk experienced a failure
+- some node experienced a failure or needs maintenance
+
+## **Chunk replication**
+
+Chunk replication may be done by cloning page ranges with respect to some lsn from peer nodes, updating the global metadata, waiting for WAL to come, replaying the previous WAL and becoming online -- more or less like during a chunk split.
+
diff --git a/docs/rfcs/003-laptop-cli.md b/docs/rfcs/003-laptop-cli.md
new file mode 100644
index 0000000000..4d1f0a68f0
--- /dev/null
+++ b/docs/rfcs/003-laptop-cli.md
@@ -0,0 +1,267 @@
+# Command line interface (end-user)
+
+The Zenith CLI as described here mostly resides on the same conceptual level as pg_ctl/initdb/pg_recvxlog/etc. and replaces some of them in an opinionated way. I would also suggest bundling our patched postgres inside the zenith distribution, at least at the start.
+
+This proposal is focused on managing local installations. For cluster operations, different tooling would be needed.
+The point of integration between the two is a storage URL: no matter how complex the cluster setup is, it may provide an endpoint where the user may push snapshots.
+
+The most important concept here is a snapshot, which can be created/pushed/pulled/exported. Also, we may start a temporary read-only postgres instance over any local snapshot. A more complex scenario would consist of several basic operations over snapshots.
+
+# Possible usage scenarios
+
+## Install zenith, run a postgres
+
+```
+> brew install pg-zenith
+> zenith pg create # creates pgdata with default pattern pgdata$i
+> zenith pg list
+ID        PGDATA   USED  STORAGE       ENDPOINT
+primary1  pgdata1  0G    zenith-local  localhost:5432
+```
+
+## Import standalone postgres to zenith
+
+```
+> zenith snapshot import --from=basebackup://replication@localhost:5432/ oldpg
+[====================------------] 60% | 20MB/s
+> zenith snapshot list
+ID     SIZE  PARENT
+oldpg  5G    -
+
+> zenith pg create --snapshot oldpg
+Started postgres on localhost:5432
+
+> zenith pg list
+ID        PGDATA   USED  STORAGE       ENDPOINT
+primary1  pgdata1  5G    zenith-local  localhost:5432
+
+> zenith snapshot destroy oldpg
+Ok
+```
+
+Also, we may start the snapshot import implicitly by looking at the snapshot URL scheme
+
+```
+> zenith pg create --snapshot basebackup://replication@localhost:5432/
+Downloading snapshot... Done.
+Started postgres on localhost:5432
+Destroying snapshot... Done.
+```
+
+## Pull snapshot with some publicly shared database
+
+Since we may export the whole snapshot as one big file (a tar of a basebackup, maybe with some manifest) it may be shared over conventional means: http, ssh, [git+lfs](https://docs.github.com/en/github/managing-large-files/about-git-large-file-storage).
+
+```
+> zenith pg create --snapshot http://learn-postgres.com/movies_db.zenith movies
+```
+
+## Create snapshot and push it to the cloud
+
+```
+> zenith snapshot create pgdata1@snap1
+> zenith snapshot push --to ssh://stas@zenith.tech pgdata1@snap1
+```
+
+## Rollback database to the snapshot
+
+One way to roll back the database is just to init a new database from the snapshot and destroy the old one. But creating a new database from a snapshot would require a copy of that snapshot, which is a time-consuming operation. Another option that would be cool to support is the ability to create a copy-on-write database from the snapshot without copying data, and store updated pages in a separate location; however, that would have performance implications. So to properly roll back the database to an older state we have `zenith pg checkout`.
+
+```
+> zenith pg list
+ID        PGDATA   USED  STORAGE       ENDPOINT
+primary1  pgdata1  5G    zenith-local  localhost:5432
+
+> zenith snapshot create pgdata1@snap1
+
+> zenith snapshot list
+ID               SIZE  PARENT
+oldpg            5G    -
+pgdata1@snap1    6G    -
+pgdata1@CURRENT  6G    -
+
+> zenith pg checkout pgdata1@snap1
+Stopping postgres on pgdata1.
+Rolling back pgdata1@CURRENT to pgdata1@snap1.
+Starting postgres on pgdata1.
+
+> zenith snapshot list
+ID               SIZE  PARENT
+oldpg            5G    -
+pgdata1@snap1    6G    -
+pgdata1@HEAD{0}  6G    -
+pgdata1@CURRENT  6G    -
+```
+
+Some notes: pgdata1@CURRENT is an implicit snapshot representing the current state of the database in the data directory. When we check out some snapshot, CURRENT will be set to that snapshot, and the old CURRENT state will be named HEAD{0} (0 is the number of the postgres timeline; it would be incremented after each such checkout).
+
+## Configure PITR area (Point In Time Recovery)
+
+The PITR area acts like a continuous snapshot where you can reset the database to any point in time within this area (by area I mean some TTL period or some size limit, both possibly infinite).
+
+```
+> zenith pitr create --storage s3tank --ttl 30d --name pitr_last_month
+```
+
+Resetting the database to some state in the past would require creating a snapshot at some lsn / time in this pitr area.
+
+# Manual
+
+## storage
+
+Storage is either a zenith pagestore or s3. Users may create a database in a pagestore and create/move *snapshots* and *pitr regions* in both pagestores and s3. Storage is a concept similar to `git remote`. After installation, I imagine one local storage is available by default.
+
+**zenith storage attach** -t [native|s3] -c key=value -n name
+
+Attaches/initializes storage. For --type=s3, user credentials and a path should be provided. For --type=native we may support --path=/local/path and --url=zenith.tech/stas/mystore. Another possible term for native is 'zstore'.
+
+
+**zenith storage list**
+
+Shows the currently attached storages. For example:
+
+```
+> zenith storage list
+NAME         USED   TYPE           OPTIONS         PATH
+local        5.1G   zenith-local                   /opt/zenith/store/local
+local.compr  20.4G  zenith-local   compression=on  /opt/zenith/store/local.compr
+zcloud       60G    zenith-remote                  zenith.tech/stas/mystore
+s3tank       80G    S3
+```
+
+**zenith storage detach**
+
+**zenith storage show**
+
+
+
+## pg
+
+Manages postgres data directories and can start postgreses with a proper configuration. An experienced user may avoid using this (except pg create) and configure/run postgres by themselves.
+
+Pg is a term for a single postgres running on some data. I'm trying to avoid separating datadir management from postgres instance management here -- both concepts are bundled together.
+
+**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata
+
+Creates (initializes) a new data directory in the given storage and starts postgres. I imagine that the storage for this operation may only be local, and data movement to a remote location happens through snapshots/pitr.
+
+--no-start: just init the datadir without starting postgres
+
+--snapshot snap: init from the snapshot. Snap is a name or URL (zenith.tech/stas/mystore/snap1)
+
+--cow: initialize a Copy-on-Write data directory on top of some snapshot (makes sense if it is a snapshot of a currently running database)
+
+**zenith pg destroy**
+
+**zenith pg start** [--replica] pgdata
+
+Starts postgres with the proper extensions preloaded/installed.
+
+**zenith pg checkout**
+
+Rolls back the data directory to some previous snapshot.
+
+**zenith pg stop** pg_id
+
+**zenith pg list**
+
+```
+ROLE       PGDATA  USED  STORAGE      ENDPOINT
+primary    my_pg   5.1G  local        localhost:5432
+replica-1                             localhost:5433
+replica-2                             localhost:5434
+primary    my_pg2  3.2G  local.compr  localhost:5435
+-          my_pg3  9.2G  local.compr  -
+```
+
+**zenith pg show**
+
+```
+my_pg:
+  storage: local
+  space used on local: 5.1G
+  space used on all storages: 15.1G
+  snapshots:
+    on local:
+      snap1: 1G
+      snap2: 1G
+    on zcloud:
+      snap2: 1G
+    on s3tank:
+      snap5: 2G
+  pitr:
+    on s3tank:
+      pitr_one_month: 45G
+
+```
+
+**zenith pg start-rest/graphql** pgdata
+
+Starts a REST/GraphQL proxy on top of the postgres master. Not sure we should do that, just an idea.
+
+
+## snapshot
+
+Snapshot creation is cheap -- no actual data is copied, we just start retaining old pages. Snapshot size means the amount of retained data, not all data. A snapshot name looks like pgdata_name@tag_name. tag_name is set by the user during snapshot creation.
+There are some reserved tag names: CURRENT represents the current state of the data directory; HEAD{i} represents the data directory state that resided in the database before the i-th checkout.
+
+**zenith snapshot create** pgdata_name@snap_name
+
+Creates a new snapshot in the same storage where pgdata_name exists.
+
+**zenith snapshot push** --to url pgdata_name@snap_name
+
+Produces a binary stream of a given snapshot. Under the hood it starts a temp read-only postgres over this snapshot and sends a basebackup stream. The receiving side should start `zenith snapshot recv` before the push happens. If the url has some special scheme like zenith://, the receiving side may require auth and start `zenith snapshot recv` on the go.
+
+**zenith snapshot recv**
+
+Starts listening on a port for a basebackup stream, prints connection info to stdout (so that the user may use that in the push command), and expects data on that socket.
+
+**zenith snapshot pull** --from url or path
+
+Connects to a remote zenith/s3/file and pulls a snapshot. The remote side should be a zenith service or files in our format.
+
+**zenith snapshot import** --from basebackup://<...> or path
+
+Creates a new snapshot out of a running postgres via the basebackup protocol, or from basebackup files.
+
+**zenith snapshot export**
+
+Starts a read-only postgres over this snapshot and exports data in some format (pg_dump, or COPY TO on some/all tables). One of the options may be zenith's own format, which is handy for us (but I think just a tar of a basebackup would be okay).
+
+**zenith snapshot diff** snap1 snap2
+
+Shows the size of the data changed between two snapshots. We may also provide options to diff schema/data in tables. To do that, start temp read-only postgreses.
+
+**zenith snapshot destroy**
+
+## pitr
+
+A pitr represents a wal stream and a ttl policy for that stream.
+
+XXX: any suggestions on a better name?
+
+**zenith pitr create** name
+
+--ttl = inf | period
+
+--size-limit = inf | limit
+
+--storage = storage_name
+
+**zenith pitr extract-snapshot** pitr_name --lsn xxx
+
+Creates a snapshot out of some lsn in a PITR area. The obtained snapshot may be managed with the snapshot routines (move/send/export).
+
+**zenith pitr gc** pitr_name
+
+Forces garbage collection on some PITR area.
+
+**zenith pitr list**
+
+**zenith pitr destroy**
+
+
+## console
+
+**zenith console**
+
+Opens a browser targeted at the web console, with more or less the same functionality as described here.
diff --git a/docs/rfcs/004-durability.md b/docs/rfcs/004-durability.md
new file mode 100644
index 0000000000..4543be3dae
--- /dev/null
+++ b/docs/rfcs/004-durability.md
@@ -0,0 +1,218 @@
+Durability & Consensus
+======================
+
+When a transaction commits, a commit record is generated in the WAL.
+When do we consider the WAL record as durable, so that we can
+acknowledge the commit to the client and be reasonably certain that we
+will not lose the transaction?
+
+Zenith uses a group of WAL safekeeper nodes to hold the generated WAL.
+A WAL record is considered durable when it has been written to a
+majority of WAL safekeeper nodes. In this document, I use 5
+safekeepers, because I have five fingers. A WAL record is durable
+when at least 3 safekeepers have written it to disk.
+
+First, assume that only one primary node can be running at a
+time. This can be achieved by Kubernetes or etcd or some
+cloud-provider specific facility, or we can implement it
+ourselves. These options are discussed in later chapters. For now,
+assume that there is a Magic STONITH Fairy that ensures that.
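+
+To make the quorum rule concrete, here is a minimal sketch (illustrative
+only, not the actual safekeeper code) of how the primary could compute
+the durable position from the flush positions acknowledged by each
+safekeeper:
+
+```rust
+/// Given the flush LSN reported by each reachable safekeeper, return
+/// the highest LSN that a majority has durably written. With 5
+/// safekeepers the quorum is 3, so this is the 3rd-highest report.
+/// (LSNs are plain u64s here for brevity.)
+fn durable_lsn(mut acked: Vec<u64>, total_safekeepers: usize) -> Option<u64> {
+    let quorum = total_safekeepers / 2 + 1;
+    acked.sort_unstable_by(|a, b| b.cmp(a)); // sort descending
+    // If fewer than a quorum of safekeepers have acked, nothing is durable yet.
+    acked.get(quorum - 1).copied()
+}
+```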
+
+In addition to the WAL safekeeper nodes, the WAL is archived in
+S3. WAL that has been archived to S3 can be removed from the
+safekeepers, so the safekeepers don't need a lot of disk space.
+
+
+                            +----------------+
+                    +-----> | WAL safekeeper |
+                    |       +----------------+
+                    |       +----------------+
+                    +-----> | WAL safekeeper |
+ +------------+     |       +----------------+
+ | Primary    |     |       +----------------+
+ | Processing | ----+-----> | WAL safekeeper |
+ | Node       |     |       +----------------+
+ +------------+     |       +----------------+
+        \           +-----> | WAL safekeeper |
+         \          |       +----------------+
+          \         |       +----------------+
+           \        +-----> | WAL safekeeper |
+            \               +----------------+
+             \
+              \
+               \            +--------+
+                \           |        |
+                 +--------> |   S3   |
+                            |        |
+                            +--------+
+
+
+Every WAL safekeeper holds a section of WAL, and a VCL value.
+The WAL can be divided into three portions:
+
+
+                                     VCL                  LSN
+                                      |                    |
+                                      V                    V
+.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX
+Archived WAL     Completed WAL       In-flight WAL
+
+
+Note that all this WAL kept in a safekeeper is a contiguous section.
+This is different from Aurora: In Aurora, there can be holes in the
+WAL, and there is a Gossip protocol to fill the holes. That could be
+implemented in the future, but let's keep it simple for now. WAL needs
+to be written to a safekeeper in order. However, during crash
+recovery, In-flight WAL that has already been stored in a safekeeper
+can be truncated or overwritten.
+
+The Archived WAL has already been stored in S3, and can be removed from
+the safekeeper.
+
+The Completed WAL has been written to at least three safekeepers. The
+algorithm ensures that it is not lost when at most two nodes fail at
+the same time.
+
+The In-flight WAL has been persisted in the safekeeper, but if a crash
+happens, it may still be overwritten or truncated.
+
+
+The VCL point is determined in the Primary. It is not strictly
+necessary to store it in the safekeepers, but it allows some
+optimizations and sanity checks and is probably generally useful for
+the system as a whole. The VCL values stored in the safekeepers can lag
+behind the VCL computed by the primary.
+
+
+Primary node Normal operation
+-----------------------------
+
+1. Generate some WAL.
+
+2. Send the WAL to all the safekeepers that you can reach.
+
+3. As soon as a quorum of safekeepers have acknowledged that they have
+   received and durably stored the WAL up to that LSN, update the local VCL
+   value in memory, and acknowledge the commits to the clients.
+
+4. Send the new VCL to all the safekeepers that were part of the quorum.
+   (Optional)
+
+
+Primary Crash recovery
+----------------------
+
+When a new Primary node starts up, before it can generate any new WAL
+it needs to contact a majority of the WAL safekeepers to compute the
+VCL. Remember that there is a Magic STONITH fairy that ensures that
+only one node can be doing this at a time.
+
+1. Contact all WAL safekeepers. Find the Max((Epoch, LSN)) tuple among the ones you
+   can reach. This is the Winner safekeeper, and its LSN becomes the new VCL.
+
+2. Update the other safekeepers you can reach, by copying all the WAL
+   from the Winner, starting from each safekeeper's old VCL point. Any old
+   In-Flight WAL from a previous Epoch is truncated away.
+
+3. Increment the Epoch, and send the new Epoch to the quorum of
+   safekeepers. (This ensures that if any of the safekeepers that we
+   could not reach later come back online, they will be considered as
+   older than this in any future recovery)
+
+You can now start generating new WAL, starting from the newly-computed
+VCL.
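+
+As a rough sketch (illustrative names, not the real implementation),
+the winner selection in step 1 is just a lexicographic max over
+(Epoch, LSN) tuples:
+
+```rust
+/// State reported by one safekeeper during crash recovery.
+/// Plain u64s stand in for the real Epoch/Lsn types.
+#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+struct SafekeeperState {
+    epoch: u64,
+    lsn: u64,
+}
+
+/// The safekeeper with the highest (epoch, lsn) tuple is the Winner;
+/// its LSN becomes the new VCL. Field order matters: deriving Ord
+/// compares epoch first, then lsn, which is exactly Max((Epoch, LSN)).
+fn find_winner(reachable: &[SafekeeperState]) -> Option<SafekeeperState> {
+    reachable.iter().copied().max()
+}
+```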
+
+Optimizations
+-------------
+
+As described, the Primary node sends all the WAL to all the WAL safekeepers. That
+can be a lot of network traffic. Instead of sending the WAL directly from the Primary,
+some safekeepers can be daisy-chained off other safekeepers, or there can be a
+broadcast mechanism among them. There should still be a direct connection from
+each safekeeper to the Primary for the acknowledgments though.
+
+Similarly, the responsibility for archiving WAL to S3 can be delegated to one of
+the safekeepers, to reduce the load on the primary.
+
+
+Magic STONITH fairy
+-------------------
+
+Now that we have a system that works as long as only one primary node is running at a time, how
+do we ensure that?
+
+1. Use etcd to grant a lease on a key. The primary node is only allowed to operate as primary
+   when it's holding a valid lease. If the primary node dies, the lease expires after a timeout
+   period, and a new node is allowed to become the primary.
+
+2. Use S3 to store the lease. S3's consistency guarantees are more lenient, so in theory you
+   cannot do this safely. In practice, it would probably be OK if you make the lease times and
+   timeouts long enough. This has the advantage that we don't need to introduce a new
+   component to the architecture.
+
+3. Use Raft or Paxos, with the WAL safekeepers acting as the Acceptors to form the quorum. The
+   next chapter describes this option.
+
+
+Built-in Paxos
+--------------
+
+The WAL safekeepers act as PAXOS Acceptors, and the Processing nodes
+as both Proposers and Learners.
+
+Each WAL safekeeper holds an Epoch value in addition to the VCL and
+the WAL. Each request by the primary to safekeep WAL is accompanied by
+an Epoch value. If a safekeeper receives a request with an Epoch that
+doesn't match its current Accepted Epoch, it must ignore (NACK) it.
+(In different Paxos papers, Epochs are called "terms" or "round
+numbers")
+
+When a node wants to become the primary, it generates a new Epoch
+value that is higher than any previously observed Epoch value, and
+globally unique.
+
+
+Accepted Epoch: 555                  VCL                  LSN
+                                      |                    |
+                                      V                    V
+.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX
+Archived WAL     Completed WAL       In-flight WAL
+
+
+Primary node startup:
+
+1. Contact all WAL safekeepers that you can reach (if you cannot
+   connect to a quorum of them, you can give up immediately). Find the
+   latest Epoch among them.
+
+2. Generate a new globally unique Epoch, greater than the latest Epoch
+   found in the previous step.
+
+3. Send the new Epoch in a Prepare message to a quorum of
+   safekeepers. (PAXOS Prepare message)
+
+4. Each safekeeper responds with a Promise. If a safekeeper has
+   already made a promise with a higher Epoch, it doesn't respond (or
+   responds with a NACK). After making a promise, the safekeeper stops
+   responding to any write requests with an earlier Epoch.
+
+5. Once you have received a majority of promises, you know that the
+   VCL cannot advance on the old Epoch anymore. This effectively kills
+   any old primary server.
+
+6. Find the highest written LSN among the quorum of safekeepers (these
+   can be included in the Promise messages already). This is the new
+   VCL. If a new node starts the election process after this point,
+   it will compute the same or higher VCL.
+
+7. Copy the WAL from the safekeeper with the highest LSN to the other
+   safekeepers in the quorum, using the new Epoch. (PAXOS Accept
+   phase)
+
+8. You can now start generating new WAL starting from the VCL. If
+   another process starts the election process after this point and
+   gains control of a majority of the safekeepers, we will no longer
+   be able to advance the VCL.
+
diff --git a/docs/rfcs/005-zenith_local.md b/docs/rfcs/005-zenith_local.md
new file mode 100644
index 0000000000..7b078e9ec0
--- /dev/null
+++ b/docs/rfcs/005-zenith_local.md
@@ -0,0 +1,103 @@
+# Zenith local
+
+Here I list some objectives to keep in mind when discussing the zenith-local design, and a proposal that brings all the components together. Your comments on both parts are very welcome.
+
+#### Why do we need it?
+- For distribution - this easy-to-use binary will help us to build adoption among developers.
+- For internal use - to test all components together.
+
+In my understanding, we consider it to be just a mock-up version of zenith-cloud.
+> Question: How much should we care about durability and security issues for a local setup?
+
+
+#### Why is it better than a simple local postgres?
+
+- Easy one-line setup. As simple as `cargo install zenith && zenith start`
+
+- Quick and cheap creation of compute nodes over the same storage.
+> Question: How can we describe a use-case for this feature?
+
+- Zenith-local can work with S3 directly.
+
+- Push and pull images (snapshots) to remote S3 to exchange data with other users.
+
+- Quick and cheap snapshot checkouts to switch back and forth in the database history.
+> Question: Do we want it in the very first release? This feature seems quite complicated.
+
+#### Distribution:
+
+Ideally, just one binary that incorporates all the elements we need.
+> Question: Let's discuss the pros and cons of having a separate package with modified PostgreSQL.
+
+#### Components:
+
+- **zenith-CLI** - interface for end-users. Turns commands into REST requests and handles responses to show them in a user-friendly way.
+CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md
+WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli
+
+- **zenith-console** - WEB UI with the same functionality as the CLI.
+>Note: not for the first release.
+
+- **zenith-local** - entrypoint. Service that starts all other components and handles REST API requests. See the REST API proposal below.
+ > Idea: spawn all other components as child processes, so that we could shut down everything by stopping zenith-local.
+
+- **zenith-pageserver** - consists of a storage and a WAL-replaying service (modified PG in the current implementation).
+> Question: Probably, for a local setup we should be able to bypass the page-storage and interact directly with S3 to avoid double caching in shared buffers and the page-server?
+
+WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src
+
+- **zenith-S3** - stores base images of the database and WAL in S3 object storage. Imports and exports images from/to zenith.
+> Question: How should it operate in a local setup? Will we manage it ourselves or ask the user to provide credentials for an existing S3 object storage (e.g. minio)?
+> Question: Do we use it together with the local page store, or are they interchangeable?
+
+WIP code is ???
+
+- **zenith-safekeeper** - receives WAL from postgres, stores it durably, and answers to Postgres that the "sync" succeeded.
+> Question: How should it operate in a local setup? In my understanding it should push WAL directly to S3 (if we use it) or store all data locally (if we use local page storage). The latter option seems meaningless (extra overhead and no gain), but it is still good to test the system.
+
+WIP code is here: https://github.com/libzenith/postgres/tree/main/src/bin/safekeeper
+
+- **zenith-computenode** - bottomless PostgreSQL, ideally upstream, but for a start - our modified version. Users can quickly create and destroy them and work with them as regular postgres databases.
+
+  WIP code is in the main branch and here: https://github.com/libzenith/postgres/commits/compute_node
+
+#### REST API:
+
+Service endpoint: `http://localhost:3000`
+
+Resources:
+- /storages - Where data lives: zenith-pageserver or zenith-s3
+- /pgs - Postgres - zenith-computenode
+- /snapshots - snapshots **TODO**
+
+>Question: Do we want to extend this API to manage zenith components? I.e. start the page-server, manage safekeepers and so on? Or will they be hardcoded to just start once and for all?
+
+Methods and their mapping to CLI:
+
+- /storages - zenith-pageserver or zenith-s3
+
+CLI | REST API
+------------- | -------------
+storage attach -n name --type [native\|s3] --path=[datadir\|URL] | PUT -d { "name": "name", "type": "native", "path": "/tmp" } /storages
+storage detach -n name | DELETE /storages/:storage_name
+storage list | GET /storages
+storage show -n name | GET /storages/:storage_name
+
+
+- /pgs - zenith-computenode
+
+CLI | REST API
+------------- | -------------
+pg create -n name --s storage_name | PUT -d { "name": "name", "storage_name": "storage_name" } /pgs
+pg destroy -n name | DELETE /pgs/:pg_name
+pg start -n name --replica | POST -d {"action": "start", "is_replica":"replica"} /pgs/:pg_name /actions
+pg stop -n name | POST -d {"action": "stop"} /pgs/:pg_name /actions
+pg promote -n name | POST -d {"action": "promote"} /pgs/:pg_name /actions
+pg list | GET /pgs
+pg show -n name | GET /pgs/:pg_name
+
+- /snapshots **TODO**
+
+CLI | REST API
+------------- | -------------
+
diff --git a/docs/rfcs/006-laptop-cli-v2-CLI.md b/docs/rfcs/006-laptop-cli-v2-CLI.md
new file mode 100644
index 0000000000..a04536922a
--- /dev/null
+++ b/docs/rfcs/006-laptop-cli-v2-CLI.md
@@ -0,0 +1,64 @@
+The Zenith CLI allows you to operate database clusters (catalog clusters) and their commit history locally and in the cloud. Since ANSI calls them catalog clusters, and cluster is a loaded term in modern infrastructure, we will call it a "catalog".
+
+# CLI v2 (after chatting with Carl)
+
+Zenith introduces the notion of a repository.
+
+```bash
+zenith init
+zenith clone zenith://zenith.tech/piedpiper/northwind -- clones a repo to the northwind directory
+```
+
+Once you have a cluster catalog you can explore it
+
+```bash
+zenith log -- returns a list of commits
+zenith status -- reports whether there are changes in the catalog that can be committed
+zenith commit -- commits the changes and generates a new commit hash
+zenith branch experimental -- creates a branch called experimental based on a given commit hash
+```
+
+To make changes in the catalog you need to run compute nodes
+
+```bash
+-- here is how you start a compute node
+zenith start /home/pipedpiper/northwind:main -- starts a compute instance
+zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud
+-- you can start a compute node against any hash or branch
+zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on a different port)
+-- you can start a compute node against any hash or branch
+zenith start /home/pipedpiper/northwind: --port 8009 -- start another compute instance (on a different port)
+
+-- After running some DML you can run
+-- zenith status and see how there are two WAL streams one on top of
+-- the main branch
+zenith status
+-- and another on top of the experimental branch
+zenith status -b experimental
+
+-- you can commit each branch separately
+zenith commit main
+-- or
+zenith commit -c /home/pipedpiper/northwind:experimental
+```
+
+Starting compute instances against cloud environments
+
+```bash
+-- you can start a compute instance against the cloud environment
+-- in this case all of the changes will be streamed into the cloud
+zenith start https://zenith.tech/pipedpiper/northwind:main
+zenith start https://zenith.tech/pipedpiper/northwind:main
+zenith status -c https://zenith.tech/pipedpiper/northwind:main
+zenith commit -c https://zenith.tech/pipedpiper/northwind:main
+zenith branch -c https://zenith.tech/pipedpiper/northwind: experimental
+```
+
+Pushing data into the cloud
+
+```bash
+-- pull all the commits from the cloud
+zenith pull
+-- push all the commits to the cloud
+zenith push
+```
diff --git a/docs/rfcs/006-laptop-cli-v2-repository-structure.md b/docs/rfcs/006-laptop-cli-v2-repository-structure.md
new file mode 100644
index 0000000000..ee4e432182
--- /dev/null
+++ b/docs/rfcs/006-laptop-cli-v2-repository-structure.md
@@ -0,0 +1,140 @@
+# Repository format
+
+A Zenith repository is similar to a traditional PostgreSQL backup
+archive, like a WAL-G bucket or a pgbarman backup catalogue. It holds
+multiple versions of a PostgreSQL database cluster.
+
+The distinguishing feature is that you can launch a Zenith Postgres
+server directly against a branch in the repository, without having to
+"restore" it first. Also, Zenith manages the storage automatically;
+there is no separation between full and incremental backups, nor a WAL
+archive. Zenith relies heavily on the WAL, and uses concepts similar
+to incremental backups and WAL archiving internally, but it is hidden
+from the user.
+
+## Directory structure, version 1
+
+This first version is pretty straightforward but not very
+efficient. Just something to get us started.
+
+The repository directory looks like this:
+
+    .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/wal/
+    .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/snapshots//
+    .zenith/timelines/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c/history
+
+    .zenith/refs/branches/mybranch
+    .zenith/refs/tags/foo
+    .zenith/refs/tags/bar
+
+    .zenith/datadirs/
+
+### Timelines
+
+A timeline is similar to PostgreSQL's timeline, but is identified by a
+UUID instead of a 32-bit timeline Id. For user convenience, it can be
+given a name that refers to the UUID (called a branch).
+
+All WAL is generated on a timeline. You can launch a read-only node
+against a tag or arbitrary LSN on a timeline, but in order to write,
+you need to create a timeline.
+
+Each timeline is stored in a directory under .zenith/timelines. It
+consists of a WAL archive, containing all the WAL in the standard
+PostgreSQL format, under the wal/ subdirectory.
+
+The 'snapshots/' subdirectory contains "base backups" of the data
+directory at different LSNs. Each snapshot is simply a copy of the
+Postgres data directory.
+
+When a new timeline is forked from a previous timeline, the ancestor
+timeline's UUID is stored in the 'history' file.
+
+### Refs
+
+There are two kinds of named objects in the repository: branches and
+tags. A branch is a human-friendly name for a timeline UUID, and a
+tag is a human-friendly name for a specific LSN on a timeline
+(timeline UUID + LSN). Like in git, these are just for user
+convenience; you can also use timeline UUIDs and LSNs directly.
+
+Refs do have one additional purpose though: naming a timeline or LSN
+prevents it from being automatically garbage collected.
+
+The refs directory contains a small text file for each tag/branch. It
+contains the UUID of the timeline (and the LSN, for tags).
+
+### Datadirs
+
+.zenith/datadirs contains PostgreSQL data directories. You can launch
+a Postgres instance on one of them with:
+
+```
+    postgres -D .zenith/datadirs/4543be3daeab2ed4e58a285cbb8dd1fce6970f8c
+```
+
+All the actual data is kept in the timeline directories, under
+.zenith/timelines. The data directories are only needed for active
+PostgreSQL instances. After an instance is stopped, the data directory
+can be safely removed. "zenith start" will recreate it quickly from
+the data in .zenith/timelines, if it's missing.
+
+## Version 2
+
+The format described above isn't very different from a traditional
+daily base backup + WAL archive configuration. The main difference is
+the nicer naming of branches and tags.
+
+That's not very efficient. For performance, we need something like
+incremental backups that don't require making a full copy of all
+data. So only store modified files or pages. And instead of having to
+replay all WAL from the last snapshot, "slice" the WAL into
+per-relation WAL files and only recover what's needed when a table is
+accessed.
+
+In version 2, the file format in the "snapshots" subdirectory gets
+more advanced. The exact format is TODO. But it should support:
+- storing WAL records of individual relations/pages
+- storing a delta from an older snapshot
+- compression
+
+
+## Operations
+
+### Garbage collection
+
+When you run "zenith gc", old timelines that are no longer needed are
+removed. That involves collecting the list of "unreachable" objects,
+starting from the named branches and tags.
+
+Also, if enough WAL has been generated on a timeline since the last
+snapshot, a new snapshot or delta is created.
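+
+A sketch of the mark phase of that collection (hypothetical types; the
+real format may differ): start from the timeline UUIDs named by the
+refs and follow each timeline's 'history' link to its ancestor,
+marking everything on the way as reachable. Whatever is left unmarked
+can be removed.
+
+```rust
+use std::collections::{HashMap, HashSet};
+
+/// Illustrative GC mark phase: refs name timeline UUIDs, and each
+/// timeline's `history` file may name its ancestor timeline.
+fn reachable_timelines(
+    refs: &[String],                     // timelines named by branches/tags
+    ancestors: &HashMap<String, String>, // timeline -> ancestor (from `history`)
+) -> HashSet<String> {
+    let mut reachable = HashSet::new();
+    for mut tli in refs.iter().cloned() {
+        // Follow the ancestry chain; everything on it must be kept.
+        while reachable.insert(tli.clone()) {
+            match ancestors.get(&tli) {
+                Some(parent) => tli = parent.clone(),
+                None => break,
+            }
+        }
+    }
+    reachable
+}
+```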
+
+### zenith push/pull
+
+Compare the tags and branches on both servers, and copy missing ones.
+For each branch, compare the timeline it points to in both servers. If
+one is behind the other, copy the missing parts.
+
+FIXME: how do you prevent confusion if you have two clones of the same
+repository, launch an instance on the same branch in both clones, and
+later try to push/pull between them? Perhaps create a new timeline
+every time you start up an instance? Then you would detect that the
+timelines have diverged. That would match with the "epoch" concept
+that we have in the WAL safekeeper.
+
+### zenith checkout/commit
+
+In this format, there is no concept of a "working tree", and hence no
+concept of checking out or committing. All modifications are done on
+a branch or a timeline. As soon as you launch a server, the changes are
+appended to the timeline.
+
+You can easily fork off a temporary timeline to emulate a "working tree".
+You can later remove it and have it garbage collected, or, to "commit",
+re-point the branch to the new timeline.
+
+If we want to have a worktree and "zenith checkout/commit" concept, we can
+emulate that with a temporary timeline. Create the temporary timeline at
+"zenith checkout", and have "zenith commit" modify the branch to point to
+the new timeline.
diff --git a/docs/rfcs/007-serverless-on-laptop.md b/docs/rfcs/007-serverless-on-laptop.md
new file mode 100644
index 0000000000..e6355f4a03
--- /dev/null
+++ b/docs/rfcs/007-serverless-on-laptop.md
@@ -0,0 +1,93 @@
+How it works now
+----------------
+
+1. Create repository, start page server on it
+
+```
+$ zenith init
+...
+created main branch
+new zenith repository was created in .zenith
+
+$ zenith pageserver start
+Starting pageserver at '127.0.0.1:64000' in .zenith
+Page server started
+```
+
+2. Create a branch, and start a Postgres instance on it
+
+```
+$ zenith branch heikki main
+branching at end of WAL: 0/15ECF68
+
+$ zenith pg create heikki
+Initializing Postgres on timeline 76cf9279915be7797095241638e64644...
+Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/pg1 port=55432
+
+$ zenith pg start pg1
+Starting postgres node at 'host=127.0.0.1 port=55432 user=heikki'
+waiting for server to start.... done
+server started
+```
+
+
+3. Connect to it and run queries
+
+```
+$ psql "dbname=postgres port=55432"
+psql (14devel)
+Type "help" for help.
+
+postgres=#
+```
+
+
+Proposal: Serverless on your Laptop
+-----------------------------------
+
+We've been talking about doing the "pg create" step automatically at
+"pg start", to eliminate that step. What if we go further, go
+serverless on your laptop, so that the workflow becomes just:
+
+1. Create repository, start page server on it (same as before)
+
+```
+$ zenith init
+...
+created main branch
+new zenith repository was created in .zenith
+
+$ zenith pageserver start
+Starting pageserver at '127.0.0.1:64000' in .zenith
+Page server started
+```
+
+2. Create branch
+
+```
+$ zenith branch heikki main
+branching at end of WAL: 0/15ECF68
+```
+
+3. Connect to it:
+
+```
+$ psql "dbname=postgres port=5432 branch=heikki"
+psql (14devel)
+Type "help" for help.
+
+postgres=#
+```
+
+
+The trick behind the scenes is that when you launch the page server,
+it starts to listen on port 5432. When you connect to it with psql, it
+looks at the 'branch' parameter that you passed in the connection
+string. 
It automatically performs the "pg create" and "pg start" steps
+for that branch, and then forwards the connection to the Postgres
+instance that it launched. After you disconnect, if there are no more
+active connections to the server running on the branch, it can
+automatically shut it down again.
+
+This is how serverless would work in the cloud. We can do it on your
+laptop, too.
diff --git a/docs/rfcs/008-push-pull.md b/docs/rfcs/008-push-pull.md
new file mode 100644
index 0000000000..272628e1ce
--- /dev/null
+++ b/docs/rfcs/008-push-pull.md
@@ -0,0 +1,66 @@
+# Push and pull between pageservers
+
+Here is a proposal about implementing push/pull mechanics between pageservers. We also want to be able to push/pull to S3, but since that would depend on the exact storage format, we don't touch it in this proposal.
+
+## Origin management
+
+The origin represents connection info for some remote pageserver. Let's use the same commands here as git does, except with an explicit list subcommand (git uses `origin -v` for that).
+
+```
+zenith origin add <origin name> <connection uri>
+zenith origin list
+zenith origin remove <origin name>
+```
+
+The connection URI is a string of the form `postgresql://user:pass@hostname:port` (https://www.postgresql.org/docs/13/libpq-connect.html#id-1.7.3.8.3.6). We can start with libpq password auth and later add support for client certs, require ssh as transport, or invent some other kind of transport.
+
+Behind the scenes, these commands may update a toml file inside the .zenith directory.
+
+## Push
+
+### Pushing a branch
+
+```
+zenith push mybranch cloudserver # push to eponymous branch in cloudserver
+zenith push mybranch cloudserver:otherbranch # push to a different branch in cloudserver
+```
+
+The exact mechanics would be slightly different in the following situations:
+
+1) Destination branch does not exist.
+
+   That is the simplest scenario. We can just create an empty branch (or timeline, in internal terminology) and transfer all the pages/records that we have in our timeline. Right now each timeline is quite independent of other timelines, so I suggest skipping any checks that there is a common ancestor and just filling it with data. Later, when CoW timelines land in the pageserver, we may add that check and decide whether this timeline belongs to this pageserver's repository or not [*].
+
+   The exact mechanics may be the following:
+
+   * CLI asks the local pageserver to perform the push and hands over the connection uri: `perform_push <connection uri>`.
+   * The local pageserver connects to the remote pageserver and runs `branch_push <branch name>`.
+     The handler for `branch_push` would create the destination timeline and switch the connection to copy-both mode.
+   * The sending pageserver may then start an iterator on that timeline and send all the records as copy messages.
+
+2) Destination branch exists and latest_valid_lsn is less than ours.
+
+   In this case, we need to send the missing records. To do that, we need to find all pages that were changed since that remote LSN. Right now we don't have any tracking mechanism for that, so let's just iterate over all records and send the ones that are newer than the remote LSN. Later we probably should add a sparse bitmap that would track changed pages, to avoid a full scan.
+
+3) Destination branch exists and latest_valid_lsn is bigger than ours.
+
+   In this case, we can't push to that branch. We can only pull.
+
+### Pulling a branch
+
+Here we need to handle the same three cases, but also keep in mind that the local pageserver can be behind NAT, so we can't trivially re-use pushing by asking the remote to 'perform_push' to our address. 
So we would need a new set of commands:
+
+* CLI calls `perform_pull <connection uri>` on the local pageserver.
+* The local pageserver calls `branch_pull <branch name>` on the remote pageserver.
+* The remote pageserver sends records in our direction.
+
+But despite the different set of commands, the code that iterates over the records and the receiving code that inserts those records can be the same for both pull and push.
+
+
+
+[*] It looks to me that there are two different possible approaches to handling unrelated timelines:
+
+1) Allow storing unrelated timelines in one repo. Some timelines may have parents and some may not.
+2) Transparently create and manage several repositories in one pageserver.
+
+But that is the topic for a separate RFC/discussion.
diff --git a/docs/rfcs/009-snapshot-first-storage-cli.md b/docs/rfcs/009-snapshot-first-storage-cli.md
new file mode 100644
index 0000000000..3f5386c165
--- /dev/null
+++ b/docs/rfcs/009-snapshot-first-storage-cli.md
@@ -0,0 +1,56 @@
+While working on the export/import commands, I understood that they fit really well into the "snapshot-first design".
+
+We may think about backups as snapshots in a different format (i.e. plain pgdata format, basebackup tar format, WAL-G format (if they want to support it), and so on). They use the same storage API; the only difference is the code that packs/unpacks files.
+
+Even if zenith aims to maintain durability using its own snapshots, backups will be useful for uploading data from postgres to zenith.
+
+So here is an attempt to design a consistent CLI for different usage scenarios:
+
+#### 1. Start empty pageserver.
+That is what we have now.
+Init an empty pageserver using `initdb` in a temporary directory.
+
+The `--storage_dest=FILE_PREFIX | S3_PREFIX |...` option defines the object storage type; all other parameters are passed via env variables. Inspired by WAL-G-style naming: https://wal-g.readthedocs.io/STORAGES/.
+
+Save `storage_dest` and the other parameters in the config.
+Push snapshots to `storage_dest` in the background.
+
+```
+zenith init --storage_dest=S3_PREFIX
+zenith start
+```
+
+#### 2. Restart pageserver (manually or crash-recovery).
+Take `storage_dest` from the pageserver config, start the pageserver from the latest snapshot in `storage_dest`.
+Push snapshots to `storage_dest` in the background.
+
+```
+zenith start
+```
+
+#### 3. Import.
+Start the pageserver from an existing snapshot.
+The path to the snapshot is provided via `--snapshot_path=FILE_PREFIX | S3_PREFIX | ...`
+Do not save `snapshot_path` and `snapshot_format` in the config, as this is a one-time operation.
+Save the `storage_dest` parameters in the config.
+Push snapshots to `storage_dest` in the background.
+```
+// I.e. we want to start zenith on top of an existing $PGDATA and use s3 as persistent storage.
+zenith init --snapshot_path=FILE_PREFIX --snapshot_format=pgdata --storage_dest=S3_PREFIX
+zenith start
+```
+How do we pass the credentials needed for `snapshot_path`?
+
+#### 4. Export.
+Manually push a snapshot to `snapshot_path`, which differs from `storage_dest`.
+Optionally set `snapshot_format`, which can be plain pgdata format or zenith format.
+```
+zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata
+```
+
+#### Notes and questions
+- walkeeper s3_offload should use the same (or similar) syntax for storage. How do we set it in the UI?
+- Why do we need `zenith init` as a separate command? Can't we init everything at first start?
+- We can think of better names for all options.
+- Export to plain postgres format will be useless if we are not 100% compatible at the page level.
+I can recall at least one such difference - the PD_WAL_LOGGED flag in pages.
\ No newline at end of file
diff --git a/docs/rfcs/009-snapshot-first-storage-pitr.md b/docs/rfcs/009-snapshot-first-storage-pitr.md
new file mode 100644
index 0000000000..801613e2c9
--- /dev/null
+++ b/docs/rfcs/009-snapshot-first-storage-pitr.md
@@ -0,0 +1,227 @@
+# Preface
+
+GetPage@LSN can be called with older LSNs, and the page server needs
+to be able to reconstruct older page versions. That's needed for
+having read-only replicas that lag behind the primary, or that are
+"anchored" at an older LSN, and internally in the page server when you
+branch at an older point in time. How do you do that?
+
+For now, I'm not considering incremental snapshots at all. I don't
+think that changes things. So whenever you create a snapshot or a
+snapshot file, it contains an image of all the pages; there is no need
+to look at an older snapshot file.
+
+Also, I'm imagining that this works on a per-relation basis, so that
+each snapshot file contains data for one relation. A "relation" is a
+fuzzy concept - it could actually be one 1 GB relation segment. Or it
+could include all the different "forks" of a relation, or you could
+treat each fork as a separate relation for storage purposes. And once
+the "non-relational" work is finished, a "relation" could
+actually mean some other versioned object kept in the PostgreSQL data
+directory. Let's ignore that for now.
+
+# Eric's RFC:
+
+Every now and then, you create a "snapshot". It means that you create
+a new snapshot file for each relation that was modified after the last
+snapshot, and write out the contents of the relation as it is/was at the
+snapshot LSN. The write-ahead log is stored separately in S3 by the WAL
+safekeeping service, in the original PostgreSQL WAL file format.
+
+    SNAPSHOT @100        WAL
+        .                 |
+        .                 |
+        .                 |
+        .                 |
+    SNAPSHOT @200         |
+        .                 |
+        .                 |
+        .                 |
+        .                 |
+    SNAPSHOT @300         |
+        .                 |
+        .                 V
+                    IN-MEMORY @400
+
+If a GetPage@LSN request comes from the primary, you return the latest
+page from the in-memory layer. If there is no trace of the page in
+memory, it means that it hasn't been modified since the last snapshot,
+so you return the page from the latest snapshot, at LSN 300 in the
+above example.
+
+PITR is implemented using the original WAL files:
+
+If a GetPage@LSN request comes from a read replica with LSN 250, you
+read the image of the page from the snapshot at LSN 200, and you also
+scan the WAL between 200 and 250, and apply all WAL records for the
+requested page, to reconstruct it at LSN 250.
+
+Scanning the WAL naively for every GetPage@LSN request would be
+expensive, so in practice you'd construct, once, an in-memory data
+structure over all the WAL between 200 and 250 that allows quickly
+looking up the records for a given page.
+
+## Problems/questions
+
+I think you'll need to store the list of snapshot LSNs on each
+timeline somewhere.
+
+If the latest snapshot of a relation is at LSN 100, and you request a
+page at LSN 1000000, how do you know if there are some modifications
+to it between 100 and 1000000 that you need to replay? You can scan
+all the WAL between 100 and 1000000, but that would be expensive.
+
+You can skip that if you know that a snapshot was taken e.g. at LSN
+999900. Then you know that the fact that there is no snapshot file at
+999900 means that the relation hasn't been modified between
+100-999900. Then you only need to scan the WAL between 999900 and
+1000000. 
However, there is no trace of a snapshot happening at LSN
+999900 in the snapshot file for this relation, so you need to get
+that information from somewhere else.
+
+Where do you get that information from? Perhaps you can scan all the
+other relations, and if you see a snapshot file for *any* relation at
+LSN 999900, you know that if there were modifications to this
+relation, there would be a newer snapshot file for it, too. In other
+words, the list of snapshots that have been taken can be constructed
+by scanning all relations and computing the union of all snapshot LSNs
+that you see for any relation. But that's expensive, so you should at
+least keep that in memory after computing it once. Also, if you rely
+on that, it's not possible to have snapshots at different intervals
+for different files. That seems limiting.
+
+Another option is to explicitly store a list of snapshot LSNs in a
+separate metadata file.
+
+
+# Current implementation in the 'layered_repo' branch:
+
+We store snapshot files like in the RFC, but each snapshot file also
+contains all the WAL in the range of LSNs, so that you don't need to
+fetch the WAL separately from S3. So you have "layers" like this:
+
+    SNAPSHOT+WAL 100-200
+           |
+           |
+           |
+           |
+    SNAPSHOT+WAL 200-300
+           |
+           |
+           |
+           |
+    IN-MEMORY 300-
+
+Each "snapshot+WAL" is a file that contains a snapshot - i.e. a full
+copy of each page in the relation at the *start* LSN. In addition to
+that, it contains all the WAL applicable to the relation from the
+start LSN to the end LSN. With that, you can reconstruct any page
+version in the range that the file covers.
+
+
+## Problems/questions
+
+I can see one potential performance issue here, compared to the RFC.
+Let's focus on a single relation for now. Imagine that you start from
+an empty relation, and you receive WAL from 100 to 200, containing
+a bunch of inserts and updates to the relation. You now have all that
+WAL in memory:
+
+    memory: WAL from 100-200
+
+We decide that it's time to materialize that to a snapshot file on
+disk. We materialize a full image of the relation as it was at LSN 100
+to the snapshot file, and include all of the WAL. Since the relation
+was initially empty, the "image" at the beginning of the range is empty
+too.
+
+So now you have one file on disk:
+
+    SNAPSHOT+WAL 100-200
+
+It contains a full image of the relation at LSN 100 and all WAL
+between 100-200. (It's actually stored as a serialized BTreeMap of
+page versions, with the page images and WAL records all stored
+together in the same BTreeMap. But for this story, that's not
+important.)
+
+We now receive more WAL updating the relation, up to LSN 300. We
+decide it's time to materialize a new snapshot file, and we now have
+two files:
+
+    SNAPSHOT+WAL 100-200
+    SNAPSHOT+WAL 200-300
+
+Note that the latest "full snapshot" that we store on disk always lags
+behind by one snapshot cycle. The first file contains a full image of
+the relation at LSN 100, the second at LSN 200. When we have received
+WAL up to LSN 300, we write a materialized image at LSN 200. That
+seems a bit silly. In the design per your RFC, you would write
+snapshots at LSNs 200 and 300 instead. That seems better.
+
+
+
+# Third option (not implemented yet)
+
+Store snapshot files like in the RFC, but also store per-relation
+WAL files that contain WAL in a range of LSNs for that relation.
+
+    SNAPSHOT @100        WAL 100-200
+        .                 |
+        .                 |
+        .                 |
+        .                 |
+    SNAPSHOT @200        WAL 200-300
+        .                 |
+        .                 |
+        .                 |
+        .                 |
+    SNAPSHOT @300
+        .
+        . 
+                     IN-MEMORY 300-
+
+
+This could be the best of both worlds. The snapshot files would be
+independent of the PostgreSQL WAL format. When it's time to write
+snapshot file @300, you write a full image of the relation at LSN 300,
+and you write the WAL that you had accumulated between 200 and 300 to
+a separate file. That way, you don't "lag behind" for one snapshot
+cycle like in the current implementation. But you still have the WAL
+for a particular relation readily available alongside the snapshot
+files, and you don't need to track what snapshot LSNs exist
+separately.
+
+(If we wanted to minimize the number of files, you could include the
+snapshot @300 and the WAL between 200 and 300 in the same file, but I
+feel it's probably better to keep them separate.)
+
+
+
+# Further thoughts
+
+There's no fundamental reason why the LSNs of the snapshot files and the
+ranges of the WAL files would need to line up. So this would be possible
+too:
+
+    SNAPSHOT @100        WAL 100-150
+        .                 |
+        .                 |
+        .                WAL 150-250
+        .                 |
+    SNAPSHOT @200         |
+        .                 |
+        .                WAL 250-400
+        .                 |
+        .                 |
+    SNAPSHOT @300         |
+        .                 |
+        .                 |
+    IN-MEMORY 300-
+
+I'm not sure what the benefit of this would be. You could materialize
+additional snapshot files in the middle of a range covered by a WAL
+file, maybe? Might be useful to speed up access when you create a new
+branch in the middle of an LSN range, or if there's some other reason
+to believe that a particular LSN is "interesting" and there will be
+a lot of requests using it.
diff --git a/docs/rfcs/009-snapshot-first-storage.md b/docs/rfcs/009-snapshot-first-storage.md
new file mode 100644
index 0000000000..aeef54898a
--- /dev/null
+++ b/docs/rfcs/009-snapshot-first-storage.md
@@ -0,0 +1,148 @@
+# Snapshot-first storage architecture
+
+Goals:
+- Long-term storage of database pages.
+- Easy snapshots; simple snapshot and branch management.
+- Allow cloud-based snapshot/branch management.
+- Allow cloud-centric branching; decouple branch state from running pageserver.
+- Allow customer ownership of data via s3 permissions.
+- Provide same or better performance for typical workloads, vs plain postgres.
+
+Non-goals:
+- Service database reads from s3 (reads should be serviced from the pageserver cache).
+- Keep every version of every page / Implement point-in-time recovery (possibly a future paid feature, based on WAL replay from an existing snapshot).
+
+## Principle of operation
+
+The database “lives in s3”. This means that all of the long-term page storage is in s3, and the “live database”-- the version that lives in the pageserver-- is a set of “dirty pages” that haven’t yet been written back to s3.
+
+In practice, this is mostly similar to storing frequent snapshots to s3 of a database that lives primarily elsewhere.
+
+The main difference is that s3 is authoritative about which branches exist; pageservers consume branches, snapshots, and related metadata by reading them from s3. This allows cloud-based management of branches and snapshots, regardless of whether a pageserver is running or not.
+
+It’s expected that a pageserver should keep a copy of all pages, to shield users from s3 latency. A cheap/slow pageserver that falls back to s3 for some reads would be possible, but doesn’t seem very useful right now.
+
+Because s3 keeps all history, and the safekeeper(s) preserve any WAL records needed to reconstruct the most recent changes, the pageserver can store dirty pages in RAM or using non-durable local storage; this should allow very good write performance, since there is no need for fsync or journaling.
+
+Objects in s3 are immutable snapshots, never to be modified once written (only deleted).
+
+Objects in s3 are files, each containing a set of pages for some branch/relation/segment as of a specific time (LSN). A snapshot could be complete (meaning it has a copy of every page), or it could be incremental (containing only the pages that were modified since the previous snapshot). It’s expected that most snapshots are incremental, to keep storage costs low.
+
+It’s expected that the pageserver would upload new snapshot objects frequently, e.g. somewhere between every 30 seconds and every 15 minutes, depending on the cost/performance balance.
+
+No-longer-needed snapshots can be “squashed”-- meaning snapshot N and snapshot N+1 can be read by some cloud agent software, which writes out a new object containing the combined set of pages (keeping only the newest version of each page) and then deletes the original snapshots.
+
+A pageserver only needs to store the set of pages needed to satisfy operations in flight: if a snapshot is still being written, the pageserver needs to hold historical pages so that snapshot captures a consistent moment in time (similar to what is needed to satisfy a slow replica).
+
+WAL records can be discarded once a snapshot has been stored to s3. (Unless we want to keep them longer as part of a point-in-time recovery feature.)
+
+## Pageserver operation
+
+To start a pageserver from a stored snapshot, the pageserver downloads a set of snapshots sufficient to start handling requests. We assume this includes the latest copy of every page, though it might be possible to start handling requests early, and retrieve pages for the first time only when needed.
+
+To halt a pageserver, one final snapshot should be written containing all pending WAL updates; then the pageserver and safekeepers can shut down.
+
+It’s assumed there is some cloud management service that ensures only one pageserver is active and servicing writes to a given branch.
+
+The pageserver needs to be able to track whether a given page has been modified since the last snapshot, and should be able to produce the set of dirty pages efficiently to create a new snapshot.
+
+The pageserver need only store pages that are “reachable” from a particular LSN. For example, a page may be written four times, at LSN 100, 200, 300, and 400. If no snapshot is being created when LSN 200 is written, the page at LSN 100 can be discarded. If a snapshot is triggered when the pageserver is at LSN 299, the pageserver must preserve the page from LSN 200 until that snapshot is complete. As before, the page at LSN 300 can be discarded when the LSN 400 page is written (regardless of whether the LSN 200 snapshot has completed). A sketch of this rule follows below.
+
+If the pageserver is servicing multiple branches, those branches may contain common history. While it would be possible to serve branches with zero knowledge of their common history, a pageserver could save a lot of space using an awareness of branch history to share the common set of pages. Computing the “liveness” of a historical page may be tricky in the face of multiple branches.
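+
+A minimal sketch of that page-retention rule (hypothetical code, not the
+pageserver's actual bookkeeping): a page version is discardable once a
+newer version exists and no in-flight snapshot LSN falls in the range
+the version covers.
+
+```rust
+/// `versions`: sorted LSNs at which this page was written.
+/// `pending_snapshots`: LSNs of snapshots still being written.
+/// Returns the versions that may be dropped; the newest is always kept.
+fn discardable(versions: &[u64], pending_snapshots: &[u64]) -> Vec<u64> {
+    let mut result = Vec::new();
+    for w in versions.windows(2) {
+        let (lsn, next_lsn) = (w[0], w[1]);
+        // The version at `lsn` is authoritative for [lsn, next_lsn);
+        // keep it while a pending snapshot still needs that range.
+        let needed = pending_snapshots.iter().any(|&s| lsn <= s && s < next_lsn);
+        if !needed {
+            result.push(lsn);
+        }
+    }
+    result
+}
+
+fn main() {
+    // Page written at LSNs 100, 200, 300, 400; a snapshot at LSN 299 is
+    // still in flight: 100 and 300 may go, 200 must stay until it's done.
+    assert_eq!(discardable(&[100, 200, 300, 400], &[299]), vec![100, 300]);
+}
+```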
+
+The pageserver may store dirty pages to memory or to local block storage; any local block storage format is only temporary “overflow” storage, and is not expected to be readable by future software versions.
+
+The pageserver may store clean pages (those that are captured in a snapshot) any way it likes: in memory, in a local filesystem (possibly keeping a local copy of the snapshot file), or using some custom storage format. Reading pages from s3 would be functional, but is expected to be prohibitively slow.
+
+The mechanism for recovery after a pageserver failure is WAL redo. If we find that too slow in some situations (e.g. a write-heavy workload causes long startup), we can write more frequent snapshots to keep the number of outstanding WAL records low. If that’s still not good enough, we could look at other options (e.g. a redundant pageserver or an EBS page journal).
+
+A read-only pageserver is possible; such a pageserver could be a read-only cache of a specific snapshot, or could auto-update to the latest snapshot on some branch. Either way, no safekeeper is required. Multiple read-only pageservers could exist for a single branch or snapshot.
+
+## Cloud snapshot manager operation
+
+Cloud software may wish to do the following operations (commanded by a user, or based on some pre-programmed policy or other cloud agent):
+- Create/delete/clone/rename a database
+- Create a new branch (possibly from a historical snapshot)
+- Start/stop the pageserver/safekeeper on a branch
+- List databases/branches/snapshots that are visible to this user account
+
+Some metadata operations (e.g. list branches/snapshots of a particular db) could be performed by scanning the contents of a bucket and inspecting the file headers of each snapshot object. This might not be fast enough; it might be necessary to build a metadata service that can respond more quickly to some queries.
+
+This is especially true if there are public databases: there may be many thousands of buckets that are public, and scanning all of them is not a practical strategy for answering metadata queries.
+
+## Snapshot names, deletion and concurrency
+
+There may be race conditions between operations-- in particular, a “squash” operation may replace two snapshot objects (A, B) with some combined object (C). Since C is logically equivalent to B, anything that attempts to access B should be able to seamlessly switch over to C. It’s assumed that a concurrent delete won’t disrupt a read in flight, but it may be possible for some process to read B’s header, and then discover on the next operation that B is gone.
+
+For this reason, any read should fall back to a recovery procedure (list objects; search the list for an equivalent object) if the initial read fails. This requires a predictable naming scheme, e.g. `XXXX_YYYY_ZZZZ_DDDD`, where `XXXX` is the branch unique id, and `YYYY` and `ZZZZ` are the starting/ending LSN values. `DDDD` is a timestamp indicating when the object was created; this is used to disambiguate a series of empty snapshots, or to help a snapshot policy engine understand which snapshots should be kept or discarded.
+
+## Branching
+
+A user may request a new branch from the cloud user interface. There is a sequence of things that needs to happen:
+- If the branch is supposed to be based on the latest contents, the pageserver should perform an immediate snapshot. This is the parent snapshot for the new branch.
+- Cloud software should create the new branch, by generating a new (random) unique branch identifier, and creating a placeholder snapshot object.
+  - The placeholder object is an empty snapshot containing only metadata (which anchors it to the right parent history) and no pages.
+  - The placeholder can be discarded when the first snapshot (containing data) is completed. Discarding is equivalent to squashing, when the snapshot contains no data.
+- If the branch needs to be started immediately, a pageserver should be notified that it needs to start servicing the branch. This may not be the same pageserver that services the parent branch, though the common history may make it the best choice.
+
+Some of these steps could be combined into the pageserver, but that would not be possible in all cases (e.g. if no pageserver is currently running, or if the branch is based on an older snapshot, or if a different pageserver will be serving the new branch). Regardless of which software drives the process, the result should look the same.
+
+## Long-term file format
+
+Snapshot files (and any other object stored in s3) must be readable by future software versions.
+
+It should be possible to build multiple tools (in addition to the pageserver) that can read and write this file format-- for example, to allow cloud snapshot management.
+
+Files should contain the following metadata, in addition to the set of pages:
+- The version of the file format.
+- A unique identifier for this branch (should be worldwide-unique and unchanging).
+- Optionally, any human-readable names assigned to this branch (for management UI/debugging/logging).
+- For incremental snapshots, the identifier of the predecessor snapshot. For new branches, this will be the parent snapshot (the point at which history diverges).
+- The location of the predecessor branch snapshot, if different from this branch’s location.
+- The LSN range `(parent, latest]` for this snapshot. For complete snapshots, the parent LSN can be 0.
+- The UTC timestamp of the snapshot creation (which may be different from the time of its highest LSN, if the database is idle).
+- A SHA2 checksum over the entire file (excluding the checksum itself), to preserve file integrity.
+
+A file may contain no pages, and an empty LSN range (probably `(latest, latest]`?), which serves as a placeholder for either a newly-created branch, or a snapshot of an idle database.
+
+Any human-readable names stored in the file may fall out of date if database/branch renames are allowed; there may need to be a cloud metadata service to query (current name -> unique identifier). We may choose instead to not store human-readable names in the database, or to treat them as debugging information only.
+
+## S3 semantics, and other kinds of storage
+
+For development and testing, it may be easier to use other kinds of storage in place of s3. For example, a directory full of files can substitute for an s3 bucket with multiple objects. This mode is expected to match the s3 semantics (e.g. don’t edit existing files or use symlinks). Unit tests may omit files entirely and use an in-memory mock bucket.
+
+Some users may want to use a local or network filesystem in place of s3. This isn’t prohibited, but it’s not a priority, either.
+
+Alternate implementations of s3 should be supported, including Google Cloud Storage.
+
+Azure Blob Storage should be supported. We assume (without evidence) that it’s semantically equivalent to s3 for this purpose.
+
+The properties of s3 that we depend on are:
+- list objects
+- streaming read of an entire object
+- read a byte range from an object
+- streaming write of a new object (may use multipart upload for better reliability)
+- delete an object (which should not disrupt an already-started read)
+
+Uploaded files, restored backups, or s3 buckets controlled by users could contain malicious content. We should always validate that objects contain the content they’re supposed to. Incorrect, corrupt, or malicious-looking contents should cause software (cloud tools, pageserver) to fail gracefully.
+
+## Notes
+
+Possible simplifications, for a first draft implementation:
+- Assume that dirty pages fit in pageserver RAM. Can use kernel virtual memory to page out to disk if needed. Can improve this later.
+- Don’t worry about the details of the squashing process yet.
+- Don’t implement a cloud metadata service; try to make everything work using basic s3 list-objects and reads.
+- Don’t implement rename, delete at first.
+- Don’t implement public/private, just use s3 permissions.
+- Don’t worry about sharing history yet-- each user has their own bucket and a full copy of all data.
+- Don’t worry about history that spans multiple buckets.
+- Don’t worry about s3 regions.
+- Don’t support user-writeable s3 buckets; users get only read-only access at most.
+
+Open questions:
+- How important is point-in-time recovery? When should we add this? How should it work?
+- Should snapshot files use compression?
+- Should we use snapshots for async replication? A spare pageserver could stay mostly warmed up by consuming snapshots as they’re created.
+- Should manual snapshots, or snapshots triggered by branch creation, be named differently from snapshots that are triggered by a snapshot policy?
+- When a new branch is created, should it always be served by the same pageserver that owns its parent branch? When should we start a new pageserver?
+- How can a pageserver software upgrade be done with minimal downtime?
diff --git a/docs/rfcs/010-storage_details.md b/docs/rfcs/010-storage_details.md
new file mode 100644
index 0000000000..8429a2d9e3
--- /dev/null
+++ b/docs/rfcs/010-storage_details.md
@@ -0,0 +1,144 @@
+# Storage details
+
+Here I tried to describe the current state of thinking about our storage subsystem as I understand it. Feel free to correct me. Also, I tried to address items from Heikki's TODO and be specific on some of the details.
+
+## Overview
+
+![storage](images/storage.jpeg)
+
+### MemStore
+
+MemStore holds the data between `latest_snapshot_lsn` and `latest_lsn`. It consists of a PageIndex that holds references to WAL records or pages, a PageStore that stores recently materialized pages, and a WalStore that stores recently received WAL.
+
+### PageIndex
+
+PageIndex is an ordered collection that maps `(BufferTag, LSN)` to one of the following references (by reference I mean some information that is needed to access that data, e.g. file_id and offset):
+
+* PageStoreRef -- page offset in the PageStore
+* LocalStoreRef -- snapshot_id and page offset inside of that snapshot
+* WalStoreRef -- offset (and optionally size) of a WalRecord in WalStore
+
+PageIndex holds information about all the pages in all incremental snapshots and in the latest full snapshot. If we aren't using page compression inside snapshots, we can actually avoid storing references to the full snapshot and calculate page offsets based on the relation-size metadata in the full snapshot (assuming that the full snapshot stores pages sorted by page number). 
However, I would suggest embracing page compression from the beginning and treating all pages as variable-sized.
+
+We assume that PageIndex is a few orders of magnitude smaller than the addressed data, hence it should fit in memory. We also don't care about crash tolerance, as we can rebuild it from snapshot metadata and WAL records from WalStore and/or the Safekeeper.
+
+### WalStore
+
+WalStore is a queue of recent WalRecords. I imagine that we can store recent WAL the same way as Postgres does -- as 16MB files on disk. On top of that, we can add some fixed-size cache that would keep some number of segments in memory.
+
+For now, we may rely on the Safekeeper to safely store that recent WAL. But generally, I think we can pack all S3 operations into the page server so that it would also be responsible for the recent WAL pushdown to S3 (and the Safekeeper may just delete WAL that was confirmed as S3-durable by the page server).
+
+### PageStore
+
+PageStore is storage for recently materialized pages (or, in other words, a cache of getPage results). It also can be implemented as a file-based queue with some memory cache on top of it.
+
+There are a few possible options for PageStore:
+
+a) we just add all recently materialized pages there (so several versions of the same page can be stored there) -- that is more or less how it happens now with the current RocksDB implementation.
+
+b) overwrite older pages with the newer pages -- if there is no replica we probably don't need older pages. During a page overwrite, we would also need to change the PageStoreRef back to a WalStoreRef in PageIndex.
+
+I imagine that newly created pages would just be added to the back of PageStore (again in queue-like fashion), and this way there wouldn't be any meaningful ordering inside of that queue. When we are forming a new incremental snapshot we may prohibit any updates to the current set of pages in PageStore (giving up on the single-page-version rule) and cut off that whole set when snapshot creation is complete.
+
+With option b) we can also treat PageStore as an uncompleted incremental snapshot.
+
+### LocalStore
+
+LocalStore keeps the latest full snapshot and the set of incremental snapshots on top of it. We add new snapshots when the number of changed pages grows bigger than a certain threshold.
+
+## Granularity
+
+By granularity, I mean the set of pages that goes into a certain full snapshot. The following things should be taken into account:
+
+* can we shard big databases between page servers?
+* how much time will we spend applying WAL to access certain pages with older LSNs?
+* how many files do we create for a single database?
+
+I can think of the following options here:
+
+1. whole database goes to one full snapshot.
+    * +: we never create a lot of files for one database
+    * +: the approach is quite straightforward, moving data around is simple
+    * -: can not be sharded
+    * -: long recovery -- we always need to recover the whole database
+2. table segment is the unit of snapshotting
+    * +: straightforward for sharding
+    * +: an individual segment can be quickly recovered with sliced WAL
+    * -: a full snapshot can be really small (e.g. when the corresponding segment consists of a single page) and we can blow up the number of files. Then we would spend eternity in directory scans, and the amount of metadata for sharding can also be quite big.
+3. range-partitioned snapshots -- a snapshot includes all pages between [BuffTagLo, BuffTagHi], mixing different relations, databases, and potentially clusters (albeit from one tenant only). 
When a full snapshot outgrows a certain limit (which could be a few gigabytes), we split the snapshot in two during the next full snapshot write. That approach would also require pages sorted by BuffTag inside our snapshots.
+    * +: addresses all mentioned issues
+    * -: harder to implement
+
+I think it is okay to start with table-segment granularity, check how we perform in cases with lots of small tables, and check whether there is any way besides option 3 to deal with it.
+
+Both PageStore and WalStore should be "sharded" by this granularity level.
+
+## Security
+
+We can generate different IAM keys for each tenant and potentially share them with users (in read-only mode?) or even allow users to provide their own S3 bucket credentials.
+
+Also, S3 backups are usually encrypted by per-tenant private keys. I'm not sure in what threat model such encryption would improve anything (taking into account per-tenant IAM keys), but it seems that everybody is doing it (both AMZN and YNDX). Most likely that comes as a requirement about "cold backups" from some certification procedure.
+
+## Dynamics
+
+### WAL stream handling
+
+When a new WAL record is received, we need to parse the BufferTags in that record and insert them in PageIndex with a WalStoreRef as the value.
+
+### getPage queries
+
+Look up the page in PageIndex. If the value is a page reference, then just respond with that page. If the referenced value is a WAL record, then find the most recent page with the same BuffTag (that is why we need ordering in PageIndex); recover it by applying WAL records; save it in PageStore; respond with that page.
+
+### Starting page server without local data
+
+* build the set of latest full snapshots and the incremental snapshots on top of them
+* load all their metadata into PageIndex
+* the Safekeeper should connect soon, and we can ask for a WAL stream starting from the latest incremental snapshot
+* for databases that are connected to us through the Safekeeper, we can start loading the set of the latest snapshots, or we can do that lazily based on getPage requests (I'd rather avoid doing that lazily for now, without access stats from the previous run, and just transfer all data for active databases from S3 to LocalStore).
+
+### Starting page server with local data (aka restart or reboot)
+
+* check that local snapshot files are consistent with S3
+
+### Snapshot creation
+
+Track the size of future snapshots based on info in MemStore and, when it exceeds some threshold (taking into account our granularity level), create a new incremental snapshot. Always emit incremental snapshots from MemStore.
+
+To create a new snapshot we need to walk through WalStore to get the list of all changed pages, sort it, and get the latest versions of those pages from PageStore or by WAL replay. It makes sense to maintain that set in memory while we are receiving the WAL stream, to avoid parsing WAL during snapshot creation.
+
+Full snapshot creation can be done by GC (or we can call that entity differently -- e.g. merger?) by merging the previous full snapshot with several incremental snapshots.
+
+### S3 pushdown
+
+When we have several full snapshots, GC can push the old one with its increments to S3.
+
+### Branch creation
+
+Create a new timeline and replay sliced WAL up to the requested point. When a page is not in PageIndex, ask the parent timeline about the page. Relation sizes are tricky.
+
+## File formats
+
+As far as I understand, Bookfile/Aversion addresses the versioning and serialization parts. 
+
+As for the exact data that should go into snapshots, I think it is the following for each snapshot:
+
+* format version number
+* set of key/values to interpret the content (e.g. is page compression enabled, is this a full or incremental snapshot, previous snapshot id, is there WAL at the end of the file, etc.) -- it is up to a reader to decide what to do if some keys are missing or some unknown keys are present. If we add something backward compatible to the file, we can keep the version number.
+* array of [BuffTag, corresponding offset in file] for pages -- IIUC that is analogous to the ToC in Bookfile
+* array of [(BuffTag, LSN), corresponding offset in file] for the WAL records
+* pages, one by one
+* WAL records, one by one
+
+It is also important to be able to load metadata quickly, since it would be one of the main factors impacting page server startup time. E.g. if we were to store/cache about 10TB of data per page server, the size of the uncompressed page references would be about 30GB (10TB / (8192 bytes page size / (~18 bytes per ObjectTag + 8 bytes offset in the file))).
+
+1) Since our ToC/array of entries can be sorted by ObjectTag, we can store the whole BufferTag only when relation_id changes and store only delta-encoded offsets for a given relation. That would reduce the average per-page metadata size to something less than 4 bytes instead of 26 (assuming that pages would follow the same order and the offset deltas would be small).
+2) It makes sense to keep the ToC at the beginning of the file to avoid extra seeks to locate it. This doesn't matter too much with local files, but it matters on S3 -- if we are accessing a lot of ~1GB files with ~1MB of metadata each, then the time to transfer this metadata would be comparable to the access latency itself (which is about half a second). So by slurping metadata with one read of the file header instead of N reads, we can improve the speed of page server start by this factor of N.
+
+I think both of these optimizations can be done later, but that is something to keep in mind when we are designing our storage serialization routines.
+
+Also, there were some discussions about how to embed WAL in incremental snapshots. So far the following ideas were mentioned:
+1. snapshot lsn=200, includes WAL in range 200-300
+2. snapshot lsn=200, includes WAL in range 100-200
+3. data snapshots are separated from WAL snapshots
+
+Both options 2 and 3 look good. I'm inclined towards option 3, as it would allow us to apply different S3 pushdown strategies for data and WAL files (e.g. we may keep data snapshots until the next full snapshot, but push WAL snapshots to S3 as soon as they appear if there are no replicas).
diff --git a/docs/rfcs/011-retention-policy.md b/docs/rfcs/011-retention-policy.md
new file mode 100644
index 0000000000..fde36c8108
--- /dev/null
+++ b/docs/rfcs/011-retention-policy.md
@@ -0,0 +1,91 @@
+# User-visible timeline history
+
+The user can specify a retention policy. The retention policy is
+presented to the user as a PITR period and snapshots. The PITR period
+is the amount of recent history that needs to be retained, as minutes,
+hours, or days. Within that period, you can create a branch or
+snapshot at any point in time, open a compute node, and start running
+queries. Internally, a PITR period is represented as a range of LSNs.
+
+The user can also create snapshots. A snapshot is a point in time,
+internally represented by an LSN. The user gives the snapshot a name.
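+
+As a sketch, the internal form could be as simple as this (illustrative
+types, not the actual implementation):
+
+```rust
+/// Log sequence number.
+type Lsn = u64;
+
+/// Hypothetical internal form of the user-visible retention policy:
+/// the PITR period is an LSN range at the tip of the branch, and each
+/// named snapshot pins one LSN.
+struct RetentionPolicy {
+    /// E.g. "keep the last 7 days", resolved to an LSN range.
+    pitr: std::ops::Range<Lsn>,
+    /// Named snapshots: ("Monday", lsn), ("Tuesday", lsn), ...
+    snapshots: Vec<(String, Lsn)>,
+}
+
+impl RetentionPolicy {
+    /// The system must be able to reconstruct any page as of `lsn` if
+    /// this returns true; other page versions may be garbage collected.
+    fn is_retained(&self, lsn: Lsn) -> bool {
+        self.pitr.contains(&lsn) || self.snapshots.iter().any(|(_, s)| *s == lsn)
+    }
+}
+
+fn main() {
+    let policy = RetentionPolicy {
+        pitr: 90_000..100_000,
+        snapshots: vec![("Monday".into(), 50_000), ("Tuesday".into(), 70_000)],
+    };
+    assert!(policy.is_retained(95_000));  // inside the PITR range
+    assert!(policy.is_retained(50_000));  // pinned by the "Monday" snapshot
+    assert!(!policy.is_retained(60_000)); // fair game for GC
+}
+```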
+
+The user can also specify an interval at which the system creates
+snapshots automatically. For example, create a snapshot every night at
+2 AM. After some user-specified time, old automatically created
+snapshots are removed.
+
+          Snapshot      Snapshot
+   PITR   "Monday"      "Tuesday"            PITR
+ ----######----------+-------------+-------------######>
+
+If there are multiple branches, you can specify different policies for
+different branches.
+
+The PITR period and the user-visible snapshots together define the
+retention policy.
+
+NOTE: As presented here, this is probably overly flexible. In reality,
+we want to keep the user interface simple. Only allow a PITR period at
+the tip of a branch, for example. But that doesn't make much
+difference to the internals.
+
+
+# Retention policy behind the scenes
+
+The retention policy consists of points (for snapshots) and ranges
+(for PITR periods).
+
+The system must be able to reconstruct any page within the retention
+policy. Other page versions can be garbage collected away. We have a
+lot of flexibility on when to perform the garbage collection and how
+aggressive it is.
+
+
+# Base images and WAL slices
+
+The page versions are stored in two kinds of files: base images and
+WAL slices. A base image contains a dump of all the pages of one
+relation at a specific LSN. A WAL slice contains all the WAL in an LSN
+range.
+
+    |
+    |
+    |
+    |  --Base img @100 +
+    |                  |
+    |                  | WAL slice
+    |                  | 100-200
+    |                  |
+    |  --Base img @200 +
+    |                  |
+    |                  | WAL slice
+    |                  | 200-300
+    |                  |
+    |                  +
+    |
+    V
+
+To recover a page e.g. at LSN 150, you need the base image at LSN 100,
+and the WAL slice 100-200.
+
+All of this works on a per-relation or per-relation-segment basis. If
+a relation is updated very frequently, we create base images and WAL
+slices for it more quickly. For a relation that's updated
+infrequently, we hold the recent WAL for that relation longer, and
+only write it out when we need to release the disk space occupied by
+the original WAL. (We need a backstop like that, because until all the
+WAL/base images have been durably copied to S3, we must keep the
+original WAL for that period somewhere, in the WAL service or in S3.)
+
+
+# Branching
+
+Internally, branch points are also "retention points", in addition to
+the user-visible snapshots. If a branch has been forked off at LSN
+100, we need to be able to reconstruct any page on the parent branch
+at that LSN, because it is needed by the child branch. If a page is
+modified in the child, we don't need to keep that in the parent
+anymore, though.
diff --git a/docs/rfcs/012-background-tasks.md b/docs/rfcs/012-background-tasks.md
new file mode 100644
index 0000000000..8692b187e6
--- /dev/null
+++ b/docs/rfcs/012-background-tasks.md
@@ -0,0 +1,38 @@
+# Eviction
+
+Write out the in-memory layer to disk, into a delta layer.
+
+- To release memory
+- To make it possible to advance disk_consistent_lsn and allow the WAL
+  service to release some WAL.
+
+- Triggered if we are short on memory
+- Or if the oldest in-memory layer is so old that it's holding back
+  the WAL service from removing old WAL
+
+# Materialization
+
+Create a new image layer of a segment, by performing WAL redo.
+
+- To reduce the amount of WAL that needs to be replayed on a GetPage request.
+- To allow garbage collection of old layers
+
+- Triggered by the distance to the last full image of a page
+
+# Coalescing
+
+Replace N consecutive layers of a segment with one larger layer.
+
+- To reduce the number of small files that need to be uploaded to S3
+
+
+# Bundling
+
+Zip together multiple small files belonging to different segments.
+
+- To reduce the number of small files that need to be uploaded to S3
+
+
+# Garbage collection
+
+Remove a layer that's older than the GC horizon and isn't needed anymore.
diff --git a/docs/rfcs/013-term-history.md b/docs/rfcs/013-term-history.md
new file mode 100644
index 0000000000..0c359028ed
--- /dev/null
+++ b/docs/rfcs/013-term-history.md
@@ -0,0 +1,147 @@
+# What
+
+Currently, apart from the WAL, a safekeeper persistently stores only two logical clock
+counter (aka term) values, sourced from the same sequence. The first is bumped
+whenever the safekeeper gives its vote to a proposer (or acknowledges an already elected
+one) and e.g. prevents electing two proposers with the same term -- it is actually
+called `term` in the code. The second, called `epoch`, reflects the progress of log
+receipt and might lag behind `term`; a safekeeper switches to epoch `n` when
+it has received all committed log records from all `< n` terms. This roughly
+corresponds to what is proposed in
+
+https://github.com/zenithdb/rfcs/pull/3/files
+
+
+This is our biggest difference from Raft. In Raft, every log record is
+stamped with the term in which it was generated, while we essentially store in
+`epoch` only the term of the highest record on this safekeeper -- when we know
+it -- because during recovery we generally don't, and `epoch` is bumped directly
+to the term of the proposer who performs the recovery when it is finished. It is
+not immediately obvious that this simplification is safe. I thought and I still
+think it is; model checking confirmed that. However, some details now make me
+believe it is better to keep the full term switching history (which is equivalent to
+knowing the term of each record).
+
+# Why
+
+Without knowing the full history (list of pairs) of terms it is hard to
+determine the exact divergence point, and if we don't perform truncation at that
+point, safety becomes questionable. Consider the following history, with
+safekeepers A, B, C, D, E. n.m means a record created by the proposer in term n with
+LSN m; (t=x, e=y) means the safekeeper currently has term x and epoch y.
+
+1) P1 in term 1 writes 1.1 everywhere, which is committed, and some more only
+on A.
+

+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=1, e=1) 1.1
+D(t=1, e=1) 1.1
+E(t=1, e=1) 1.1
+
+ +2) P2 is elected by CDE in term 2, epochStartLsn is 2, and writes 2.2, 2.3 on CD: + +
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=2, e=2) 1.1 2.2 2.3
+D(t=2, e=2) 1.1 2.2 2.3
+E(t=2, e=1) 1.1
+
+ + +3) P3 is elected by CDE in term 3, epochStartLsn is 4, and writes 3.4 on D: + +
+A(t=1, e=1) 1.1 1.2 1.3 1.4
+B(t=1, e=1) 1.1
+C(t=3, e=2) 1.1 2.2 2.3
+D(t=3, e=3) 1.1 2.2 2.3 3.4
+E(t=3, e=1) 1.1
+
+ + +Now, A gets back and P3 starts recovering it. How it should proceed? There are +two options. + +## Don't try to find divergence point at all + +...start sending WAL conservatively since the horizon (1.1), and truncate +obsolete part of WAL only when recovery is finished, i.e. epochStartLsn (4) is +reached, i.e. 2.3 transferred -- that's what https://github.com/zenithdb/zenith/pull/505 proposes. + +Then the following is possible: + +4) P3 moves one record 2.2 to A. + +
+A(t=1, e=1) 1.1 2.2 1.3 1.4
+B(t=1, e=1) 1.1 1.2
+C(t=3, e=2) 1.1 2.2 2.3
+D(t=3, e=3) 1.1 2.2 2.3 3.4
+E(t=3, e=1) 1.1
+
+ +Now log of A is basically corrupted. Moreover, since ABE are all in epoch 1 and +A's log is the longest one, they can elect P4 who will commit such log. + +Note that this particular history couldn't happen if we forbid to *create* new +records in term n until majority of safekeepers switch to it. It would force CDE +to switch to 2 before 2.2 is created, and A could never become donor while his +log is corrupted. Generally with this additional barrier I believe the algorithm +becomes safe, but + - I don't like this kind of artificial barrier; + - I also feel somewhat discomfortable about even temporary having intentionally + corrupted WAL; + - I'd still model check the idea. + +## Find divergence point and truncate at it + +Then step 4 would delete 1.3 1.4 on A, and we are ok. The question is, how do we +do that? Without term switching history we have to resort to sending again since +the horizon and memcmp'ing records, which is inefficient and ugly. Or we can +maintain full history and determine truncation point by comparing 'wrong' and +'right' histories -- much like pg_rewind does -- and perform truncation + start +streaming right there. + +# Proposal + +- Add term history as array of pairs to safekeeper controlfile. +- Return it to proposer with VoteResponse so 1) proposer can tell it to other + nodes and 2) determine personal streaming starting point. However, since we + don't append WAL and update controlfile atomically, let's first always update + controlfile but send only the history of what we really have (up to highest + term in history where begin_lsn >= end of wal; this highest term replaces + current `epoch`). We also send end of wal as we do now to determine the donor. +- Create ProposerAnnouncement message which proposer sends before starting + streaming. It announces proposer as elected and + 1) Truncates wrong part of WAL on safekeeper + (divergence point is already calculated at proposer, but can be + cross-verified here). + 2) Communicates the 'right' history of its term (taken from donor). Seems + better to immediately put the history in the controlfile, + though safekeeper might not have full WAL for previous terms in it -- + this way is simpler, and we can't update WAL and controlfile atomically anyway. + + This also constitutes analogue of current epoch bump for those safekeepers + which don't need recovery, which is important for sync-safekeepers (bump + epoch without waiting records from new term). +- After ProposerAnnouncement proposer streams WAL since calculated starting + point -- only what is missing. + + +pros/cons: ++ (more) clear safety of WAL truncation -- we get very close to Raft ++ no unnecessary data sending (faster recovery for not-oldest-safekeepers, matters + only for 5+ nodes) ++ adds some observability at safekeepers + +- complexity, but not that much + + +# Misc + +- During model checking I did truncation on first locally non existent or + different record -- analogue of 'memcmp' variant described above. diff --git a/docs/rfcs/README.md b/docs/rfcs/README.md new file mode 100644 index 0000000000..fdf6885929 --- /dev/null +++ b/docs/rfcs/README.md @@ -0,0 +1,95 @@ +This directory contains Request for Comments documents, or RFCs, for +features or concepts that have been proposed. Alternative names: +technical design doc, ERD, one-pager + +To make a new proposal, create a new text file in this directory and +open a Pull Request with it. That gives others a chance and a forum +to comment and discuss the design. 
+
+When a feature is implemented and the code changes are committed, also
+include the corresponding RFC in this directory.
+
+Some of the RFCs in this directory have been implemented in some form
+or another, while others are on the roadmap, while still others are
+just obsolete and forgotten about. So read them with a grain of salt,
+but hopefully even the ones that don't reflect reality give useful
+context information.
+
+## What
+
+We use Tech Design RFCs to summarize what we are planning to
+implement in our system. These RFCs should be created for large or
+non-obvious technical tasks, e.g. changes to the architecture or bigger
+tasks that could take over a week, or changes that touch multiple
+components or their interaction. RFCs should fit into a couple of
+pages, but could be longer on occasion.
+
+## Why
+
+We’re using RFCs to enable early review and collaboration, reduce
+uncertainty and risk, and save time during the implementation phase that
+follows the Tech Design RFC.
+
+Tech Design RFCs also aim to reduce the bus factor and are an additional
+measure to keep more peers up to date and familiar with our design and
+architecture.
+
+This is crucial for ensuring collaboration across timezones and for
+setting up a distributed team that works on complex topics for
+success.
+
+## Prior art
+
+- Rust: [https://github.com/rust-lang/rfcs/blob/master/0000-template.md](https://github.com/rust-lang/rfcs/blob/master/0000-template.md)
+- React.js: [https://github.com/reactjs/rfcs/blob/main/0000-template.md](https://github.com/reactjs/rfcs/blob/main/0000-template.md)
+- Google fuchsia: [https://fuchsia.dev/fuchsia-src/contribute/governance/rfcs/TEMPLATE](https://fuchsia.dev/fuchsia-src/contribute/governance/rfcs/TEMPLATE)
+- Apache: [https://cwiki.apache.org/confluence/display/GEODE/RFC+Template](https://cwiki.apache.org/confluence/display/GEODE/RFC+Template) / [https://cwiki.apache.org/confluence/display/GEODE/Lightweight+RFC+Process](https://cwiki.apache.org/confluence/display/GEODE/Lightweight+RFC+Process)
+
+## How
+
+RFC lifecycle:
+
+- Should be submitted in a pull request, with the full RFC text in a committed markdown file and a copy of the Summary and Motivation sections also included in the PR body.
+- The RFC should be published for review before most of the actual code is written. This isn’t a strict rule; don’t hesitate to experiment and build a POC in parallel with writing an RFC.
+- Add labels to the PR in the same manner as you do for Issues. Example TBD.
+- Request review from your peers. Reviewing your peers' RFCs is a priority, the same as reviewing the actual code.
+- The Tech Design RFC should evolve based on the feedback received, and further during the development phase if problems are discovered with the chosen approach.
+- RFCs stop evolving once consensus is found or the proposal is implemented and merged.
+- RFCs are not intended as documentation that’s kept up to date **after** the implementation is finished. Do not update the Tech Design RFC when the merged functionality evolves later on. In such a situation a new RFC may be appropriate.
+
+### RFC template
+
+Note that a lot of the sections are marked as ‘if relevant’. They are included in the template as a reminder and to help inspiration.
+
+```
+# Name
+Created on ..
+Implemented on ..
+
+## Summary
+
+## Motivation
+
+## Non Goals (if relevant)
+
+## Impacted components (e.g. 
pageserver, safekeeper, console, etc) + +## Proposed implementation + +### Reliability, failure modes and corner cases (if relevant) + +### Interaction/Sequence diagram (if relevant) + +### Scalability (if relevant) + +### Security implications (if relevant) + +### Unresolved questions (if relevant) + +## Alternative implementation (if relevant) + +## Pros/cons of proposed approaches (if relevant) + +## Definition of Done (if relevant) + +``` diff --git a/docs/rfcs/images/storage.jpeg b/docs/rfcs/images/storage.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..1d72a018dc462a74ad01bb17561c98efd0745bca GIT binary patch literal 431075 zcmeFZcT`i~zb(2$C{jc35SoA>B3+6JHj0Q~p$jOz2-v8G1VMTS6#=D)h|)U>n9vac zL8^d|f}$cwA_`jw;cdRZ^Ugi@-7(&H=lyZ-x#PMWf;oKc6}0XYJh&yCi`3 zw28S10D}Pl4Eh7?&H*O?IMd#-_Z1Eu%&g3N$39k87FPCs931TX*x5O_5L_IbJe=(8 z+2blR- z_+>SYu?n2K!X|fJQ1eb^@jm(Eb)7=z-%}N|+-^j(a|nxwiiz)6R8l^mqOGH=cSPUd z#K}`8re@}+FI=>-wX=6{bocP|^7irdyLl@(Bs45MBIa&vTzo=eQdai8`wwy+=H@*q zd0P6c?D>oG`i91)=9ZVQTD!V?di(m{47?p1pO~DQ{xCC3rYwH?y!7Skw`JPT^p`ueYd{XfP9O#+OG8JbeIy>Y>qLZQEKK4unK4OaeR=h&`X7m(Au zvrq7NW^r97yS&zUs*u}__Z-3s+T{JTy`lXxvj1-bi~j#LvVR}ge;d~t3uekBSnj?0Z7 zs_}T)jIT{|S8jTlWP7|VfjPQZ^7{K;AR?~ph5gs9fQ6bDA4^|9PT{}w@yr)aNwd=2 z^jaVfpm#tYA3XrpIu9L0?ekR&SklY#n*7W!y8iPAlP0WBD<)->Z5~-4-|w^w%o>uK zb7&XNQ-9+Vuj7J41E;Q?9hdu~rO6}~Wpt833&FHmY?uo8qA+)!&IeIvc$C=93VEk!d%;uyoOjp%rsZ%sUs zT1VF+zxFq2w3yM}jK7d}*wJTFY*x`k&Wv$Q@A})?j$XhTtl7TIc()5Mx4R>vu=~Sy zfo)aiyamj&uai6dlVWJw{_)G7(4 zMEla2KNw}yiAcN!?)1&wE?MiJLCYH>Pvbhxu&TJO+MNeRr~G|ye)e$)nUFm{|Zd`-n3uKg$IvJ!}F8T&cE&1$!*za3rvZ#@p zM1QxU&sWVeCR+db1c34OhnC2#oo7!7+cy1i4CF|l)o{R0?yYjx@Y${}*HE7di;EDi zb^-VFxsonq(@8s&c>DYO?gE{0HSA#VUY9NY!`sE;RVFU>WRHtKuV z-=r9ZA6yX)S&Bl7?gAdGhNQkK+FI9aX#dt|PhsJ+(Ugy0XaKgAXYjT3G`&pj`fldBK?a}MTB6MGh zYdoee{sFo^M;&;HVe6vs=%9gh1eV!=qg|Vuxc1ju$y9) ziW3edBp2+D+t(!JwR(u9{ZxuAt*<=u^OsE=RrFgGMVngyp_69tkE>4 zo$Sr2^Q*%X@7`rn!tyaDB?_*kl7V=K=7sK|Gr^~A$NUR*^Ddq5>4*3s+n){}FTD|Y z|4B>P{YN?BG_Zl2>1S{Z22e)$9m3EXQPk#~BJShom{Brgjwc)9Os>kjjI#|%)n7U% z3;euz$iW!(73^FzgpwIyU%Nkz)3l}e)obc>ypELDlxkF;m}g6K)Ib4jwUjL4x z+Tmt(H*Ib|Ms#6W$FIVdR2>dh2O@(?-grCr&)Z<{!G0lEs-P8YRtLUV z!*1J7Rm(Sy2Ro}hvL7&{)>%kaj_`SP*z%Ab0I}OW@U_{$)!P4}YE7%%bQh4@c(Dt( zT-XJ^vPPlVcY!NMb^9H6DllwMmP5WDeNd|N`~HySn3-qoOVbN0HVJee`0b6*$=`^X z3mc1LzbpgEi!mlNo&MNhy`cLZO?f8G>-jr(TJ__%i$>XM_izyfC7Bbnpl%%d)xc0k z(9}|IIspn3^_H;9l}Bww$(M288n80bq1!?9gJ>3pIAdfNKzO(O45~6Z5E}kMc3HjJ z+o6nkbM}YzZJA$12m+3jL-^^XGUAy(c8kY2|95Mf0{6d|5+JE(+BI zExy5B;LRYlfzciuv?CT^EJ7e<;~Lhw3q?SSUYw|rKiOH!l@MbmE}WNlzx-*`QLjiQqR6MENpN*CL48% zH-opR{e~7VETBU&Rulr5jZa`OxhB@N37tk4BdzL6p-rNfrhbf#@iiY|8Gfo&FPA0+ ze@ni24FIkw!#DIcT+EEDwhv&m8>XwC;9Fi(ZS%cva($IjieE_-vtx_nDq%Z%!w0^e zvlmFwZkW?l*18vA-6Y$5^<<;{2h*?RS-w(#@$LByOtIL~XOXB<_KXzIvz)vLPLO;1 z5p;9J8j!S=(J(9ifd18VgT!|5SZ?cjNu4K0Jh^0aX}iW`-)=Xs4Wk z@Wmc&7+no)Ts@W&+G<-O7>aanPaspS9Xo%Ke7{z+GWE_QYc_Idualu+P#WX#`0DDF zq$za~Ki+;O8T&DfFp|u<4(lWRXRO(N!LLw9$bI+2c}hy@XzKjq`L;ya0-|}ezWkN< z_wU2lK3cPVGNlVS^-bgDcV1Lu)c60YcMP*Kt*=k|ST`e>ZF0yt`WIK1&(DaAazhSa z+inBC^ALsjhHkKnGqRx|;~CP^zt==&bxWVSEJxh3JL{e~HyCR)I>+a}kE0r{yCkm^ zg&45{m6bXRIWfFDr8+nsP{}IYYP88;HNfv(-(gIf>k-WXaSv+2)f+>R0~G)`U_~cWYTqLJh=*K=H9~BoM%fkjIz!PDQqf?vV9imK*wI72ZH@% zbnG>kd<}}!v4EvlHXlbj$~|%?La&JH1GEgSUI06dG1`IHmgg=Ir(UuE29Z5umh+@~ z^d7WUt<-;X;e7be;EZ#A92fkG@ugqDWox#jX}Xa6zh)-TNNxYGEC&#pnTDi~Ed&3| zOw?DgyYD~!mbDwTX0zNrfL*{b;k4Fe-JUJUa+i3ntDQ{P~ z4;L*$5OFasiiLfJ`;533C7vt=-0qI>DwDnWK<#Md4avewO+bkDe@D~)r`7HM$Dc@S zL4Fs=ybY0pnghGQFSs!>4q3}aFA(Xr+TM2w_f6p!Y`KtX14W%0i78V>|>cGOB8@ur?(Cjvo`1>Xp72dz{2D?aEN7Ln{IQ0 
znEmy$OtE17_hg?vf4gTyB>NLz9+rv*4Jvd^&OZuxOBaynbb3>1tBaYq{wGf!ZeIv> z$d2&Kj(>o4e1{cUZJ6nd6WU(hjhQu7Cfsjs{|?H^F!9C}7mK-F=UDi=Na5En2}}g| zn>P@P3qSnI2`V7Db3N*Vc@mzGU4(mq;oye%h(_cc8xMthBW;u9^gKw+=X|(nIx9;|vpA^pj2<++tPiq?8gQd+y*sz8YgA#@D2Qk= zue=v%`_6cJ%`dQ3+tKH|_UX6o70MqO?rkWNzN3p(E$o;PKf5Pz&!P8;wY^f+$c)6s zM9&L9pI97U5jOm=HQl+<(XD8;*K}=&dHd-pBEA(9#pCfi9Xf>uLt#mYGeAF3(v_CE z)g^{i>7hb&&&DA<(K?&fH|+*GHI27=UP0>u1@jA?rR0~d%0IICrN0|nMWHqY8o4MN zg*uk7PoA{sd1N$42TEE`YZUpf8@o91oL6*X?W2o}i4miLyc$nPOxHCNPc7m9Tqt0` zkjFS;kv{aolqObQc;0_gEV1c%3QI(b_ItZ7E<=XrD| z%)lg95?N?jd>Pv{R{4>n3Jvd$;c88&*@#nhAn&Tk!)-2;S1%yzuNP_4`A1`o3tMKc z-bYuzms}2SS4nmAw%>Dn&4`$%y3l%j`*{1QBzf^eVeg}no|lF7A*F6>{bUR3)egSC zvMCmcYxZg1u$d#YY`oOeUOfIhjTKIWGnI)E3xh4|k&N8f(A;pVV9Oh!7$Fctdu0HH z_rcp^el*khk$CZz!NoExpGR%whssf>`^_U;sQQ|hH#;Om*x9vj6iaihT83G(W6ZN} zno7L%OUPqi96sp>N6g9(5xBAUX&CJcC!U#_t8qJ|*%e*9+@W=&k~(L+{=ILV)!TQY zBWj0)7wc}+HzQHS0TmnOZ|`YgjlPMJ%`Y8=zZNl*^eR7j zewq7}E47%Ci{YCbEd!nkO?=nOkXCJQWI2M0Mca73?#H*J(YvCY3x$8{urZ5Wwi`a>`kgWB$y6ss9UDRz@@G| zo7(pw?pVe~ls2p|E>E*Te3cV#D2)Un99!DUGWg^Y$#%bUDY zdE@KRQDbyCTqLH*xYYF6+Vu|D)=1q}&P240Ssgb5sR$ZiY7i7BzrHT%yvdP$- z*s{&ZK$m(vYtiq&vJ+;RHdJ)b-~{VkIw&bx^`m|XDJgL{XJTR|vTFOXm6JL%X}iNy zzwqo;kGC6XdflyeG0ew9ZK%7;G7l3Q;ao4!$WCZDL}ZLK8nw-0JcATxP=(k{-5}K( zm~)w<+gW~Hs?L&1M+dc4f8$b%gDf|`aCP=izH>Y-MRj>(ZO`6svN2%8t5)jTxZ%#s ztTx-Z8{uj^2_bJTvk0jH{ZJ7LW*lsN7z>bf=+L3e4oGqwXftvFUXDl|R zG`xqMa+`*}PP__u1z2%VD>!&Y(e2wTHhdgPH5Ib}pXR3RM_{RKp=ef6 zG~rARK*Y?bz2P(`Q;^;N0sWQE*LlTO4$^Z;6=*;)?xOEdM_VFvz)cB`RZxf98nS$ol5;K zs_`>yDtmsSb<+c7m^W8)p+ ze;pquP|Pl^yjd?|A#?)t2j6v+aMGn>*cLw6nS7jr#VaWAMuf#B5{B2g(K!cbsLwnxK&9d^yRv#^tTu zZ0!p^e4I^1*v0qHDl*?E1$B3tai6;qP@Ke@7u7YE6BYE0hj()#WvUB8N5Pts#|luw z9I-KVYE$C^v6|?u<$0>?u1yua^p77UDeOI0bj6oc%R`ySm~AbucK395(v&34i^WVc zzjtqWTi_3|4W#qi9AcL?L6u7y@TpHs8~T{0wOg3Y9e$+28gpF2C=5ndUs!ovKeE_V zt>M1y{`7uiD&$mIe^=%+@=tD-20{&4ZW(y3SV?t()yZ0Pt&h*xhKF*Pn{l=^NpAx# z&KgI1V$g%ShJ7sB85Uu|n2+=I@_ZiDPb!7a>>2Tfm?z6)8nH$7R_-KEkJDRcUv3We zZ%-X;PQcS{x;TX_A6O2~OdU>)?n57E(;-mK0oO(XtR>%>S*lvj9x(5gJ99}p)pKvk zUpbvfY?W!##I`F$=;<7!d^x^)1OcSBE4mGDT2%0Pd%auc#_!5-|JoW`7(nP*UGN0q zZ~@6-f^}?_-Fg~url>SU&O!B+`IZ={ulC8^`+-B5DJ0^DZ;5B#f?kokhzjkktCj6j z)Hyjh4dtcV=bmA+Bh?V+01+nwU2aUlaL2OAllkn9Xk;kBCw_6tH@;mqFl_2c+bX81 zMm?d)uDkZ)BVEKcQTxL-qWT4TqK#XmB*v%6TgX0uY_KkE92d~e(T*qtITo%pB$2_x z?{u%gH19*x#Y9!r&m`uF+Qfyt!4fO~d70$5YG5Z2b`XtjPC8LrfmL1{lGOMBH$(3= zb4k_Ai#u97#|k`#7+(>(f6rVK2UP^*FZm< zgWJZlnGT>ik%pto7Xri?2CAoRx41zK_^Y()yA=<0AGv2DTRaJ-1Vcz;`|Y?~=^A4{ z3OuS+XV>EphLjkj!`kweqqJGDQVYB#LLoHlYsXLWwtR1$S7lJ!4&@RLE4d#Y_|E!3kpQ}nK%ixL9=mnxxsmuE`LFo$iUT>m(8ka92rr|Us(6uuqh#J>SfjiWj9fEFurWs zWeHbN0~t&0lOhvnJ@m!ycAM}8JuLr%Hj$%g({5l|*RhA`nzF8Tnh$r@1GCD{s5wjc z>g)Zo^pCn(MwYQ^n+7JwwO&VM?fCdeOqKMsAmG70s7LGRK51wbOf}pYcG2D{?W)K5 z0`E^@9&thj-DTKgsXd8>-Q0--0?EWQGJ@oB5{@4DQ4J+}a(WtI(m7Qm_#Nb)N1(Uv zMDFNV!PgfN;uHtjz@M_fWmAkEtut#Aqm z^E#NiibWEBYK7W%sN_Kw{Cx*o*mM2L7xbiLg~FL*_jYhR=0Y;7UB)F38-2tE>WUOv zmW`i;cZ?bqNu)XPfa#zd_t2D&}SJnLnT9y#`bl{j^`8v zfUyr}xyy%nuN}&!bnKYjITNd&9T}BZ#eE z7&4|fb9eclO)*%|RQ#l-kXhpIXF6qi*kGa3iip03qz%u@L)z7{f}cvL-7<~KkaND8 zs7hJ#xZaoRRm;J}tR}=&<5@d(`O3J}EG7omWUR-eQJ#Te+qhBKc-oGm88id&&fddx zunT&AZI*vDfCn9q+Gys>Q1xKg->B6fsd5hud7S~;A@8}D$##m=F|pLiFwb76MR(b4 zX0{hd2fWyvt79S#`>5OR8_)cHt@M3Jbn?{frm&G#+=P++iq|Y9rXWfLC-=?~ejdl2 zMK7x-ljQZ~1qb)_&C3_>US4$7a&@ewh$<1#EsT!f6Meb26|m%$ICpu$4&^SdG9H*$ zvb9sqyOC37v{K^fS_5W-T=wnd_Zo}9Y@mCe%MHz`yCb|AG`J3vP7`9}W>M8sq|INa$0fN|~JlvfT zk-P})y5W5b5m@D#sZ-Z+pZ%i+>C-Vql+UR3Lr4Wb_wABf%}tqC#Kl5ljay}>9HeRT^=tP~LUp7(G#KfZd!Jc!Ez0ahBdwM}9IBJ}O zqkbG`kG5RU1~{S{$?9tmYRr}=F 
zw%PMtxa56!8x5X^eYw5th|dyeX@AdiNOh5`qhHz1x0-n~u+jN!BgNObIW zMW0wo4-?o3{eIBsYTvw63h4q0dSj$etPkhA7_^w`^821LLD7yjlH}H^10Zx5esO4S zlrS$HAGiMPi%oL=LeV6TmN23|U^Z+hKo@($lWe*Za3)Ig+5P%Js%SkfyyM zJah?e4^wpXake=sF<7v5CkM+|gjvO$7+KzX-bBJ|knCeGIjw;(;{cy^P4+DSKm9mV z;$nM7U1UmqSj}rl4p|as`h%FdV>hp79?v%yHh`*|cT7oL*S*z{_|=_zC!!xADn0d; zZ86RWCy_XHzbZgg|+?xU?M{u3pk9XBs1OTMmz&^;BqtwOydDYl`Z zb!V-{z9|2L-qAw~45U^xLPGF^1N((LV_9PcPhN&uTDMQ$y%PO3W_g9m-m)l1b5zH@ zR;j)(^={7ddDN2>;}P8xJ?|)>{Yh)K;1d-Fcs64!I!g%1guoW~!yms1j6M-lkglF- z8ZLLac5AS^IPN-M8N&`xX+K}|{KUly>V2)cT5})Hbx_@!>>9ucd{Xw=a;i+K%A<;h zxyZd>tWenB!5k}z5!l)8Y8Ju>;N&m{Q`j=okTK`E3Q?+L9b8l;#Ptbr$VbmpWkOx* z#_838$=ULWAF9;s%kuVRu!G@+L%mmwnPPL8!t?mu>!Lm6a>sIJQmgw0BX)yE z(su)tlWq_O-lK6>l+)`X--wq-c8(>5Oj{g#ggC7AQSRE~9n(^j0t~D70<{9D*yh>J zgqv^{ZrYk^HU#wSXSYRdWuVCFYDrj^RcmW@Ilsz~1jnMB-4Q+O))Udjmd?g;`le9}H zChUrwaYx}o#Ru)nkT-semr1Efa$LTQNbo-h!&;+^AuKc4h&Df}@Pv0*kKU#VK(89J zpOW1&f5qm+9j^=;fAQ&5RI6;&o1Dug`6&plq_$6d{d`$%B>x|K?;X`tyKawypeRyA zL^=^^QUs9>fuMYVfPnN)RHO?C(g_4ndJ_d;{gZy(c}odA3}BR7VuUf=z!z~h&R zMX#9O8ec}Hq#`QBERS=BI`qR@D4K}&C2N6Z;oca9=VJV6ZT@8w_6g?0l_@tkvjjA@ zQW}L6Wai>~aieY8**8*jG~L{rjt*7KBK=YFdAXyVSV`v0?9oiO>O~=kdS6rhnM%`j zJGFVn=v|l%E~WjXwm>*}n>R=+C+np`IQmXUZvN*Im8Hpgm7C_-*MyV_r`7T#)!ub` zF7@d4Vbj+xTD~jVx)S^tou3CH2Z`~t6o4QG-g{3Pa(bm4dEi}xYtknlgK!lCh_P8f zgz_`2;w&Vcob>Qaup%2`^Vb79L>xX}hqwyMVP2?cx@n5l$RazZr z_)XE`bWVbJleHuM#IFPShSg}XCwHtPP13RaQ1<9{qKEh}pszYf|A+hn&^l29Q3=4f zzR)+E5xMgWpbpjbp)$Zg!Jc5*Zwi3$S~rlFf*e`EkkTGz*vOvnzo&R8_a(L|7iL&H z5!H!pLyZ2h`U$6c&9#kM+e`dnp%)%ATvS$tga>!sizXmCq$0pY*UZKuI}V!3oYLkgl{*( zh}rUU^%5FfO8AVgi#o`0{b5Tn1~!3&A0V&0Tvhw*GA!F;7j4Uru5?@iHS^~}BKhg3 zj)+$0HOj;^#0z8$jVBgaYYu~aF!~59?s}hF+ESkv`?1x~fE8qQ1+e>t!5Ko>2b9wq zi(NuQkwN|k>k={<;tp>wu@t1{(KtiJUjBIy~P^QEIQ6HRy5p8O(OvkEA6?YUmQVWXf1JV}H!sKc*}r0|x2iO>Z~4;3JA{q~SKnW3{Q%`I@qqOfKM z4G;PjVP>Goyz8UCD+vSPKmq~2~k=fXM zXa9Ko4Sp^*{Hx*DP`Ru{$@kNmDhig)0#_kX8dJv?rQdzEJmZ(L%~HUuT{B*I^pOPLJ*Oc zYZ2?$iDXzqJ>vBp8~Bw=e6{X+eLZph9pyTQUTKkQgGpL_-mIqIlVVJ_pQ|T(uMS-d zJGcHv!;2@wGzsL9Kdq6%ATF0kGOl>WP8HGGVQ+T9wB0{OYcy-CC;Hy^UM^s#=8fC7 zM=u6F38(Nw(5W!O#EB9M2RLLK^4vNP$Jjf1{4Y#+K-AjkPp>_|9j$-%fb!SyKa%SH z2j2tTBC!xn@U**xHR2Q=*#DWEN2OZ})@NC9*>&Q{0?Th{#|D`0zbM#F!i79pz2fD)W_CKJoi2@+W!rZ@}GvHh^5%{E)NVV z!H}4a1#$ZCw~69k&Zm|7AAPg-4;*+r0haLS;&Y0C{T!j}QxNHJzuTduO>vUG0P6I( zK=L34k?miljBQQ|F)> zV-W~D8DxMsm%@%nVu|ZtN-yb0A-{bOD<5|rLPHc?dXQSSnqOh|L=Wz|EnMdLu$9F& z(=^oE2Tk0=;sUUQWxBG%3J<9t#ml*aN@H4Q7&0qFc79MGRZq&lcaA~OFxJ*6WaC&% zJ&+68ZWBaiW$ccJbcz3wMrOx#e&Yuvc?iaNK#QFVLFQ+rDO6UcI+F1rZ^KT}sC~HS zuBSNFncjj(hRdjV%--SBo9dAG`x4|?@sN?B3YZ$cC{l&NU)`;+UOm|1(48&n!tftY z@84DZ0gHRtEV5r`#A%y*A>lU#bIYt5xgihH9W5C=}#1fd0Kw*?#F=DFFq+ac+EUYP^QEL&&jW=JBw;p4}i1ofFY19 z_!)(nuNiq+hji;WR~Eo=VmX0^O5pfCv>Hene>-&bHw6YkPE)f!J|e|#z>o*|fwQcD zcdFT&-xS{wCoYc3#L{{3;<@wNFoT6%4*#r2^)}*YvW@r(uE%}&eQN%LVG505ue@AV z?gtf|Oz-qJXlq20-Fv2*Z@U?4u4U+ho}ieADC62j-`Hri8ppW_h&y~*!UQ$XW;SjG z9<>KswFj=Yf@(<0W4901ep6JH^Id8{32REgy~2D@lN!h-wuk@11__iE{1q(C%`&1X|5vjy4Q9u@;01#U{SyntQK!lN2r>s zlUhFZLeFYu!xU}-0&q}=E}J^ZmO}Aq2WM-Wc6S> zX7Q%EQr2je!o8;4__MjkIZelpkrnR|)rGAH z_Og6mc0WYb#4Hn-SI|Ry5cwYE3Lqfu7c6A-dR>UU-5}DQ@%ZM6<5{o`fQe34NJJv1 z`2bQ=Y8wXl3Mc1Q5CN5VeqIwX6#!Q@JD~*X7C;5gXq%vPHd#c3E)69uR2*|Z24eZP zA!oOFJaDnCnUD^~DTF32?K}VV^x^4qcv)2Z(FPmU5Y5M*f=x|zm+2f#Xk}yp zx^y&yK!O4dC_JHbMPBD+Yqw^VUbc~bPa(>uVY~~8SWN%HEe6#so+2)`p1d$g8N48h zF@O1LNQmlHc2A-@Rh-~HT`t$a=XdEn1!sY|X*>v8&oj6^n+{D}&bCKjb^&ts(6?G* z<{QL0-b@c4+0IDqDjT!XXFmC*NZm7HZ#hgYO!>tNry&xu2)+s_dy z{dXo*6pcR%xmB=tIxm0R&5UzC%ka70TJ_ThlOS1*hib~~$Sj4ZN2U;PsfP`up_^yP zmmoB5uaqTrLVhsncTv$bX=e6ha|wouM*K2w3c1-NdH1AW;c>~H 
zib>xylq^~lX~SqaLFRy|;4hR4)|dNBX(EvsCj7Z8Bm9oEcchYfC@6EkQ}|Xm6w=TG z*0TlDbH0g#IGAqsGSMfz?P(FM%l56F{EE^VL~yVVSO=^5#v2q4XC?EvA{e%fv>Q+| zyibDcofmOF>qGj2XXN}MMS1I&bmiSQ{&picA&QCu>_xznzq^^t2)#w~|2)!O&i&kosxqp^o}L<^oIHVL248)=gS{H;DE z`k^1dP{^x8zRA+4<>w$F3+bfvH^$agN6~@za};`7qMeY?A4B-EN}?knh6pX0Nlx2$ zPH`6HJuSS)Ra|;FqZsnEs#Eiq!~1|fS9#LbZ;Ao#00FZMoUOH=5XQ4^+LQ;kRR*@}zsv_;V7*T$M1Pmv?n z>IHcn0uW4(*OYp{xOg4!?RQ!M8Jy671BrwhonX#ZOG25?4f8T@Oolna2}3oV(!B`P zjV&&Dl)XFx`AKrU**3(P`2=?`FV?!q^l2t18NBrTxYMxH+N zFC4v`sMyzby*8JtR$=X_zN|f=vtvK8E%YYvHY*ZvSPb_$eMZ#2w7U1D$?7~RVkf~9 zp`whdXmRJQ1I1H>ocz*sm2=Cc~0SoCH96*dmj8Frfa8h(B9D_VD)}tFbn}kXX za83|E?mhWGAdHs{$oAcfuCt<}V2@H~o{*XkrR4~p)!vo|FVtW$t(*!Fe~zuZ_fMUk ztcDkf-VZ9h?Y8QA*7dDS{@z&Z$d0#3*Rz+oio}5VxoAuIVVK)TwUVhXVa8?-n(;rn zmb`i2%_p_8bMc7QQN4fah0aA?Rxvw1Daw<{vn{yv(#Y2l^eLntMCE;%bMrFjoN8Z* z>Llu}z7M$aJ*L>qaRxL&iwHb$#jChRWjiV&FE>CgqCaTOLbK?#(-Zv+e`i9j?Rs~ElRVN2eZ?uKMCJ~Hp8&<=~{u9;iO!h z4cZDyKDXiX*eKh@B&JypQIfHk!${y6A#n%uU-b_RNE?})@GjHViLDYJt8$;ZG?VT1 zn?mB~`+(c4icXntlKL5yDl=@_QF}rT!0k z)3d`%O&8vR<7^gdn!iWf)c?d6Zm{)%R@l?QLz^q6=FHgY6y13I=B1diNj;J}5E=o# zy#P#TIr+|V;#$(L=M+vwT)}kal-9>e_V6cb@h5v}RWP%}l*I$F?CH|ETT3N|CeK#o zd`qmDUd6Gq=v3!yWFta>gq-+IR&i--m(& zqHi+tPJfN;+t0s6@#FSf0R_MX(<0{)&`QlV;hW9FAy=F;%Zsjao@#cSuTh9iNVx7b z(lv|!1UO|fb#qC6G0-TE`epde8EhkAs6?T-J`ZTsJ?g#YT#h4psz>DDB4 z>s5tKqM2&tAIIIiaX>3i1sG`RpK6lFa%s=rOWQDZkRdq0j=e(K#_Ex*oZN{jP-qHC%90T{2{)Gm!>R0KD&@D`>2ht_EXMWa zu2+as#k&_|zMJ7nr+TjXnbV)O1nZygqCQXLrUy8C6k7~E@ncKA{2tBz=-}CtFbJRR zmrq5k?9YcH^DO-Xlgf0_2F^t*HJUlFYeZ2;oY#;^*zM;VRW2zS+mbbV+8UwZ4?UjG zS8?G{1pxuJW?YqbwXFwjO))D&)r;=3C0!;?gT~F-&V^+I1HRjSm9>4kzSGSMjNz}_ z5m#Wkcvd46q(+7w_T@!l)W0`XYQA3-F<)^qG>a>SyiOHD^2mtg{aV;qcy~6nT{xG} ztOWXG_x;#z|M~ToM)arXOTs|_J6#8joS8tb%+AocNz5~L(zEKBZ zad}2W4=gk)ougnS@2=JvkTf^I!9Vhoxo=+q)X1k4IXm>+%;sT3!E>&!H7@&dgtC=f z)ZUK2yt{boGD%HG>BoGk{5_ei@dLLm1Eicdf`!Zz8L+T@!%2l_Oqv;jzK~;2kI1U> z$XNcMi}CWzc4U`X8!6gy*2vj?mOc$jvx_|;-z^uPZrXcS7KF*Db6pRD2636SSG)n# zt6-A~mq1Wd^srCee!7P2dn=2VZ+iqukI%Jp(s15`NPyYI>NaGG6upiF2yRt}$n|58 zK?m#!xpJB4!qx!UzGl)~A=l9;zqeb3l$zX`#lI?sQA~IzUV#!{FX(r;?p}gB*Jo~Y zSY3{Fc$=O4+59|L_BjRs2g24Dr3~h1nY2#4XxTt%Rz%{d7~@*UTJa3D@(71A_I>#Q z@qv8I{sh#|uVfOY2$)Eag#Cg`{OhnPdDz&}@dpzQu{~iF`H18@9p_&5unnH~ef%}` z>-J!cnJif=kZ<*SqDtSsi}#xSFSdon3$uxthR6U5CGl9Jbd<_D7vB~UBpwbh5m9Ej zR~shGdxD?Wwqa+TFZDbw2<1u+pbyiN%r+CS{^VIELewtLO4~p^Jo*X_zF!X2zdiKT zv!gBdZMsxkG8&~-hta4tYfflvwWXe{4yG|01^X>tPGoKQGO$`Ru43)VEZBav5PL2u zdy!k81ry$8T%`A?Lm&OpZyVh`xEip-${t$~-TR`Cwgf$0zOz1~)hIaUHh6Vxo-w}Z z_Uby6q2=y|CC_Vc+@_sO@^nqhVltmsJ1Ep>Iys<1ML~m)(f^6-_tt}&?X{sVW!=ZN zu6;uZpR~A9?Tq0KLTzUv*fG8O@5q*Mfm4l-#Wq|6J`!Mr z>fIL#8^CeEb@<0);rSbjtiFeCJ5zfwgHHMKTOAq&PQ^xH)x4kx(Y_&dzrx-g{v_7+ z#0V&^;t0>{9CshKxf#=$y{Q%x0BJC1G{(-`t^3R^|*lGeu|MT#jaNTrs^I;?X? zwl4YW6P&kly3oXWF*K7jsRgxnt3{0u1EX!5_lSsvXqZ$*G}Gni54mCiZy!=pqQKt# zOw(6It$k^8RZ3WLG-DrD^CV`8VwIuz7u%HU=}h$kxG-meopQlh;J zI#+kbr?cN#ZL)dgBU~+KGNwtlh{3>C+}`RtdxeZ5kDW?a_7M`Rj+N@_pSw-}>hhdy zrbE&k($YrCYzMo!k@uJVtD@kOaian0RRmbCXc`G4>S2G(IV%A03k$rsozf4%c|cEv z_BEP1Zj?I?d1f!?_H$4w`yg-l8;JT}sn6?Pa&{unAH_Z+jq*l1@RRfm~ z@^J7BbZpa^;agBnZ`YOUI+~P|rL(-kLj#6k9?bkaLD7R7jB&1)XyauF zX*nKqhbnn2o#WrK)r`Ou;c<(~+zKl8xLdouHKXeojh7seKX#*Mg{#S}`0j4bzBBq? 
z0fb4k^$RUUs<(8ZC!qm!^6SETxX_}tvk43}i37(CdsoiR(ONG)@- z%l=!ZK{%U=$%V|(Q&B5#e=g-N887R0miMTlZAV+F*rR%%EJLp;0!H6xSp4Rxdec{T z%nF~NXZEwF&BCIEiHSuJM;Vntz~LfsVTdF$M575fTOqa~;#dpv2m#Kg^iv*F-(Xkn z8Y|ukcN{hjs3@HTz>Cb!!%xmq)rcI znCct#dn3b=ogp-bjNxzni?iY%_MJ(GnWRee4G5g4$@G$ZP;85yc0uES2u+0%0JFW=pSnSv_sXf2g-NR!={cMq%GTAfN zdBLl~-fB}mCLwD1r=`e9x}}xb0JVRkzA8G|`NVlqSW!mSv27_}Rw5FQ>%ZeM zml$lM?@dC?O}GRYBdUsGYh0D!bi~p{m9i4z)2Y0!+}qS1H+eY?@@KWEsMu-7DP!=n zI=1;HA`_Ux2CKvR6Ka@7-`7<8gucD+MxEL~-Ij8eYcyE4I;^rOA95L!#&C1$){IK` z`P5ZahToo#4&-;DFafEa4;o+KFnAPJY00-oG8HVBZNkee4=B`Pb})_b0GHw)|4pdm zmMeNTXQ7PI0MAn)AJMK|ok`eGBVFkTpFy5CbfP+bjbDwe0fC5%9V$u_J#w>r6Yhtu z+1X?Q)*-e2r2fMNf1ysx#8+U)55zf5c_fZd5t$r)x zR1v|HXaVZL>|Rm#hKUwcn*x|hQ-;F@-uUzjlJ5F99``c%Z+ZS1{`3Y0AX{Ox>juZQ z$crj8Gt-VLCP&dSERi0Co31a8NkRZ-1uMl1k!e#;pOOC9e_TflcV6HWuF8Vauj4Ux zXH!f2;;cNQ!!-SX+6@^G4l3bx>{gMOB&mnI%qp{bl(u7+iv)ie0N8rB(>wlo3rDYh zQ#gUn`D3uzW8Djhj4pVWy28tLFP1Q`#kBcA{?a9EDjk^=1#`ie3dPpVV8_-#X~DKW(K4n?y!Fon;=3Y(mdW8PNmhnzhum*0i4 z^(Wsk=)3Y+k47+5#SoB7eGTbA92guzNkSe-;siyo{`}l^r(9)RMC2lZZo0WE5qnz} zV>-6tkCshFV+GO#MSk2(PAH;2r^rFkMr_7sEmZvQxX^>A!Xl#~Uk@(2->eBjew((& zT3ou=ctbPf0eJhzt}2gkKG0q~L_WR!(xhrc8X2D(yz|kMyQ~_Di{IF6v0xJsw;ZQT zR_(bF2n0f1qg(;8{iblPcm=OI987n_M`BruA_^z-(yXX+(E+f-xMDAGF@{aN*>f`V z4w*d!4XOfWl-MkZGz&$YS@bWV&05duDkBCrWPRc<*U#{4H;JV5Y^(C1fIkE;M;JbP z-$GAp+x!8!N*4R&6nK`bF3|aI($Da&SQPgqC#?OlIS*qX#WeBu=|d93*~e9s8=2mJ z4|KIIqO)YYU)Jo-7m(9F7?J5Ttg&;^9*sD!HMqW~YD;<<0|NSQvsAZP=0 zwdO*@m^**<_dSwwh_QB@$qzx^>^pqQ@{4?#6i51DNX`~+@~|yUELNgz+cH?^e&&kn zAGxktw}169QFirExL?|gp+FNO7p){BdCsF?K_$+Xg+;C(%yo-(9SeR4HhzCF{_TOC z*vIKbitFskYg9|S#DLw1iea4vZR`Y|j;#s9FABKXxpyfaB_ zp-ia5(AliP$|dcNZB6`vt<6A5vJ70rn9R=VJINh&GgMw5%?RvSTrk%8EE^pHoTzV) zr|GELFtE}ooY@#&$G1A!rRL7}h~E50msB|OgS-!fMNoBuFE4r!g^Qv{t{)`gRp7Jp5)GAn@b_E(CI=sA=c-n>lyThzZ+tVV!-E0Hv7F_{ z)9>Vnr^>8G`=mAr&T8FE8W-t!E$|^WSO4cNsL@- zZ!7HIyn=O~x{~~e(YTh?1^xEK*MZ?>74c*NVi!icE-34d`UGX=JL37zo(!`;?qTEA zxuS84YQ6U#%u|9j3C(ADD`)8{%<6a%RGTo?HRCW#`6j*0ZhTSZe%fmPMdA%!i1#o} z-G8j8`HyF#{yHheWR0=1#ppN&cyJN+^VG!*KRiofdy@JNcaJ)Zp28pT8g9opb+jPf-+lwQAI34A80zW2Xx{NLO2*A?`?i{gKung6X6e<`N_`zZW> z=26ID+)BbTw$3V^H63u9wKa5O%Al=vwKw&v`gj~olkWreDn|-~k{AVSi@tK_NY_&n zZHCQM@zTc^bADmP_W79;PGGNyI8Um30zOC z!IgG`=-isr#ovI9inI#!;n&VN#-KftUwpQ>=MeStLrSPQ7uVxZm=8XSd};kughYFm zsv@wN%SQB<(Pwg2VH$UQZ9Q3@-`L2UPiJak_=n_oz8l>f*icTKCi&9tsVL0%>|F@Q zX09;Nx4|Q-272k&fJ%Lyjh3^oqyH;si+hp}mi9CFTqsUi=Zo6265oJm!IHQJIFn9! 
zMR@hW==Sb%=>0v+AKE|I)vmBVW%_tyKJ_P+JOy{nKR#6cPsc9)<`{a7a>zdg%NW+g z3omlE6~Nz4Km?U)^H$cS>CftQhQ)tY=z74uo-)~ZXDURL^^);Nnp@qbCw!LPE+%@9 z%pR<{#{Jx%yBabKxd0O=QAw8l6?FcGubXT?W9p}>+~&@O4rO9Gu=KK;S(L!I!(k0xohX-k8n z_G(^%t@vFVo(Se55wGRUDg^{9>;+M0UI#S>Gg)lDJGdtnkk-2`J+gY*!{z|(F!N>R z@0T1|Z69^>n+o3P8vmNQDcW{B588^nLSn+x4(jZlCuVk=#$2WTH2bjWkAvAry86~G z)j`oL_H=CJ@vWyHuyY|TiFS(BB9DBFzx+k^;XB7fVhUIkCW9>f6DB>bpu6ldTjf~e zRjHR2o#gH12LsBd?0*$MNxS>VC2K^=@&2cxZOfye905-@VCdjK6Ak9L_1eS3@AMW- zqNXuS;bVGRnlQiDEqfnHx-dp+vt7q2cKB8J6VMbNwjxU+CgcLq>vFx9<^F^4>iRx- zaPD|_6W}P@I44cHSJ8nW-t=h8u^Olc1os{tAzZ1IeQu@b&JtC|Ba@UKAD*Rpp0it2 z%g|^D?g1P<*Q(;y4Y(aCEXMxPCp;=66A|1}#xfj>AQ&Jp(v-_{=MRuEbKmB+`Q|~< zpzA7}#G^>tFP#@>m!!BmVJ9pGx)nk@2#tXhL@W=o0pr+d{er?%%6I$ zef#>%n~(Qh!o?d)OsWjY@+O9#E^{jqfA{)>Ab7(RvGA?eRUsr}LXue(ARh7Z@U)HhuM zz2&q&ynS#^U4HrA$*Xl$_7)x@1ee~X_$6>RT!qzNQ+@oSsrjz-_Ej|6o9^Oc{&*^i zdpG9y=qY^0DSr|p=K+lyl+~ruwMCR@v%u|Ym|^m)Y1~`KHDme~<-O!T#8fFLQ+`~d zEL#}5KvZkRYtg)E63tsn(k@OG%&}vsZ1})S$yEIFk)AT(>Bf1DG8xRbpw0%RbLPv& z+k*g%Se3(+Kp?y=sQJRwPG@(C%Z0@_kV5_%rNIALC1_rU^ntD-ogq<)iywjT6`@L; z;X21u4Gv$c#k^gMP%+<4I@(6RPxUR`l{csjRbKgTODe!dozN<4o;!daKliEDriys7 zIhg6gjB}u4Wc*vnZa;+=s9fLQ6is%i@5+bfyBJ!l}5gJxfVel_87z&jGwR?#L3~k~kmi+Qph_EGlLx7<(y= zaFD^`+Yx&80i8L{-$)e%CR&kceNJz?B0aP|$fEfY@!b969u3#alWJnDVwcfUYTANx~* z+RpV3P1Qf_@o83fN!PTydiM!&L}u=pe5w3Nbnk;o4K~H_xX^nWl>B6w^`#NRZX-i8 zOY%b^^2)||Ub}q5yZrIwgpw(e=Y|&?7wvCG4$lz+%r5c(tpAL8;{cu%{FAIjM)gBh zNxvy@!hq9io4^9}!ca91odLvhP9L7Lx9#X{rK&yKfIL#M(+dv&T(tE+y~{dH;g4;by^<-N31V2=)4Um> zX(b=^I1m^VD*Sq2*${J@bag=LmR6TZVJ7(i*&oKu`ooXa>zCEKB6Zy(y1sMwE}NhzQ4usB zX}Drs8P!XRE9#lV#_qn!wzqVy*wrEZyzLF8oYV0P>N1BzB0qdWi z3B!YH+VK~~oz(@_V)EXvypR6u{dxQyLS0zyNYEex5{|knCW|LpTCgn`oNv4u9QtLn zO>4@FOk=!^Fie!5dDe71pilgxN%%L#{lOGK%WD8gL0u06AwOG|UI0sb1+-6w6R#lw zOZcm}inGg#`eaH?^0i>w#65|68bZqx018?CP4P+xaAjwb_{&ZKpvZr26`B(OcXjjh zE&TtV*GO1wS7EFP(x=jd$3Jl8@xuB(sp9#eM{Y@Gx+D{fLwc1mu%62+O6gz z)QUavAxR3~70Vh0lgM9}Yk>8yrNAAc_Ih8myMA!Lc<|-xZwlj1BNL}i`4f1iGToG8 zJ&*AGw?iS?n1S-3o${KvOp?w?TKIeppI*a64Ty354tnbT5pxgWnp@JRG>0FCdK5ww z4%Ing%JTKSB7J5byCb?C-Zu=lkCe$N)Lg5dEo&4gtzCxFTags;5s|J~2!j(^cQ(hL ztyPZT)!ih@0ulZ2WM7yoVdC0#R%Tt3f@@e3a-N$#=i#PMr~di)Qu*=l@_J0osH=%! zt*e7Gu*3eFIwA@!9U-e9@kO3O?*I`1`#=Oh!Jh{F7t_PxYZjw$94!Qw_b=55QnSx^ z46GXxEffe4_}hx5QlWXwI{o_p@ryjWy2e8jvB4=^Z9}UR8~X53O*QY*j~;aXvw5FR zoxDh0^-h~(KJyj!FYAEtd)zZ&H`CM%y>mhxNE0T|kk)mZSuNoC!G(FhE#NQts0gmd&E#NjG{^e0fO8>Xd zOK)^O36gG*q8a-Tw?Eg>xKv~QSn4Le=b<}Q(x6gIxIW$T&vzaeot&$kt4yE1-7PUO z3KX`rE+go>$EEGJlP`{y&F()MkydizTVBpYUsFFxz9Ukg$#{O?R(>;64p+NOhpy3| zwUD4~=&|##q`;CSg{u2a@dQJ(GuS(ZR|+&?LK8;)}k&;$z-ciD5b=VEUXpyu^jOwE0o*G=5g%1{iX|VX64*ZR$L$cn==}R>(X+eo2W>0!0ipP%x#r;2PU;I*GsRmm2a}0hDUb z@+P~UuXoN(+|MK}_LfyVLz2wS2Rf^U87Nr>DH~O zdqAT16ZvxIGsTLQ@mC5T^n;-SnC8{{(s)k;(<8H$p>+t&H@0}ytS74f${l2orwLf z$5Sn~Nca;xB0vl^2=04@f*%FnCEckFc=%LzpC9f|73VReH?eNdc0ObA$s4cbm6pb> ztTTo<8PFFK<6YNWE63A&BwKuLTCNZgisM-{8D^n$?HkZm^80QNYP!<#qMyqBQM<42 z0nMwOp~$^0ZcQ~%tU<|?dZTY6#?7rDp*!rOwsq=fOIt+78vKHaq|+QbSWXu3*cvKy zQH1@bcs723d7W|Ffl7Xm(jGkjmXZ3(#hXpARiV-pl=0GWo`8SPbJX$Ks<`N7F|lK- zh9Id-yig}&!T?rTUlS+vL(AiNbq|`U*hTS+X^AflzN4&8q@9g7(e}kXmYImYyA^h{ z@0)B21cEefV+F z#8^_lN+l;;?56gkW**3SHaDf0y*`wiqwe45KBM9kXUn=V>FF0&>nOq6jfl!sL>-nX z%_G}6gI{V7i(3vZnvDAh)FuRslX|Wu4HYY_-Q)F=n%(2#xe%|g;xjEEoVag6h5}98Y+|SI+*{#h=C%FP#p73HLG6H(Bg{N)W9U72Bsad*6 zkk9iPT>E0}M{FYdpMh=B^CeAu*iCbl3GUKqG2BdhsE+!L`J|p7(|5`u=3nJHCaGPH zNKm3NLyx$lYBEDy$(&W(2q{jVIb=++JRYyv0lIVmi9iLC>z+>b6m&X! 
zU9n*ew@)f?u)ZI~S)s7|Dp~RCw2WTux!fnF4R5%Q%WJEgt!%80VFSYR*Vek4ik|Jl zH%qu9x9RaT?JCm4Bw>8sf+}tR87*|ZmX**DW}_h>#$rHl=eD?~^oo5@@~4k{5KVNu zafki;M5p!AYQV9?tZsRqv(>Of&!~MU6H{uw+ad{jGNQt`W{Ns$2p%jc*+@A{(HY%_ zqihD&6yJVGr4H4zP^Eq=+S9WaSl?`Rv4*YF)-XumEHE(pm$le8LuVy{#;t(C0^o^x zhEuN&5g=ya7=q>6C2n3S-|~5|BUr5=Plt~?>Yn{|S9Q?8!U4g!{C!H1jSDJc8{L_I z20FIhNT3Zj>(MA%UMYJy-lUJWYPaRUsi${P*N)f3i$2=3oI4f4Y+r5YEl90>=J{%< zz+e9P=N2IrF-{k$_}%`k>V5h3g;m46f={mXzyw#jwLd5M4KxZw1m=OZHUWa%_#1Se zfFRQVjl?>6poBDXI;4uD>9A44A|iKvJQ&wc29uT^5?-GgcBtJ*dHU7oez#ZrB9foU zRs$lQYi23EDc`$6-fXBLAb&2G-CJ176GY^x*b*ynE~`H{ie7=AV0T$;h*@16t8vdq zPYqIIy1rfL?YtJuqCqo8am%1Hc+w=h8>G(^(LQFp*7mN9r*3ENaMd@6?YUtY``DX+ zr=PYGtIBULG}fg}u1NA5@oC=ViOYZdnrOC?l=9jzVGU0qXl?yCKktvl>foI;vU1pz;3xtk&bWcSw z7Ur-=#JWPFnio+6V-zhmV*dW>wVtWF}BeFypmelRIn*Sa=+EcNXn&=JB#l z(%slufn=(zJ+zFD>0eAbSPjJ-`4VyuNPS(32O2!pm#VGI6>|Q zGjIulHkpsAVnVbKUF~A{sLWSdIIPibz0pEP3gNOT6{@rq9q#_!Az=`TwUGl-RM(O zn*gK+|HVN4Dm9SQz+$2a1cFj{f4JoIpPwwD>P9*yqVpCax8(q@Oh=Na|KU5}XrN@o z0SpxrQa`n>D9qG8sBLuD`1Nb1=KHcHV6!Uy@8TQ({EG&YH3O{efzKO;qH1EfCGKMDP4(QeJw3<1^ol|b8*6(=$lw?(q98F;9j9sdDd#Zl~J*+fB;FThQ!I1m>u?wk393uEJDDoAepdMg-O?y>#o}N~&)B zV3}t6mP8U74OK$_8~F!g4q&mQA2-MPo5C^iwcVmzTd;)ML>`Xt@!aN#ZDT z5n}s}^3bf^+)(+oj%<272sYBv0^rm)Yb8ZOAmJ83!^wk+vH;_eAx@3}y`fm;*@YD5^p#t-on%Ig3NAPc;Q;nW?o|*k=bJ)Zvx_ym2nskX@k;#vA3_sYt(Sajv!_}(!Sho zF!mraPBw?!NGatoH{Vr&Lo;wZEJhIwwQ}Ik&C|+)WcS^@Sg6!^lOy)F(!x;aA-Df0 zftL8Eu&C%C4isVrWHA#s$06P%FRZ<{=s_9e4;X`Px+xYso5paIYOz7eYC`nbAQWIs zibw4bkiW}m2g*ws*~zV_wH$CqGGL{uL%&1>PnJSxAIFuVAA3qKTdZ3^SA#0Xb|Ef{ ziJv`H3}OpHV+{KWlXZ?2jP3Vyb(S9<{TUXk*#wlw$E!dj$!l4_SE+O@#NY>X&k(@E zyO4(PzcCUqv4`&%BirJ>W60WK9%r!hl_w!%fJc}b7#L7=ho4x`zg(T$lPCcWdZS63 z{(#~|z?PYeg|C;qS7ATPBs2O){%E+pjf$|3Ae`#dW?B_{6fv%VwJm;#8!7NFx+|CF zMrgh&i+qzU@b$km#89>2suuyg3k?P!{4aWH6Mm%%kxWYUfDRJSGRx&4(p@*>^k z8X7My(uBS2^ukKT7`rrF%M@BFE#eINztyZS6!$Io2)KJl*VegK&#qe; z|A3)3mG_-ThQ`1MK^}Q?XU?L6_59TCQ9v%=0+$}20=cVC zM5E~o0t~Hp6`VO7fkKt9H*lGtws{L|*7kMFU=X}}biT_SuqnD`=GCe2>>FMOZc2U% zGe4;Ie45{-dkyaj*qZYv0WrR<r9v9v*}!rx-uuFM8!xA$>`zkj`Btu^z>@ZN^{b&6dpSL4XSus)1YNhNR( z0hlFT9Rh62{-hPigB^7JU5;qH6Y(a%6m*+TI~#qXaIW zv1#238X?b&=KRu0Z9yS_)M1JV@w8PUkXkcOufy#*;3{qpj`l!-J9xy&g;=bTXWyZV zbacJwyb%B?kwD0wnZ&^Lpq0(k@5^$7i5ic+NVQXoxA!g)T))cMNv!ID5KH-+lvM(Vg7D|Z;L$rdUOLJx60Sc^tZ&{k~H&( zDi50Q{7niS&uxqMp@@tIrPUykDokJLikI@xHkHX1YFP`(%+b2;Dy<(?0gY#5YR}Y;CoSc~fT8uq& zRevrnA>^QO6Q+a=%I?KOvC=smI@Fq_)ljD~S5~w26sJqDjFi)2(9)~1#joF%qK+hK zo!zA@wU!Pvn;?=HWzM3`$v5u3d!z9!dX_xrn_AI9;!3_x@Zzt@Z0P1Yr}uSOqIR=4 zEykLq`SXh;ulVXneeb#;cdRzv*Kd`!NmYXo_ zyH9S)ngM67!0)HLxPrYD^n};3Za}I!=V5kguuECGQO)6ct8zbLk)Fd}d|{yMfRG}8 zP-(dlOSA1Ly4X8;%o{<)b3R{NRpt_E& z{IS<4Cl5Al7c*@iQvD6mM}_*MZ)0hPPQgm)Whz%P%{)dXpfY2G@yMw}-&5rgyL%FQ|L^3DW#LbLU%_2~UD0 zj(+!=QN0lVEo*TpFDc>JYM!UIe1G}%OF{DB{N6h)dUB^Q@{CIqF3td;j?_rP`}PM% zuYnTjdI>;X1A5na`VjS!ReJQlWs1pm%KVYfe6-&siFb zU$98ggR|jO9G7XKznF!5->H4Lq08?v?+*~a>i4_&(y(`rR#Rt#A(SGPyv^WanC zTiM_;xN{+p5s|}`R!Ot@XdXMMKC1V;2}MTFcnog@GT9`4=VHsJ<#q}0SgfdNiJ5VE z(NLahkk~kPawLA0+9ch*%*Z0P#3g9DXahpg%1#{ZnA(jd^T8iQjSN}glYe+-$xP@9 znLfy<&6PVgZFP2k+#Z^kKoy$3V&eQrNRE?fR$l#dNc+Xfblvfm$D;6P=5y7QVx^$$ zTR5Eu#HLdJeQZn!1u;6`-#*U$o}PBU5$TpL2^v zX9lnc>p6-t=e%FDUVR`{6l5Y(R#3YY7#@KGV>9+H}I{)vas5r<%l$N>jtMlwJ#&L@~0qu&p4u-h?*CZ~Nibr;^=%AMVWa1{p3D zHAk=)!OftfWcEeCE8j4_Jbyqg52EU(8s#DCFH%j&1jjh|wZ0&Eou+(a+s(9@XGLcA zG0r_#ncN)$4{*xC8h2gYdb)CvkWw4&(FD34X5gL<%bz zJ6c*HzL!b)9Iye{{PM`eU0AUY6v$hZ>_foZ4(DkhTb@;9pIuz`FVx8)Pn9T8lmP9F zpFVq3AN;{x$S7rgJnAhUkp-s;l{8IaMAuXc^bRj5%140r&=ow%iu5@e_2rhm(fPZrpGPWcG20^*SjnB z7fo|4!~Ntybl)hZngKaPS%$S0f9&)ifm;g9|C2&A+4peyi-Qa89nPy@93e) 
zBSJWk5DGa@d`;eMw-T>S5AY6ZP~5GMGo?>sBVSD#41CAR`R?M0(_wBWo05gE&?UpX z5j``g3wZw6WH&QKmPTltrG>j?&PhIANB1R9i`kD*92 zdupn>`g{;`MgW=)TvgwmXB*PeF)(Q>P7O2&_Y&sqrh5bC?NxKe$Ju>uB1_Po$wS>| zz?jw%2IU6736U^e#9_X&->Gn`li2lIy@=1sTvYC^iBs3IjlXmlX`W3&ANiU=>nE-T z>rd{N?FOv{rxwiCekwGZZz-)E@S=Y5Jk1@?{M7gBma2PjR|rgxd(idL+#q_#U(cfK z^T7QZ_c#T5;*W360UxHXmv_lgPKTc2$85l6s|WmJy$w)isf>e7LV6&3PJny|=*1fW zmG0(NHuWe$jnz)Owk`%|=2MRJ` zTOB`>5`c#}jdH$tsXPW4d@ploRSzGOJl=LLeHJ%0b+P-J;QI%k1-C#T`h1Y--^^@_Y_$>mNjfq6YkWZeu-0-b`PWZCmCY|*CLjt3m5oqxoC(!)+ z(Tt&;c7hBvy&)RpjCxiCXB|T)tR|=g|?=A#K(B;ps1s;N#(Yl(pLGye|XG{-P1Q@S+KmbD8*gbc>rG zN609d<|MC>M-(D|gLYHTJ-uN3x|IbkO>~|L1(2l(%sdSs#*x>AOSwpJ@HKd!6MlTt zkXChNX}|X`3QA?+RPrUS1qC+l)pl@oYLCkWd+Ygn{KFs$?h*M_;NL+ieJ-b(d7g6u! zKIU06*sa;l5=WkgKO-9LBmzDwHRun@eWdSB2MJQv5N+@ri{h_+K71lV79lJS0CPEq zm;L-eT&I3H340KJJK%q0ZF2BPByR37}}(BgoMC|?5PY2Qz6@<64^ z{JE!8`lq&5aJ3fV%GC)_gMbi+7b>L6}V^=!@ydjfl^~%wtg8YTfnlYwD)|-Y{ zU+zgi1o0wL|IW8YQ(GHwR94Oqa@7kfr}EY&Zc$?#wyDFsdLhRiC*G#EqKliW19$CC zvD`yjvsL2e!=IyUIX}7r?{Aa8Uka(0}7VKbg)-%QfA%V%Oaj z9PZiN?Vg)<>KCb<5&9-oGI4j_XW#w{aktBNPVOv2T8C+Bo94?}E37#4P3rZ>IXD(t z2=yV}KPBo;m)B7=eo9~GDl@MpL=i7ve!{%nCYRLHP?pPeHpP%GSxjQAnAVCZ1v zDr3e66ZYGKsPyf-GW7ucJf^+1_S_<=!drK5X{qlrmo{WiZ`uZ~b|~mjt>v88&c}6A znZsh0gT@^>rFH?~_nT7rdb=ri5QqOF)Alo)2Pu>}7pomn3e(6OmHlk~k~Ew8++X%X zb1{otOrJmbxEhkm6psAR`n|yF+8-Zl1f|z=0|9+jzrf(}rM+H{V8v(;-iE1Z!q+tr z5w>z1Izu(CqBUgz}+Wz}7vUuA&q4qoS^!j}l*^V<~%Xe>mMAb|`%ni4cM%9rR>R*_AYYifZICG$CHE*_P%mC+y~Gc;qq!8)#YevkjcOuk(@7%DuXf5!qWzrm(0!s` zkO`0XbHd-N<%f*sT&<4fzr&p+oy6iLA)v-z1>*?G&H?yn zSVu+ZA9ntkDlLz0J(5KUc|kpO;}`Cq-gRgUB0VDJV%e^%wZvOxkKDSrBKKpj3WS)t z;YD8i((CvLTOn{-wpmynoy_0I#qQInP1k1(xjjQe}iP~Fqzq4iVW=9x)EdW+@^Xhdhm{t zSC3h6T>e56}S zx~6?m5&iyEyNItY`(+?=lssp@#;KlTfGJllOM9R=Q!Z<6DX*(z6*ss!IQ>oyFMvI2 zgJ}kdM`bfaTj0*h`#s9n#Zw-h7Tc=CWQVR?N>JlHOpfbb_R#HCNvF|vAfra4QZKu& zvKu5@fji~p-db68e;#g`7bbao)!BxJjI)^eMFB332ZZ(oEt}%almrzaeB-U_Nuk?q z`JLBd;i9)5t=|%FD1e3;l-LE#$s{{W~4DXA#BsQ$aH4HcW#K6C#!v& z%~{d%CjkbaU&bYn`*_b=9#T`A7emxO6X(0wBi>X;9*WAg2Np)!1%RcI<93GnWN1~Y5R4rl|?t%5z)#w z;ZNMcLvEnLPfRcYH}4tiXqrUYCO2Jaq6xHawWs+o7Uo1m+41LK`fBaBj#n>YD%gBY z--`qb%JGl5nZ@NLXBxlQNe#*p-|7N%4E-JH_7P%m3S>ZF%K(7f*z7!?Teb7-c=P)s zi<9(??=UP~P&ZXr>F za}gZ!i=ch&yRA$wbQt{l`&}B-3>HK-EWosB50od3R_i3{MAPS8o4D+|9FfX0qz5%b z;oN8iOZo0WC40TG7j@SxR7v{Nd7_9^H^*jMin!=+R@0sn!W;@rYzNu=oQaMl)BJ8%Ur?K|o<+RgfL!xN1W!siohj{vSmH|ffP!TpWOF4*j_!IcHjdqtSzfuj zQQO_!EkO(Bs0Ouk8!yN{bBlcuE*9d9nPyuZ$_fHB{f0T&)(+|*y3wB z39BK;9!nq%rQpahn|K+z+q2pL!94@ag*Z_R@;){^9}5F9u|V3Ami|Fz5j6;pm{uce2>|Eq z>}?~GoY(?VJ7qxh=E83P^ko$-V9zLgk9+bzVV3`9Yx6(1EK)2XYT`*5tI?>Pmjq@7U1V`ELctiy_5ja1CKVs4aS$P8`V{ZF2Bg|m>X$bIgMiRm8V|resSm@i@|wC#Wa-p)HRQodFvXP zH>HhtX>#?dmne`!tklxsf<)m(aFmQOiru+Y7)rd=?C2$Z(C8LHTg3WYxJcqo>WX#S zhu3l<0c_t0`xcv*ll$)4e%(M;s(WdpCtV66w>q&!n5m9>FVA&9K#J{cb#a``3%_&r z*&7);HPl&x;&5c2OBG$L*$efs-biRnarx14_LQ^s;yaIbmpv987OE7}1CoWPlQ6v1 zQ)c2-JWnZ*#Y9F9H*aI1CEsIJB^aP!|Azj!_WX8n*7mN|uJ%Bg@*d1u zaF8=;{mb#rfDSE*mz#^&hW!f|2KO#FmwXN8AllkS(8{sj8<8GLYu+77&2(WMf*Y}B zPH(cgaH10C2I_)#eNW7_PAi411J+byTe&~s6EOlL(Wc86ddS!4w~$*})?DU(lZwAV z3jIdhR#ABA2;0HDS=hM5@;mSCd{GRz?lx^QJqEL{ENtULo1upDth6xD!w| zzcr5M=$as$(09&HQ5vJ+ch5zRIN}`FosaGZ{GqOr5U{#J|7+>WMFOVtg9wk$S2?-% z+>P1owNexf7YwYCxRR@W%dcf9U_>fQ;r>YJf!57oQD^7tnlS&T@QUszCWBxQ)1FW9 zV2irpmtFpy`3%KL^+9dD*_2I*2QL**vGeWE@#-Y8RVeufplNe6jWF(+gzr`dAohlK z%Yi=y<>UvkX~WJ^ip#a#99g)wVTY6RKQ}t8Ly!acJEcBkzEN>$6x~LnOjdDk{_Omu z4($AahUvDge_v@a1)I?0XA<&0T(E%-!&vs8C35?`qp(IDx9o1lrBBlQ<*6#GZkNd4 z0X_K-CLq8?<8>A7n16B5L`vkIs+lYh=C-xj3N)t?4McWqh@*~@BNq=qz`g^@P 
zdIfeGRn6;Az<>lgK!ndzlllD>I_-_|Zat!!UA6qP81hj4=$nh2@88M>lzneo7n{3|r zi?Un7o55w9(W_8aCl&n`_Ogwc0PUvQ9_BHN>vJ=;tBQ7U0H;axKbvL$n~bu*Zql(C zb2H*vRwK6+pZj21#9RW~scLG*d8=c~7haJ`>UTKCS@#S$xv?*$|E|cBvYUh89_}_Z z0C6(UVQH?skDIjQ3Q4@7$3)1cqoY&6zErd!s1S|u^B9(nV6~2%YL+e}(S-eMu+Ae0 zh9IQvC`QXZZdX}(QoOoi3U-w#KP!IB_}}2c;p3+v{w#}#uAx_wBrYNc=7TiNE~{0C zOUc~Kd>4!^r|@CBv9)EWq(Yv@_9L5`yv0}dz}HAJ09TL4zi+>sa#A1j6df-p2d+Umvu_~b&T;@OC~oFSDaken@~G0ow~OP zc5Xdopy2xpIpGMp5cXfcK`%&(IB-O>q|-g4=*OeatBt_h7sp(cS6<4Sfr#{EY9^ox z1mjr9^)STAIm3;moZW*nAb=+l&qG#+L-P;^^&fmr?vs3h4;)pL|LFqLZj7>&1L_K< z>8lq1^w!9R%wE;of|Lt)NeZS2fNAw$$!41+u;sOo+;<*lB01rA z*raj=%q%vp87*r#TJ5YCt7Pd59qJTP1ASnca(DSvFdAYs0$1TLth-I3oFS6 zvbWL`?U5!VQ#=G1<1x*d=-pP;xh0m*cVSItdu#9YJLag)ir{xX{UQcw;N4!O3~9e|Sui$DdoB2_ckk|(+^~T) zeS=SOrjCGyDYHj$fpe5apChuvnsa3)tHB7_Kg9)p+k76c(gVIwgE;SBW%Ba{p8J!O zs&FLjTc?}q*moC{t+p^6pR*sA7ZaaYiBk$O<#YSF7;f8+ZZH4BOEd!wqwmD@aL3fq zFJFIZU=U(Hu?PC)p?$N#7(Vfrx9{j*UovyeO1N~^3E*1O{lxkzUA&%yXMVY&E~RNHsk z(qqcJ3a6_pIWp|!I5d)`43K@=`&m&Qy+~QH{*478?MsE~8z}%2e)H$aju^I-iV<`B zSHvYXj`RI@O+D59v^0)%40lSrq=T=9!f+ATx(-ou-PDDlx863NHuP&o9p&4VrvvCo z`+{`$K(x#LAg<0A4T~v@w4auMck4DY1aKCG!6y>V0SqJS5(9j*HJg#%IKI=G3R2d0Fa3!Q&I+J!86Blrfq2`QZzzZ0X+J}$UOsj*v(GFurM;X07nnOpjRoR9w;G|+ib$s_xGPjtZtHNppx>Y+FFZ;gseEbkRd;aYZ z+W`oqu(yTU(9kaubL=-7O2d+cEglJ(z(vLF!ap}Sb{vNIYPK}hPCFB|aEsyle1|&_MVhGh8@p)D0z_5cYUb~uH`#wAw!Ifj(3ci@%${x?F)jK zT;4Ukr>PPe+k#~xbLl=+d*W~?5emcWbkO8+tb834DP#!@#=d?c2~v~0zI}Z^bS?0e z*$b2NA9R;m@@jOd?)pQbgxt%OumN}12RsLrfXyaNd5-iLHXjGsX2G_IHiH)-Y(yut zYBr>5-fI2lcXZv;MOu`^8;*!`FI#lZx%*o&NOr3HGA?3b$#pt{?bH^IsVOcr7dQT> z{2^*;q?3L%LnG5}cx@)DP7`Wy2xJ{vQI3;2;L26Ff(9>prP=1u??Vn-!|!%Xo>dmD z1q;^+x9d=b8#qZ#wa$+y!(#WhWUsAC{PF2b40dqGwVi&Q{Mg*4dAx#c56>hNoY;G zc<;EkvJr#vl_|+ph#QSWOy7W#j;Bw}+%=?6-FAtsc8 zv!6nvMWbTkPDWHzhzsmDC_j(`b&Y&^gPR8@{)WVYQRshcgBw50Yue58nLl?&!d=%~ z%#;3}{GMjMtm$b;z%FSDOh!LhxFcGHIQbOcgpSKZ zztt2)YKmtAlqch+i767Y?){0jLiFB@8xv+f;?kK})aw`*?x6dYjstGdr_-?3hDtORynUbnQ?#+0_?)i--fb4px5VbvBM;$Pm z~4UIYuzbIf2-I-j2kXY8+$9r&N%zT6Nn74I%;hxTX{hQWYTKUtN1Z6 zV+f0XRR0rn*w?D2T9hgbR{x6g&eO*W6fs^kK+dJM$&c2iP7$kX>e>^ITj;|xo~8&Q zQnWCs|Hjalz5;MYx?#M{aulHgu{3?zj~K8 zQuJev9T#~yxS`DZ6dH}7QR}qi(UCX$;vMVtpq?dAYeW3X_d9!;Ue()HnXiijy6Hn~ zUY>06d_M-X@2x%kxOm3jppp;iIVOeM4n`I=*H6kf2P}6RT^t)RHPA2!M+jd{b_CwI zYl=~st0v2?AwtcV!76#ZzVapHo#OU)rq!{2JaJRMTU#+Y#C(OF@})~8IvK0gc`=jk0UnMKy0X3Ac} zS?RtFD2Ki&-rQJZlRK5`Z>(~it@d3@tk|~V{OP?*;~lU2cJb6 zLr>(*h*ko>LGXCM06Db2d;T|Q>Ao=;rTH85xF96aco1A3_OF{QZt+tGVDp?u0z4fR zvcLp+#P8!pz*XmVEamnFLJ)LBo$xtOP7D5-M@;$BSruxDpiGVUg(?ALVMUuO^(gYg zan$zwnUKH*d-A4u>PqYe>FM!pK+832`y{e|B z32}q`!ICdC8PN>}neR*%1bJz)q(I?xSF>_8oWM^QK-2tIduglu>8 zkv!1-8#I)LSP&NM zi>Z58MkOW`&53R0d27#x#bS4NkOMg*b97HjBECCyms&t;Qa0wTxrRnIju#JK{;O=3 z5)c!*fRCkeq5-=d{|4$r`+x$x1&}{hE$71pU@SCiw?`UofkuJ(|KkAQzv+Ab%|rfAsH+iMUcg}^ zaY%9(jQ)&K_^4_t-sbO!ga6xlRS`@uIQyu~2M?&h15&Z8AyiHP-ZrBnaN4m}^9!3~ zF!v?iOFSwKx0|Atweg{CAL;%7k(Z$c9jA8YR3YM8Z;+>_SbCt5EP<9oFO|V-=M`@h zc1S9glxTQ}?05Gunb<`NJw6+6SSSJlX2<{jrJ%salNhYY`(0n*fy87FKdD;sO=4B) z*?VqP!FL6ds^9!6&#uQonzrwO?f))Z^8aYA0$e?c^M~678Fn6D<_F)In|1CF2xkhZ zeoV310`b38oF@dI<&v30MnkT&;8E?tdOQ1Cqcv<&B{?-yO4V=WZ$f5Wi}fBpQ!AId z{*q$-@6Jppjwb=_xF{|=3_BE$I(HBpGF2(0Yh_DOK6%2J7$z0(-%o!ZlUkv>UKOeC}UEg;S=0j{S|VI=u3)a5KR z6)03t-Ov?;ETU5ni#nkOwmpD1Y7dat(l+5B#7OjjV?-Jl zM%(!SdVWE2TmNs+>iptgP-9UR#y}ASJ?4Xu??EuBAyh!wl>Qq;lxPGh_awx#<=>$F zU3@A)Cwn9&{2TNUNlM58DljmvA4Li5t+Ws%HcP;T0{Q9*9MuuAAD}JXOl1O|#F(tE z^BeSzo@h7!3J_uNqsX-7DB?2s*IfkZlw|i80^e0lU_ibGJS%S@@D-*aN$h_64LV@} zwtp9lp(M}vElw0Um@M%d1UQN#^SclS!1E>mqVXllzd5IB1Ule9DgOqs0SwW9p3#4v 
z(SLqM|M{)|=Y915j3cI@5q#)i}%>T~Qt1hOWaxl-0c?*{pffXgHmF z*m{`u(sr)J(OfI~X=G7!7!36~g3?xPbwpN1eJJfKR2)}h+)9bM?OUy9UK>(i92FAv zZ#CWjd%(&+ZBhPpH%;y@h@3~wZU7>OWB!bBd}9f^C)oik>5#P1(BB~ch+^fhXK{$7 zaWb#~{wf1j&d*^Xgy8pE-v5B~gDM@)2$V{fLx!18meK9>tOYGB5#s0LJ{wIqC^pb( zT;Y2(7sC`)xx@)DnGPEN0c#(Htl+b-aO@>Y#dm;O`_4U@KUI5nz`2RSxWxt8Fg~1G z`}U=SEOyv>4fCs=l?EHd`nhiPw9Hk9jh=jvpVU9_j{kFkDLu@q8uy5{^r9Wl^>~$g zoTv`OdH5Yb0s_EH8T@TJ(wU?AW=4BmK2CLxj;waI&dgpu3OeF(x zIK`S<_!4zxAb=Ri6^^M4OIZDt$s{)=WE@4_YFW!)%W74NhTkF%VQ2joEuu@E{3QIL zovAE@bVbjrOSunKZde^p{gI)6)wb{=a#J27+ay%|acv;H(27H4q#XLa%5aMYxi63gd?1t_Rh z*nipxzm)D5?el& zHc#eWebKT(xDcR1Se^(iw65^8(~_&i3AQfBpKaRtvMeTQsM)s|X&okQRc`JRVQ*WKE0*4y znB#7~i_PBb+chd3HAb5&mHUu+fI%7LqD5Ki+h+3uN3O9RUm@fuQ{^g))5I=b}cxRjR06nW4Yz+!1ss{WD# zqE|M}^-HLS^$T{IatFE=@T}BB^iiR2CN|UyTA%#Di^n%JElfNn>0)DLpCmk*dvBp# z!h?MB=Cm^iTZxSM6h2Zta%dl7`tC(qpBqI)*28KS{*mB}X}pNPS63@5J~(&YA8J)O ze*JCjW5OuauU~_#bOwtP#o;$o_!Y-&^O|EMMRLK3V+IumcG^0IKhhibcK0L>$OyJ(v^+L>tjzP1bv>80;u+AX37ojr{d|$-u9m5S zQXt__N!~&!9bKj!rl>fxp_VP2yvagHJ%STN*T^EcIn-zHHAnXGvhEbhL|^b#P}+

}UUJ zwdU3!(-gUzjd~-pDLt2lhI)>?EQW5kPs9|Be#xlXTnaKE8Y}c~=yhVhR>evS&ek`H z8lAQJ-i@DBAwOn(>M^x!9~KX=k2?ht@a|8=}aRbM!9Bjth3KFNwC{$BigNMEhO|!@0a0)E!F2{v76eZ z&o7KN>uG2qEp-Jt+*mf0d5Jmr^E)-Wl8;Brx2tDm#`>C=A3fj~Qb_27?B?oFzD*4uti<#NW)oM0a#2>l7K2uZ`O7`{$+_GLDS(b~!t zaB@;Qm^d5NS#p;L@@wN?g-GD)$3m_H6yuKU8?Bb|SguE2Qgoe{H#!r$u^s0qUte5< z4Hk(Q>09MU8_Q`;&U*)kTuV{!33ws?Yhf|G^Ji$Lg)$p&cC159P3@-lk=oC^u=5{Y zQt@Vw9~3S+LLb5#7Tsd9Zv(Q=VCvnr#$X>C1lA1) zfD>`!&WZGIlP&f#H|`5}-|=m3(zJI|JK73Rivz@cp_rU-*JeaM+-Y1oI!i>fYc}hE za%*4h9jg%Yvu5tB(<>fUGU7C`-$wjD7<+n*7gXD?-DURafNRBt>qPcpG^E8@`{?JQ zYg2u-PR`a#vd0(VuhFymtEl4dy}ak6DOhfH-IM3VQ`7r96yFDeegWz!(@)eBbrMfV z9SZ%3ihUs>&HQAaF3LMQekHbXX7B%J{HFi2e)+$b<5ZXMi{!_3u>cu_G6>k9q_~I2F8@iRJaiGs<*$j! zd2CvY-zLkt(v#ua} z_q1uLYajh*>@%631sr~)8T$v`(0@JyhGPdrX&0GYjfMa{j$gcW??1ik##m}3l-|`3 zZQ~5R$9?})kle3Kf0hsec7o6=vSCHtOsmIg-qyUeALo6rvCHZ4#xT|pbcxz_QN&Y+ z!|x;v~YTBy!nQoTgek0TIHXZCB5Lh+sI)j zyouVI{x1_zM@2Q_C`5OUvRaP*P<_*#{!PszlP`Th!@S<{OAWxM-G>-?Tk)Z4h*_r( z{pocoZU##}80m@RJdt{-&ek(#4kj}B7^nBFc-wHW<~@Eq*LTyIKc4u$Pqufa3$PC5>c$v%`sZuXeG#yi zC5rwAspPbaB$&*M6xa#wtG;nf4d|6-LMG2ARw<&f@m+|p?LvP6U83Wc8JV($j1PJf zw?u6t)twkGs9N9lw0W{x1t|+KA$45pRW(uD)pwebE<8X>onGYKpH3a`nBv zMUSRjnxqoR`^Wjeju6pPEh#7**a|H*sn~C+qRMi+u2{|p{0aH_N3m+HrTn|e?$xCk z@SBoUyf4QkOEdS9zB9v7X+V|L%Zp1rdkwfW4k*l){{{9NM+cZY5~4YP%8l>%0~FA- zlHLtn0~pKzBUHuW_=ps}0Y@F=14%Lz8)(kp7-(}}jNOcG8cR%LxM+%u1C|r6I(-NgysXlim<*PP5vXIR-R5dkb z^FhPVV)d53?lOYGQ$x?lcW=FD4PZ()bAdbPBhry?%YEv7TGWi=EPeL~S5J%LQ!F32 zU$U@W1ts*n`o>*g8E!%=RovjEVVoGuo%)klE;1^HF+Gn_dk_JJ=yf&y+lj{2E(Ca8CVj53cq5_f znpHP{)WxJ*?YVY=wZre*zfiJC?Kz7odS**9Fx^sbIaCk4iskl+!9;xs;qKkgh!yh2 zPDd{%Wwe%+D5nN)Orum##8l>ZTm(iSs>Y2Hcw(t|*5`!Bug_;2J%0F-j)Ein&bJnAGUpqM4-WpDEV z`}RiHGDPXGE`*;?uya-4fCSaI|Ha;SMm5!L>jpuw(Tj9Yno^Y_orp*m5RhJ?0wN$? zdLUTnO}ccI5~-2i3BC8;A@oiH0s%t2>)U6aefR#(J??V%IOC2xzMmm0i?!ah-uIpJ zna_NlmqgzP{7#&PWmH5XjN?%8w3I2-DbaRiK8I4+_Hf}utGEqR5mlB(>m%cF1HDK9LT_T31_C7O95 zPuukx70&W>S1L~M#PGHQ{2VGD8MLLqvu~d=HJsF^B6jfJ%g(KaX}l2-7+{T%<-$E2 zmDVRdib&7EzT4Uc+_#KoW>iFyvpi1kT@AS;$8>gex%=)@j6wB5eqjc;<7)h~Yu{qj2RPAh6 zv)?sU=N8C+%H=vIj-^MBEj2JiBxHHWKmJ}-QWUj~;NKY)V2>k@HNSl_|53Oy<+!j% zPxpg|^+R%%tPC|{giL=*2X;#uo}%6I;+}AdrAQX-bg24dA;kaHkK*AkN=HwCi7%K9 z8y}s=huif3XU2wTjXb%t3B=2+r>au@!H|88T3rAgRogo!KU`lz&f38I@sCef7=D7H z#>nvXfP~c0;3A$GjR=yqK4UZVfHjk(KJ>7B6YAS9qkS@&(f{}(sc@q>F=(}xhtBI$ zew21r4YGgEixs{qQAoy#xs@%KA781voyF@ZhIm?L6j#f*80mMT{C?U6tUr2K z6P;alUB!XVD=n#L5BjXKvcYvm(Om+Pv!xTzVp$y0{KERSnw|a@a?UPg1^WD9{sAqIC^H4DT33|P_r9XZw?2Lr?lFX0n6#5cD`O49j?SbZKw@<2*wJsEU`O>DfFDk5*GH`v~uw$!C{A$R$m%ICrj`bs;<~Qzp-@sL& zoqFi~Fa-HSRzG0l<$ilxp$hF*qlCcIWVM&uyVcXWBPZ{hjKJHIu1oXu-HTU5uu!R2 zV}q|gik>$G-wo_@eRh`22e6O#oFLqR@HjOp-O-9$6#|I0k@F?7Hm6p;3<4&Gm=?9f zQRv5E-L9mS;$oj0>gveJ<|d#IP&AKUXwT4?UfNtBi+?2!F>OWJP;`HNgzZ^DsiN^J zF?*pcG=?l;G|W=U+qgV#cC>tJ#ibj=EwY$TV>E)E>Av)ZuFE8Wxjm zdG6SukJu8tWe|08sP?M%+QLRlLYhTaXfEWs-+S)Wa7U$VF_@?FcAmCGx!|POS*`OX99Y`8ZQgw=c$JSWIVWB4g9irC@w9WtO??8)tfErUF?n%bT0J89KPM zSfDK@a(%)(`d2?1HoNOBpKJL;xwy<&b+SUl0B^F>*g?8Q*bMJ61%Zv+pG4Z7|W01*Wgt*hR~1fnKotl2cIoRE7HfZ z2&Xs!Fzrb>k4|)hmRoFv4?w#lKAmN;y09GkeZ z`(AIJ72P(}vS0hdvHBag5agp$wW1%oWus-w553ib;6cm8;mvn&3O>>nnFv~6LG9S} z?&wS-pQIvGIvrFKhEX#Fgehszhb{elL31}p5- zCk2J6cFg%~DXBqU6g6GiMRy0Y7^s&t7&#p(am}pZd5JG_&3Tc&zftPl1;+x}o?K*| zoW^Tk`2rMw{9S*7GO`fI#03C!OZy0-drWUu{W2j>ub>yGP*?mrPyz==HzJH#yO!EVT2ug-- zHm)zLMWX8it+QrkjCS`eqi;9Gt2f*qQlbY95P$x=DV47yu&Bt@N-ZKN@e{09MF1_q zdI-tPtj7yLR-NuMRb9=ODo^Ey-Km{rlE+7DZHrfbI8M5%mTjCV=b4F>ov)0WR!BIN zgiBJ!F7=cSuR6idUwx%vgCm>s*qfSoFa!v2w9`=agP9QKGK#K9F z0@Bb3xE^o?S7De_aD{XFDJT+u2_UDCpRO|Ep$H@%1ZxV$0$_}iP9AK*AB;U`ovzY6 
z2Pypo5yEgmJ}$sXW%cKQmU}Sj5zYe`@0?L2`e8cGH*#kNj`w2+iWWdSfoxR(K+NIY z1IWaze|vpON4o$?VQ+Vuj6OCVCxsaSRj;Mmf!m1aI#+yka-Az0tWX|`#G~l0nkKE z4KRCt08#;e8Nr)>4B#IF_{SUgS4+(Q{S;=ZXa&EA(3ipj@LZ#=kyG+IquX{Od5ML5 ztcnI>o1$UyVHGKE0ZdlJ1@4xfeTK%2s;SbHVF1vdbmTsdTDJ>qeZHc1FPtNFtO8hK ze1q_#yt*>znK(P>g1f|wT4&>n8*f8E$ADhxf6HV23o`e+LkLsRV$H}4P@y5K z0a+HTnksrX*~wEB7w=_P+<2ujc2#NOGf0D2&JRQgB6z?|^LKDGV9CbOIGCkBJ`b;z z`#fbagM}Lmz44&Xpr4t|JX0Dmj{4CH@-al2UL^zc!IaRwh0>|Fn&z6M-%L~6y#HYY z8u?d7>A&3Eg~IIW(2WY~Ocf&c9in%mYea1&c2d{gcA8G~Nj)JB!*?XH?#O+HiXuzS z^QCpOZR`1bY!=ldwzbJA8#29VlIWHmLK}q%DsCFe$I^^JY3OUu52sm(ZrafvZJTD7|7ZJEZ%>^nuZkQ2Ukfjs?tsua+e z6=%0$^}Tafuop^zzhS{c(FLK+OBJZgt}emJ-TyL4zee9}*dp-y3kih>G+4Tti<`M}c5-s|a*y>+&UTg_j?AF}F;`I}qwhe{d^l9)^W5|+Jbf9y zXRviAuPuSpomY(oO^7&MikciwiyKt*f2?OEtzb|_$E&yn6P-uQMY7-R`Ci3+3AJBK z@~W%+me~-3FUCo=S5ePh6E%3KJ#Yx{7u4i2EP>x%0p8dbNFGk9S<}CG(#5RYArsUm zcSy6X*F22Sc)DC_)84RhQZFdbI^3q6y!s(F9+6R(9<%8I_Al*chHoD(2?m_b2e`s^ z_{sW9zW2t$cfOabK%@1KKi`Gg^>^uv==Y&E7HJLjuW6GdiK!{&?(dD;$AgV}oeUpW z_7kR}Y?~+bR$|ZB&!ScP(;@C^4tAcQvV_>iCGcerF-3G;dxHbhj7rvIyUkE``fYul zdmpbub)O6KU~S01qKBcm)8PGk{gqODeZrDh>`SaiZnSer%9|R1h+Bn`(!;Q}>`flG z2u*tsn+?0($*u_D=kZoy*l03+&^mUvCa6u5@N3Le^}R!$rTxh=4ONXPk-h45PZPlp zjgIYfs0?XK$=K)^SL%dYbrN|JW}S20zTfuVz^)nD0?~&^|7DNpHaU{9Ks#$!q7dCB z2Tr+#{la|V&C5p!#dsFc?@nKIUoFO*A%&OkB^EqJ2Jx&K(C4+~uh5oh&N-|morv@i zE-0jjL1jp5&<^bk0$H2^pDay`6_+tt~6CHPVPJ{`nchA zzWLm?rJhACM?j=kO*yd7GCXqDm5@+InS)%VZ9%T^HUDFc$7w8C+~Q^%?U33d;aV>B zMMiiM$|)9&4W!O}n6aX-_Ga$9R9)0|a+U>--dm)tTk6g`fT6bx(7UPj!AG8lcg>?$ zI;H%=zBPbIon4X+(FR?gDg1%9iRH0s(~@7epuszJRRH^0}IDrv?&Wr|y-6k?2Iy^f*hTT>Wz z$(|jKC_e;0=taeOL_Q37e%(@%uCn!^)3+0yvf1H7`?30nzFwAh31aiivdq}fM*sqz z>tV(_dVef^>Xn`n8?j2rGjbw7rIc=6iW1|cWAVlVs&dl?s@`hy7;`fP9 zFPLODyv5#BXo%I+IQ=L)5=UIbX=2|3^7`fVK({~>n94@~RQg3<-$94pD|={luZtS= zHhp*+Ymv@d`o_%*%$?(Y^bUO^2xs_p9e)#TVJlFFGL>^>R5oU<1HJ1Kj!eG&;rc;o zM5bP18)f0G6rKU>!fphF%mqiQ6M}fxGy5I;NnGWqZpfjl7d~}7DNL+|t-%ntGhFQd zP%DFWZ>e^x`8HjaInMCm5~GyRR$xr;GqGos57?;MiNCC+nFqJ(ys>@~ovc@*r+RMh zHQ`I%xi4+u!t~NgHS4Ogw$}UA94V=+uC#>BVYmI6b?{3&MuP!gO6ybmnk+NxacdXn zUI%wV`CISru*E&Q+)TaWxq3o}stt$-=edIDx;F2f3mx%CL0r$cd(@u<`98ObP{ zQEP>#@9oy3we{_OC?6mL+|Bs0urt|p=jH_;C9To?+D3ust^t#6+ng&>hAJI8mDgR` z3p_pO^%_^c+axOBm;#Z`I_@-HF>%W%V$0 z`yjeaLGAs)}_LX)f_~Ctp_v8&G2-yDKAo3 z3!9x}REn?>-7_c-x*qLlyKa?lcp`c!Ly!L$lDGmZJ_^nURQI1UxYcEv zZB)ECMfKcp+q?c&g-*(V@V%L$ylNH`L%4G3?hl*xd2B{R(<+l3&xC4J4ykfpYYu-+ z_Ta`;VM6BS7}9F&O^xZp%hSCSCJ#yk7RYWIusAB?Y{zmmXlSxy%xu=0o&y92P7SE2 z(iItj3mRlX0hOa1~mcYqpzE-YI+uG6`Eq*{>b)P<^<6?K7!u=6fo)E2_!@e~&aA7Q#I zs>pp1USU^`ohSJBcX7NuhWunZ#OlMo{Rw@5=7oSymL^3BXVdrAb{f_~Z)4F{F3yJM zp?2Q34xf-LWtyy8vOMom=Wsu$uIoa$aEF*(reV)cj_-~MTkIXvTxAoNt|AFG(1F*d zq_TJhk>w3uNF3N(ZO^WnszLm5I&ng`curUbldFb3#QJuY=9~?#LSx6)N5qJur58cRIy>9)A%irn>oZU>VWv?WY+Yd95sTNp`CHw`<45VpMp*$e<`V?RN{tIl&0{gp7X^K`$$IPnSxH!`eSKqtE?|Bi(u74|GGnzCn86K5LL76Rv{%f`Nd@iQez&7L zas2PZXsiSrThKH`eV4a-bWM;XOtKK-f1hh3dT}WMQJ+Q`J6k|RnXmK1uy@ZHCTFr&$wae z=ncB$z|GxZS0;j^!x|%dZxQ1;#?Y$C)yKszw#Eh z&hFbp%Ya0J?NK^5coFgeSu)>37R*9g8nI-AKJs_*w%}0TV9KqeEdB|i%vOsU_;AZl z)(Fda#)ZvTf;9?6a+S-3G~2@ZLpW5J8hNfWK4zpoP=!FRIsPy-BY^5VI9do`IqW6S;16u`Cq3kTp}not=%^NvC&lm#YiGj1;)foCOr*cYPdQ+fkR`tTKS3-85Npfk~KT zdi-4=8_UiwpUF&C+1U+#?DS^zdk%fPlRHHkx$1z+w zUH6dp&K}Fu6)t0sOBSDUqbZ2p#2b!92$sOb+&k>Kz9V1~2Rxh2{54n23u=}HCf2;q zeWcXa@~s&?J9elo!aAOWS6&Yja%v#1{3V!9+yFyAL7a&HrdM&;3_z3vw-EXq0HqWT zlt#IrO2MrAYK!Lp9-Ysp(3l_-& z64IE8sxUx$;lErfX2v>_Q4!Y7r#NS_?WTLv*-&mOmmJzaD0UQ&O@(2Sk8ICl>hEkO zz4r1s4zB0JO5!)?_)eU924?pr)7)r@3RR?(mk&Ra1|A{M_=>YTIQt=BxeAoDYi$oi z)P@r;W=GnHSABPT8me_|pqqXo6VufqPIiY4C@rJa>s*vEqVS!Ii<)FMcQ+Z#3NY{% 
zu`w~n`|G>>E9AA#H+S>%FUtbA+VRVgCwm8by>}uL`7ZrP5O{K@JYFY-f&kzz>Tu=` zND2ox0yN?3?v;}aFC?|>G!ISF#ZYIBdpH?_&QV&tr&~B7dIlkxH`%Q258%K^ z**K}DDdh8V41S0r>y3WC5O#jJR0r}p{AX|Pzu6TOTwNV3?L=3WH?ET_4t2Aak3Q0h zZb|b4X}lr~#mZx9P>^;6$qcpz#TsS@Rmz;4>x64M5ox!t%&_TM@bcvOj6Wp3D}Uh% zHNnilf93oAZ^k3>8W=|nG}{FR)k{NZhPI(f-Wi?VmU#*^qZt(ZHG=UyU2;s{Z)$2> zbhj+cg3w%biV5}6+mc`PXlK^s@4(RIj+pD+Q$;5<3wrXu@kO$?B2jW4)A$IgHYAoD zz-OK7yU*Bx%9k8q4sCVNe+t+@0n1i0*|8x05%_n9Ms9~GofGw+o&^4J*q_Gkj{*7P z4f$g({Kq=-$6WYhF8oV#VGCwHPqbx5ab7De1Gh1g7=_B+!YUr^?79-ajLL8jC6eIt zeKG+n^w@(4rlR2fEw?r7?K@{r>&T|1#e3JS40RtXvXWev)MZ$CS8S(9mi(y9y`qyf zH*P8Y^KVJUfo3agLcRe>Bg?-z&tQ&i$2c_l-WpdgAJk_Bne#=iL zebIe;7QM6>=#L{_m|m_Q_yG%KxjBSW`ZD#ZfSRIRAT5%)_Bw(wl8xg7%@a82dBAPL z0h|JAhpM&uQWnaAa^s_oqz1y4Km0JYX*Xi=3Ux+Xb0v>5*a)=0HyL_lgmpjT_K?y; zgPZL%A$7xLK1S*X?%W+lIk_WA*$bR=LS{H$b1EqT}S`A4}9_Ax)^ zzaP~TchwVNbum|A*xP_IRJboEx^5KVZ&yq69Mw;1BNTgYHsSt(hB+wyqJ>KAGAX?1 zGgHv->AB$ft;ADxh`4X{ zpZ|d~lE(L@b@~;+xjlW+t<2KZ^Et{+U`ZFSRhV8=W4dh`Cq`Zq#ANaI0SQ(Ub3+YV zgC1@MQ~gk5%#$Lcw*F`nKXz;kUC)Rkyy%lzxIiY(@r2m$@4X2Byvx?#4iUi`nA%sy zgBW$`%kylHx4l5S+*Udav_Y| z*YY00)S4ez)xT+i{)5w;XVcBuM$&FBoo4&U!G-8!K?^bELqlcKz>@3B-XS1N`Xtqn z{V3RSF94t~ksUgO7{Mb}>c$F1Es^zazwFaK3Wvrj#y`AdE}*eZ>n2XWOgQs@UNHWA zJ%kHj>}`Eq3akNgiUL@S>lFHV(JuD0XUEf=l$?4u!cNF_e}WK0z(2l|o;i9Dxb@{f<{WV- zehR#A4F=}>w)!aSM=HR{oJhv2Gor}OLl_+J!A78?u3xZz{8rFNA8EM}>$csr@93ImXiYbs zzea3#Claa$WjsXg(48^v+xAY4Z5@>P$DL`~SKZ&5*ebQga|4Z11W<82-5|d!_(tp3 zjlpD7RddR{du@dl9PbPXI0F_`bow00QjNqJCd+iqG(=3Y@^w#ZjK{MBsGeE|CrFjr zA4$)*&B>hS%95|=VF=rbJXj3}c2nptzpy-(q=b z?8M*M23licz#vQRyrfTRSfKt|4jr>II`(=a;fr9Hh%{H+8pnv6110R~3y^E+!A|RD z*FrUD+i)H}(Jo$gMU_?+i6tox#OEovvdwdyKpNP$suAb9svj%$^RBphw~_TD0~)Zo zyAL8O@;O!u!wOhAFQ|*PO_`9`SLAk!*4$qr>QJq=B4ndzZ0KNHu%bCgEHstQyCKc5 zE%eGMZpd=~s@Soig4HreNialPTDw`vpxOg? zIyJ!1PPW3H+qmU@_wrswSO7)qoa*rzP^>@2)qJp7mAJ$% z8*cDz&~3#V=&JjHjZ1j)51ozoZ%-z)OOf|!nJ(;9^hJikblfhUacg){&s1IyTVtMy zGa9O=R{3h$>6B}f99OHyNVO8>HysugG|is|mZlM=8*XdAw7AFVeGX9XrMWNbwW4?I zZ|c6t&@id3Gb=Lw;k#eFb>Jh*q+g|C+h1%ugvb89^eDuQUT>sm9V)_{tcKD_8IDdpd)}?oQ1t#nBu? zB|>xFc49n2^j1T|5xLvmbPUlM{yE~i`Cft_z?P7A^`xWE$N8m&xxV<=H!db?S9$wI zkjlRXb1f*!@wVGwil91)OLwZZ)A=CEdA?G3Zhn(?3O(t9Nl$uYsa4A!J=RzsZPnaQ z0GYBjf`>aK4Qy6u1sXphVIawE$a!JO>=Ir{$ai@<(7ES6Pb%~!zl35ml4q;ttQ4Pb zMogxWX{GnII%K+P6;hhsY--yGyDZ9Y-U?%e8Wzr(sM*?)rn^YWTY27zqYwNjocgVB z!Rnpg5yBkTHUUSZr*?TQ%)W8@eRH&K9t&HZYKo4Ii?-&8{WL7F z-@f8rld2ly?N-zK!#}FHau&c-N=pv!x21V8Hw@7B!Kf{agVVR}Y`)@olUs37`b3}J z&QM3^UqVGiH9jpm;Zh5ZXTM0_mua8TYe_RazPGV@d#vd>kHnCAr=1V7C+4UhyrS+DtBm6t$IIu_F2#_keJF|2G)W~LJqb@2C=YUfW|s8H1aW84{lx{ zQyz8kqb1eUMWP}Sj^uac4?dCu?!)wKESO&|t$!Q3vY4K!`VPpFo+s8*+OksPp0mZh z9`6`_P1Dr zVc}{CkBv9JSZhYKBSRg%}JU}0Q(W%F9sWej-h2% z(R8{jCM?O5hDG>MklWLL)7Gc2Gr~6XX1Y#qh128p=jcU+0Acnx1-^&hRw9T&&gvjf zvhA!hZ4dBK8P*eqxzXs$I`+m}UCJO`DSrOxBT9Pq2nj-7#hy&+L?MC>3Ff=X#HGKB zasE2P2TUIS?S*-B`D>~rrOFl$9#&~p$!!wsD8%g=rNo+04gRSKqWzjKE3 z`w_yaP~I8p<*y5Ak)P?F3+1nJ2<67<^N4}G{CsW~*R}R@vZ!rL_z?a7y~%`OSep=} z09HaAD9gTiBPq-)J%I9QX$U>}gT)7O2h?B8N0zSh4mH0l`(7zod7HCad%SKBZfA!T z{z@0Dlf|QE&7A59muSKnV8s^p!i>}llNV#F1MFmVMB@%)zeNX2hj=GUwIsC|Czh5|ZN|TkDH8@=GP&_bpV6pY$h<8J9)@|9WUx~fL`{Als zw9@Q@D}9$pc^`9Azns{9WbJ*sZ%?=D=&(KwVV|Ylb5Aroqbofw0-zJvSC09zK|#ap z*pHEUBoC#M^~D-$OsB5UKkd-uZ)jWU1TjUsm}ye9#Y2*N7wYDWWhA8_^2tmYN|fjE z@#VAh(EuEOX;ON?!)lQ%QiS=wyOob*yZ$v$zvaa%?Psl#b#Gt=*u>1OLAZoq=@Me? 
z!~nS?8EM$YfBfnYLEynCk99&BgNd>3J6vrY9a@E!2b?e6t=wKNagvp!*kF=2$DS zp$l6iZjfs>PFoD}COKSVJC|>^U7jJmq*Yw-_M_ung2`X5WPiJ^{d+zUT11PRTU$nH zQK$*uYj`3V?R^!rD$9iJMx{qUWl#H?U{oG1CWCqa2Y*ejI>7t6^JQ!5QjWr(pilR9 z-3ezX2}TBfbtQiyF${P#;-zskn#7DmBlWF$KK^?yfVE-Gf!I!GTaS4}c1`iT-QhRy zYc6lP96`sC?lk9MFr+g9)dk0xFL{{&G%EMKqHf~iiH zraatnpVDzLxMqOpT$X?XBzOt;(*f_%uao^AzBhNIrSOk6@hbS|N3f}GJgk=BMDr)8 zW$?FOlOLrq1KccbfQ|#m)}MiQEPsN)b08cXU`)R%cbQ`i^8U@4D257%L${21ime1WVBhUW^kl0jhuOVN8L z_rNFICxL%^+X^LVXq@~X_x#6m{^Q;LV?O_W4HM2kj~rio4XX$YOjx|OKD0^PBvv!6 zK6A@hb8hDhD}|e@Z_)2wv!yJ$wxzdPJZ)fvS0>a8cfc~%FaSw>k`Ty)&5h_morV30^?YkfJ^(}6E&I?f&;V9$O^w;UB0 zIKpt4i_vp1k}l34+Sk(DQxVQKjNno2gGFr~9iffnm<#eLny=qxj}RZ>`TExHGbxBL z?CQyhN^h0QY)NqW>?G&S-v=_l0R9hxt}DK*Gf;u*;t@-%A=p2w^bDHu#DT1=M(JoU z^6ukjOT8O4?!8+Nn05aito4^oomJG-ceHvh1ZF!cb!3l~KzWl&Zyoubr_|Hpd+JGR(PB;EJe&QGkt3F)_OhdI#wk0czH$r9 z>|9~E&0LhG!r|xT`pYqe5(7p>SAlRM@;6&V6H(;(1)f7^&Er#F#J*LF2T%~>s23ap zpPD?DJZHm8T?MVl-t=gM5~4`tTZgP&Y$!ImB00z_^?6L6e$-^L?>xFxRrD!6XCUq+ z!alI9c~YwXfXC4J+viPFuF=sU`*|tjkvbHFe9EJu*$5JxPZLd|JtUQ>$QC>IIZk|+ zRPLB2o2Ar=IzvxN#bl+|c{5Q%viwqcF`S$BY;tO=b2oy!3BoKbv$z**glx8zy0Ta~ zggZ2bM?H2Yx#sVxHD1L&sqXmvz{ViOuFHA zINeV^4?o00cw_8LIqIaRAj03iS4HnO&Fy=;m3w|sG2@oF7yUVodm4^1b&QF9npCtf z4rkEK(x6;bDNXm5!CR$6?_yU_{UNe!>FsZPQTl4^RECr$gzE7Bu`pZogAj$1t| zuaCvc=5I6_Ct!euuyAbxqeN)#VN4v@_QE15uUe&0#ZXp%cJElY5Q7L|L+ksCf@w^q zWO|G&F1h*B@rn~UUfCF{anc>54a*bt=1rH1wsh$`xM~RH?Anwv-44WpTO6^-rc+1= z|4!PRjP{sRR`3MvgJy9KS9m~Sk%+k=uM2Nfie+&p=I}dJf-sz$v9SCOR0&g7(VTwC zMpme{D1M(A+%-$|nkKy`j-8G>;@W$xp2v)KA=5EhYw;}5x^16k z^w5XgT2a2C_qnN6p80!a1#WvV_F?G7rJo=Ys4^Po%Z{#_R^jFs*(rq z-;rrwW_zdLHSySz&lPN|uz^MBpj>uY+a2fbEfir&3*L<7s#W{wFZr;*l}x$+)8*q& zn+c7pS`Ocv?wW}>wdqEVT*favnPF#6I@%iK&ZZRgw9yDN>a5Zvh<8yNjuNwqz;L4u z(?brsZN&w4xzg^dJ+cfVa?T(~jFj6lkuVeR*cK|#;+Q=;JW1DVJ0WEZC`nwws;D&a z2<%(eX**aXEWDZHA1xFiyJ~B+lj&4@YM<`nPT6rq(PQZ)!FRc~2Yr_d^VO9U!#0)P zB%ej0Ea{ZZ53bUwhJPDLd9hQM#gSSuu0w?)3v>v@%grulNVB&(lG=(Q&eUtHvR z!zeFI(G-z+kkrf%lp+27J+s_>>WpMG1UtVu;$0#9&G%8H#uA2g=Jd7}@q0xR3_#`Bjw zz$n1N3#yf~QBb^XZ2akq|JjA@E61}H)+HPLnBPlIboFhlJvb3B0mD^*K9&8Rf*B5_zZNf!2^)srhyjG!35 z_Uy|#@4R%HUJfFRHxHkoWcv|c(mWf=oO_UG#)Lru+@z@~Uuw;!3heUKyt79KtrlTz z99Wn9_0x+LA6&uYkga>&9QTh{MLtE+H-Xd~RhEjAYJDva?=&U7kBR$XrpP4N*dKg% z6cf|Z#$m_ysoF86l#yTUsE`-eheY1waixVlx#Xq&~xUyd%@G469n?H z<)JSyDq|-OOw!EL=tfJG2wZMhxEps*j zo3({d=knB4Lrqp?Y}k(0Fq|WecpP+OUNrKZ`aBB~y*YVME7oi`!hrOCF+Kj)SPAO6)JohQ+Tnve1`Q zr}27wax;WJf06L~iCfM|(Sf6e&zE^1Nt-0q+nfgBH%OfLgbG(W$0Q>{0S=>@Sq%?1Gd4AcZhS} z1bo!vb1p;**!0h!j5OqE5ZEYHut=a1W}o*Hl)3kS0^c&f|7!j+Z2QSD?1$wtKf81WpsJvBFS!#B;DZITeQL7tR?-+cJe=bNSo4) zzu(x{e1~n}9Z#-yD!ublb=iJJ35C`v)z-9>q<5-bdKF){zSp${=S|vBDB_JfZ5^X? 
z>j|b*qX`3%-3X>C2p^R-l^d|w(DS{B90tL~sMB540+i^QyJb+>16^|C$1 z@`p7#S7P4&0$WKM>sAdXYnfyt@WT{N8n;q`u}859#)HOB?MPg`*>i4rhQAb`>c0}U zdvDE6f@!JFcAMu6Y}<9>J*|m+QN`KaYj>?I>LxH8|DymL^LY}|(jkDgIQwm(hYRXM z^+ z%`}A=sXJWyV)Jd-uwz~KWtg~yz=r^WxAcT;A)tN4r1_BrI$^sCy^fWCgiJ6RJCuqa zk(OU%{B|^JbDXatBx@L?+j%uPLP&)(G0{&Z{&!`pzeW`Q5@7;=Bz!lFzYLWe!V7Q0 zSo2F0huf0c!jecy6Pk@WRh9NJi8%^yWtJaka z?*T?3NBasFJ07%Mp;tBpNi%1@kF)_{kdurk*0#v^tRxP#{FweFlKvJuwhn67^$bNFTOP38wE;(dYaI{{=P*w$}!vd;=;g(~=; zAk9(0Up$f$yqwT_ssxl!g99O%tnJSq(ED-^EMUX}4mC+<&AUdnS4+*;nXj&)`)qlB z#Cd6F9$6!5_qs5{-Sd#z6DCnyQ`2CTC*UdVoqmIs3e)O)t`zzdAMan@hizJzL-=$ZE4z}+?RuM7HwZ=*JTrn6WH}O7-VF{CYkMp3^MLD6H4e)9ftiJQTbFYX;i@ z4o5bh6&#qr6KS`e`I-*XMtwT<>%Y@~E+l6r0*Ek895jz_pu5))C+8jK*GK@aL5tj% zWJ?D_v7J}PrTo+hwpl<+r-MpbJ7(rR+Pl?)={+2LK(qUDepl3otk;7KuWG`I9pBVM z>)=(mKOncM07}W{tNDoYt^}-BBqOX-Aspb^MdNw|r(k$xt6aHWBp)-bHrlC?+#xCFhqTT+^|zOIQtXM9EZvDAn!5urXdJ%Q`hCh%G8CL|o5u)ss;k_^Jk$oR5E zwaXZ6sG{9#XX6m_YPiD@I4WEoU~9Cyz!rOn+mYx14;v|X#n8gLR;#3in}CzVO%93? z!UJ=*@LG$thaNN2NoA60eW}}}HNLMmdhg=I8hrZ&TT!wV!A{!WW@gM8e}u=JSbH1J z14=Wb9%pSgWce^Rn$m+3?jKjL1wQPj`Ei-t)VO?zKK9uKqNp?w>bobsjNwb6+PtTY zi>Yg*b>lw<7oFne;)lA?wer!eP5$2SOMBg%K2+{*$Cfp-))~dCZ(!s=wd|n4R;>Oi zCdHQm9g*;6?EW}~VKBVLwQ+bmJ7LQ;*S*ADwPWv1XRR(ey}=7#hhc0}UyR&dIKIDs z&GK#S_s@J*tSx;`(HcS$TV~;6#sbZK**tKWG9%-a<8=1m07z?9J>!+==krjf36!gdW=}b9CLKsdzMH3j zY87r8?KRH_X*Eczqk?HH(=JXWIHpDWfj~c`L6=<@oOuzvZDV8kJ zqGbxcMMd+z8n{gq9pph$>9@V8G}ZWtW5CDGs=y` zsXi&=n9@$?iG_2!f34P29k*2)F2MW*g+i^+4yd`f&q@ffU>u}iUNk&zr?t|wb>ic+ zkm6IJ+eA^(S2{WhXI^sE8~TTf3V21ksUrqX=2*Lg`h@!O%sG`dv66S!QgrIOZ71!A z?IvI4myPs|LvI2nZ!)8)$2?A_d5$4EI*Qi(_Ud3G_j@Ou{3@TbVDCpEMH=S^r%bu1 z5G^lpNIg_?;3l8!p|i;1$a0aD!(AqwioEheSlRNA_XO1=^l=7a0ojy_iBu z$8J||&AaYFC*f!E*rhgBl3Ef}ufD>l%E+ZWyNgTtsu!n%EUdIErR0T3Ke3nDrYvqP zIR04EFgoDC{eUGlqvn+2F#@(B&5Jqsy~)m2)3eB=J5uxd)Tg*E$@F)ySIGsWFj^Ii zn-sqxn{DQxUFk$?snx~W^t&<9l9q8*yIy0icdJrzs{87)Z1*z@Hegs{vl~(i)dt^I zy7^tcPKD<2M5g{&T`_hqjvoFxu`7E8dvvg<)LbCea+;#m4qa6i_+Wn3B0OZ5upAZ8 z{(7JIxh&CCiGp)4Z_nM_zLGS|-w)m&ch*lyHStV2;> zjiIg>vbI^6cIk*T1JFGj@Mc1!J26u&vMAq$RrJv_ft{0?E8?u)1<<km{QTw9216*l~q>a)f^xCRLMNULzZ5m zDoM1q`XzA``HM!O?#?BGs?KRM&Z}EuJvWWgqTqAmFoE_B!+1kk0+i~B+r6r>R#9}0 zvjVSAcJykomxH&prHjtuQC^OR%hLRN98iC}9AB3|VeO^fcrDvdWP7NCEArL4kWl8q z$IFKWE(|eWJ`ELg-nrenjmWE(#*;S<^i(P_C!D`b<%#1_!bWQyn|JaakM;74RSjcp zkHXMyWsAJ9tF~UUHK=gyojCrlWB-f2?~H14-PR3)ASgtn7YPbT2kD*I=mLs#gosKP zkRBiqLFr9EdQ<7rq<2E^MS7DGK#(R0H9&~>Tlbzj&N<_pz1CiP?Xkuw>n9=k>U`%r z=QE%AOaXjcQX*RSvKF~KW%D^WNjiUVpxd1yJ>@J5a)sKssEcBlHL;c_*@@=1qwPA8 zWvhQi@(+r%l83@lw@q&fwy(a z8D-u>cxY&>*u`{CaB~|G@R2}g>HabpWTl2Zu2vJ-H8X9KC6xE?fT^FO_yvl@5lGL>4 z0VS?HetUt0)&KSo$z({T7HlhhJ*AmBa?R#%Vm0;s8s#p3k54ttc`S&SuR!=oE2U zo{v+*#^gICMPJ!^)78?=|zD%36+2MoqJvA=irwIpE^ z8KfdF-gqn}afc4;_CZbCg1f+U`^+;%<=iptMvwC8W9>j>MP*P2ghncG9~MXusFoaM z6#nr;aQX676XSMbU~kNWVv*Ft+@zOZck9?@ir{huSt$)wvzi{4l!?&jA>I;U=v|d1 z?rlnR2{A4WaWM<$ZHl!DX(*936;rC*ODAE!{9K{zf;3Su3C$~Ir@k7mCfT5g4K49? 
[base85-encoded binary data omitted: this span is an unreadable encoded blob (line prefixes consistent with a git binary patch payload), further mangled by line-wrapping during extraction; the original bytes are not recoverable from this text.]
z5}eG)DkvVqt_!y^kjoEeL8I>yvI`EGHuHP$t4LN@C;*q1CmF$2D$z^S62Hyg{?yN-(S%$vU;+MUuA2>=m0}UJc zozo&0Aeg5?H3!*>rS0!$B%;`%FRujQgiD=l^LvwlxcVG+V+_BsEsg!p^GiH!@N~VK zN!bgrIex8FKcP}zv_9#g=kp~CRgS%eNuCosy*vkXc=cSLt%CAs_dhev{-0mB`cG0* ztnFwA^tvka@CSk_N1a{-QNgMyjEMokeYig~aUoegZA$&^~a8WO+m zCDZ$RO-UjV9PMfFfe>;(Oen#A^T8Hg!&2lkrlDOf{ftMR1Tvuob;7TYKyd=92J_Yx z;9SY|X`Ln!0w9W>2J#2uGCPXNQ%c+G1;Q5%ecCrL)yV}~;$4NF4);HD{r!8yvggHt znPQz-Vn%{oih3Ax4)o^5-as+98PP@Ht}5!gQ0Cwe=;m3$m39E?IN20-<95?UK!X!u z8of&P0lW3E1(9J3zQ#KhoZ#+rl&9JB&(+x2g;L5?D&Jq2PIu779t=FPfk5XwKvQ!fRb=MT>A%D~%RN z)Mwz~3@~PG1I65TidGF`@z`rn6Jei(xeH^<+=fzjE=B44U(fyyW&vSRmqcJji8xad zssXKQ(oTVdZkNf8|ADAP>u;da%{FFMni~jZ1je)ST7`|3KJDa2jWU@$M@1T#xeNYvK^&70!1-F zZ>%9dQ%|i$xBmVd9O!`_7?W%Ivv*kreyoFKZNM)P_H5vLC6G|du^$MRJ?kgwO_i*x zhkzYI%R^(iSjr`wJYWH5VmB@TuFIPwXGUNiGpvY}$?9 zIHhH(`~%^g*h;tWDc zg07_aBK&OqJ`yOW7om9-FGe4r=Y*h#t8c;PoVv?X@V@?v)|wE1vt=b-wGJ~j<8X55Mlm&I2c=T z(4LjbYy1Nd<5s}DgJEtyyXDUo#y8(MZ?afxre`RF*nwCF`XN1J4n!Cw&KuHH+F zGQP1028nGv^1fGObz*yPosvkifTS)5{{y_z^>?*0tO(TEKMi*NcTo9%nVDZ^=9iiI zLy)rLV&AjD>PRRvt7k}Mp`?s>P1rs>-MYp@>KH>7WC3oV{gW)_+emi0H1&+R;<>Z- zl&_zhrvk3TxuT`B>JO!6MnKh&f*nk=m3yiUddlm$Zf zK$wqhp&Vp#%j@v8W59bwpMzgqK~GH*kQ5{>;%PbKXN@NwqUikXRbPUxzC-wB3N~b& zjiGmX2*v@p(Lz|?1xB5-(49cmMV^!mb6o{vDnCQ7o?zD373vv*u+(1gCoPcYm>gxq zKz07iGEnVFpn=b_DMI8~^yg193`XKO48D;&9v~Mys3K9EVS_cy7uPYUap}SP{)Q9} zVtasX7KZq)q$SU!o2njUyY1bw)`a3EuUGx~zPU3bPs_2F`bai6ekaVQVqE(R*Qng+ zoVdsO?g^uEsY7&a+6iki$bzFqXma&OT3mAGE^0QWJi|yf70BiD30y^NB+Y2_Orp2p z)R`4H(QzlKI(O7IoS;fs^mWchCdRK{tQmC5%0H}WVSQ5&*+?=+3Wugjx=BAv_O{Be ziawZccIx%;aO(S{pZNoL6t$vP$qvd~L0~ObJK*%o&=dd`NQx5Nl)>mAA!z`EI7e?t z*=zy#RQCfx-avmofMhW6U>TT^`s?^H)E_Ui0==|Et*@iOlF~(@m(exoaTE~Zn2z)W z0E`WMSu+4!XWyWgDG+!+R29^B$RDah({WIj2tc}A;D1GT{rjCD=Wn7v{Xnn+__DXT3QRv30OPsG85SMIaRH#Qal<#LW_B{@G;5L_b%JI(R)OAZPuQWc z+r!}+<-L*H>!xeA(h*bYH&}BedMWR+w~xDCI$daHC;hdow8VVg>q|sgs$|Biy^HO| zF!v9{&;%UX9{`c=5Ey=yKMLlgF<|fSc8MXb6NgNmN8D)<)t9iw?aBc0Jum2YDEL90> z_W0Xp%Im{*-A-A)YrVrOcUGi=U%YCZ_I0%hct@UEkR!QLgL@$dPK61!+734R!LNDQ zLAG7j;I7Y70VfIeDZ@?&U*E}pqIpc~@%I}8)*7S8)th-)_f|$Rrg+|vsJ#;HXPN;* z{^NsT75{xFXn+Xlx4rOBTqHJz6qpUu%;YZE1j4s*h{ik*q z3^^@@`w2;K_gC}NPJQhBT7R#5&{5ILzsZpG`;!RQ8Jh&ty0^D7xL1=*0Y8DPUD`68)cXh)4b+o4_u2NrRvu^EzE~@% zo69}%Hs&smy4&p!XgVmzS0_v-&CR-Aebd!=BJRxKaKqnjp8{~w|9aZQ{;nJ7Ppr+K zuph1xnzTK1N1Rz%R|%SragdZ16ck+S_^PtcyTQh8@!@7%&mk!;%xRL`=Z{CRobgvB zBqJujCiQ)F8t@W7daF8B07rJhrsv5QffaJbWLpsU``GP^hkqS%IF8^;G)lSAMgoeGwRnJ~+;Yz%0Ri zAg=Ay#WE5>kc_06kCxjpc@(xnW8cv~PkGlv*Z@XLA(FY}L3MU7~vX?y%jWgU@EU>y`c0%-pR8CZH zTX@GDar$EfG0u{I`$q*qk@PWN){^{CDIh$yZdsk)x}GgG?SsAL-iGlk%U34X5(i+938w2O zV;&3J$Cm~Vt`R#Lm$o^GXg(VeO4L7b{Mx?;HAW`cV5?C)IXyc=~LXS#Te6Esa^ zGh#D&+jj4liftM;r~=aHVF+el4>Tzb0)tDuAnM>_x$!6ABWo-OfEw{35PLRhwICX2 zyfs!KY}N9frnq#NzN$y}GyNU&=Cv{vm%Wn^b1(l6DC+MD$^N@9Mk0vA4XC22h7N&Q z8#Qus-CH+J4~s{}KJIQ-w1|8@S#_zDvv<<&u$}rjus+1quL-@Pdl`f~BU!-70LAnA zcLZD1hU6P)Y{>+8k1*gphDQsdncS9WlE*8Eq6@sjqVtz#IP^7&rv90*E(zR_hw3~C ztWKIE8RD~meGA-+dNLJ+w9n<(V6IhAwEj7`pp6bVvR(XvAlmf?&Pw{X0SDii5zurP zxRt;vBVbs6>qP%`Q@>W~*Pi-cdHUG8xtn5|3iaNeJe#A5n9=6?v*9$Y_-C&CVRUo! 
zfI_{mZga!SRmtk-eMaUTFj^bs@QZWsPmDl+kAwg3ez1tW(NR31yl*!MFR!yXa!qu0 zZvJP6&;NIK{jajR|M2_t5)4Dz=K(p862L>h(SkeXUoDb0IL~my--8E7-J*edQf*YVjHPVM8e8AMVbo{ z;_y^z8aWVX)K_`^-hmQz!`+Vusjkv44@|<$&aB&_6j{G-bZ%mtp>>d2$9nUh`jC|x zW9;wDsX|u@y{;r+)Npo}DZPcwt^wNOc?dq1$FDpK>Ne!-F3d?tcfYoae-QSaax|qn zAat)k(9~gnVnQ6emauQ@xvoGth>JZv9s>0opqqyxXJmt(XP6y{}id zp{Md*&+YfSX^_TV!t(T`C%*oo@x0pYS48o{?TZsk7uB)5V z_1kL}iQGDyPq?Rw^NI^k+mG1o!hO|2Y)Jme6w3b^uZj!;T#PqpiW4jd zDBUAHfQ_&4a#J5Dx^GTnCNA9zmm63`ig_gpbe*F0rn9=r%Ano?Why%oTRjv2y|Jr9 za^K{3+$c*K-?r3a_Ib%oa;-@BK(%qv_~mdflanV{wzo)=J6ZcH#fhN64FZh1_ZE?r zQZJbV-M$6es>1h_R0TCbVeMwz)f}7~%_0MLsoQTenCMQyx#ka)!uTxT z;BS@F{Cn@hcbl$Y6Gm)<@A%m~i}1(A(tV+p0l$wZs_Ka!h-`T^e^ATbas|}`v zl++J|?pd}GWtQdU1v1$dxZtgX8(1FCf|^8=3y!KvRt_;({k8c%uT7P?(?hEJ%gt_> z9OGjdgIEY-NrcT>k_>6mR0BTDW)B(jkYO~Td2qq3uEiS(UMSJIbP$jdxVv5jPJ+B|>yj3O84b_5gYV?bX3 zpCHwM-2ZSFDLDkr;C)6eUjg)VO$JcP3kFVilSvjy=>$o}5!c?E* zP}Zli~9 z03{pR1#;kcc2fr@QV=Ik_Du{`F5De#=(HwPzY8&+Aj2+ge4nfT>>ZtHd)Cx`=qA!X zRgfGKrm=;HeW>R5IFEL)P99@bUKwU@(l?^BJ@Ucp`^VUQUm$pn+}NwFm@3N}L>guA z$x)A!EkZL$P03{MK~W$!v@2olOhrlgpJ7o>V~0c6NV*mdUeh{rGK! zTlfLOM8ex@0h$MSf8Uaon}S4&hok1BWyE%^vnhP1v^U2Dg(BiWuk`hI@5+kG6DSno z(Om%|hSwk0W!i5wQq({oTK4*Dx%5F}&%vSW3&}eY;!5@teJB;15l3t_s(M#;oa^wO zD4u-ejpOwr{+b=N*qs`@WYN05ks~D4hbv6y6BXv31$ldssMfVW4|AJ{*R1=-{OwtZ zKV?*E_5H-_>6h%yMWDuW$U07ilkY+-D*}y^Je^hac;@%>2t0l4vW+9qK|RXgyT4XF z4NYzy_h|0mvi2+&bpg=Won!4T_ZBSLWFj-MU{aAj*4E4B>=339^t7 z(V5N)8vCb1(vXkPWAP}8qacc^JPLl3>y21}#K3KUlGrsu$i`}Oup2(C)B2`BoKU{t z={N7HqV}?F7QPypA-z*)M?=J2pbe$P1$T~ag|!B3u(& z<;%%Wo4+M?dEuxsOQ0+LvR_--MeM`U@E?eYgZ z5woh(#2s}|9Bm7Pc z-0v-dz1U+c_0k1}hY|bWP9VCu8}wdzduD@lcL_94&vWd9#eHqWtfE-k&-+&&}qg!=<@4{J)w~F&LHA%?~rg@r=`WQam zUh>3B=&BbV%Tmd|LC{|T3t{@GJc#c$~NYH!bgyMca(Wl6GZ9%RdY zEZxYy?-;torRsRZ5smi4$I>_z21A-qd<<6rla^=!1&pgSAJlz@9%Z0$@1uCH^sCk; zol=V}Ku4>DWf3ZDW+V~nI_UiK(28XmG8qPhxsR8sd7&>EXgGHEGs15aOL4~*g)k$* z7N2Q7lg0+Npd~bEO2YsOOem=NoM%~!M7aDw2xNSFn)=Qr73%&2Oi$a)WU&JOPRbpM1%;qL2Urxl(` zoWFl6cVErr!&&}XIy7;TYAqy87|TEjFKO+vFWYF39XCEjKWnTtqdL^#zP3Ns>!qjL z*=tUjI}L=#c4}+l;T@(Vx$)CbQG3-yq)d&s#N*dAlL<^x-4f=gTn>IbRguEa-&rfvExkN2F!z3jD5pk$DCC}o1RbXLooL~;iP%z9 zQ!Da8s|(PnGZ|3PJlt+Mh}vU5wXOOBS{9?Fi=cp*F_wr9^y8<2R*NlmWanUpbM}jj z^ci<^wZ^Xz_}RtE6v-Cb$n|d;EmG3=E@xpjcd~sREe)B^BDn&ZD@&!B;|qp5i5mOb z-dndl(s1yaV_PO(Rx zVtIbxYP)-a##38G#)Cub_bV@R+>vD+L(-%HQ+$~;)a1uc#xx6PpIHsJWIa9wlR~ zjypMQJFXut5ne?^J?Lbs!f7z$wGcpi*vF1U85(yER*|z@FlV1jIdLkO=zHAwt=A1N z{-;l)&L{z53w*~kZc9+;qv z;qvsUKT=>jUd4W<+bJ{VBk9Xv=C*e2En#VhX9%VK0BvE=;ewvT(~{wue!|A?Y1RoO zZEs&ljwhZf3ieZpvCeBZaVwMl15O7Ympze5XqPr*5kFGQ3TEF^6b9soYc*iz(A{OQ_qqXv};Mh#9&0io1)?fvGARL#XsugGf z<8#LD{k7GSl>+X9Zyq{D_VzOS2CPND-Oar{Cb#JwZeo8X!rP9)FFICGZ!I&nC%Xa9 zK|>pKR-Q<6GARH4Tp=*tuNiY@j)w-dhqzVKcgWtKBbBT-0*rmEW8YFR|mnH-6;?>HWHfrHRGvf zG*dctaF@3COsuiYI7F7 zOL_CJZ>9T$CnrDL>~ex<)2GGeptCH}so-$0k9&ZMe60c6Ay3kY=i{tG0Dgyp#z4V~ zh#W_Y_I>d*rjnn z(#X4U>tpv?Th%|@7@p(aA9wQHwwS|Pj$VqnyKUNlEz%P?aR4Pj7<;9) znHHHbBNB-ZUcBX;r-4>kEw1bDJ3Es&efzxXd54BE+ZV%Phb|*Dt*9FS`4ll9q33Uh z{6L(!=!QVUJL&4z;5@tCE1U&0S4+X~GZ-V;Hu7-H+FLYI^$c%w@q%ls>*k;)E_RmN9eWS+XE$M# zSxWz?TR(>;nK-W>CWI2!K%-MnX$rz~t!d6#z#6KRtvpoCYn4D@$d9kPsg!e8evu7r zUGo||ltsVAnX05Mh`ymPSWXq8sna!ScPwdvhMlWzX#H}LZ_gb7XTag>&EvbL;_5U$ zT{IrLq^P}3`0OFX(E^$bAwmTT#g^GMMN7_VOme5Y6 zCnA!f8JmRe=pyc7d8(CkNWAcllnHd`MpGi>H9lu$tkrK5*Z%c|kI}x8WFfdyd-|1< z&a>6_u7vA+EQ>!7rC{v@$9B>75K%~+EV>TLAehuCpVOk1|ZF~E@sg6Onf$)YjrmB zP_0zVj!}N_gh-kUwrOb-=mq4(*^Nhfx5$5KD7G9vn0?#e2O@#BRgjxNe>+vc$I=MZ z$}dzJ1NrQUfo$V*I4P}C_Yq-{kWs~LE=n?_g|8(%c!ei>ns>k4w8r3Jg^jZt2vm&` zXr{W(A9R1#S`a2L{p@?R^#>sKsy|TV-AAY`5MZ39?Rnu@UX!3S>S)8#t5-mEyr*y= 
zA!x^^u9w0$j!WW@4c#kOT&e2e;YSX{wiRM;7znFdVEr3(%{X#Im+%}OIzM*J zRa^W!fo|vcl7c7rrd0!@aEk%%A6g`4IerhM;9 zR^YhqX9Ln&=1l-bdY~1DG~^XhNIe>XGh^cYBs^V-=m+;YxE*4EwDF|-`v;pWyWNV+ zE+Ehi{KIx#kqA>TA6Ak17(E?8Qw8|ARa;!fpJ0av(G`6d1(QC%F$|8B zysvt>B7o9cW5W?0$D@YWuo&-H0Oo1)2F>_4u~QaWJXid42SqP)D%sQ3TsrmjV$;yU z`MaMQK+3ao%hFI{jrq?*HSdQrA~k0BjdWi0Y}T>d6)Um9ExMImR5sqFJk|XCexah5 zP4=v!V`~h)VyYW~b}S2o|2&mW2P279N@v12u|{!q#0yjJBaalL|r19LMOii+la*SRT!$^1ns4<%hb^f%U6o^6oh>VSEPpvl<+|C8ds{NFN^KsjH$y*f zGTepmu~=B6YhY>hRHMR99IK;;>r2MrBNu~{oo>BL@^Q33%_AAma)nnIjNA!j>EFBw z9RiU|od+sRY5KNAg1Bd$wOpe>*)|nT_IchG*FD61;mi9|t#XULS=~V#8ZCZ?xQi#9 zL2siqH&v_AvguZ|e!aInro=tV&FT6_cPDjaJ6w~CQ9b##Jkp9*`2wh;SJ?l3Dl4p$ zqXTrJU=+@Xei!b7Ir?xHa%@2UOwpb_b0&Vcc*wG(EbYV9h-bTRY<}jZ%(D9U?^4wK zn`v_Yy4e&Vn?od2K=tmKLi-Uk=|&LINo#xKXY-_{6wdj5hs@7OX6AeplY2Y7^-hC_ zGE2FbOQT35&wfcnPR=1dE`P2%e{pRI=UljrP&`5?K!>Ri?rUwW?!6Jyrt-BZ@FvB0 z_hjXR?Dmm`cx~~cG&d5>F$wdqIRK}jlCt&7fWyUT>NS+ZoAUi1uim$=AR6aS+}wHS zlVVF6nW7!pmO2Lo%4XxOX^o;5gSg%vo6$ z+=VgJCK^@F2oj^o>5%!%d3|4WsW=@GEiF$JT9)F;y>$!PS)0Z=i|FXDR3HC=;G2n? zu9oR6VPY|Orc)L(=hAU7kX7n_c7Y3cAmsd0`k`7k>IWc#vJ?W_WqA7a#>2FIVV=Rev1&_*WlOGA4u}m^PEtS2k3t zsn=l~ESfv1rw>GubtoyQkM4T|1$zaF!)@)32s7d9ef=Kk72#6QRX4j%_g zBjS!bwirFXknM;j#e89~lf*LUM`@?G7pAWBINKw~Q9^Sr9-v zo(|?=sTYDK_SIL~zVSD@*3YuN)UmzpbS=rP^cnX>eW9RT26at^0siUw2-}EzX&iVq z8WM!cNadD$ScIq2aZ)>8dBIKoMRfLw>+GM*&VOj($w}NF%Fg|4e^i}6Z=)XzNg-OQ zIHXyEy%VP^fma=L5;X6*QX#Mmbr>|{42d$+1GPmzvwR;U%=gnZJp(VrH@u&!MY3XN zeZ6#TKZTS!q|O06OKODKE!XPy#q0g zXhdA~93AiQ*V;1^J3YFS!T;Pdp-E=>YM|Mhuy2w$TW>Zt$IG#?f!b^Vexfwi@un~J!hGyK)4M`Wp@Kroub^U+X`|^0G_x=BoqEby|O_)wdk)}ejjuwP8 zl_cv_6cQ?l81oTXvM-@1qZCOcd$NpOlC0SoGq&tAmNCrCr{CK--Fxmi-+LeT-0wY) zd+zV|JAdiXV?MLI-mmxT^?Ys5*M&1SgZ*5MYZ2sk`T2$UvCUJfhkb19hJv)$h21@$ z&Rw=rY$Nby6%s3H7#~Gbb>;pBp06G}k#a>*=j=Uh9?1JVg}Aj(=Lzr-CsYuZy$^ae zbQ&-&lUlc)DP*qd-?GOtxDYyTj+i*1cWF&dlt+|G5>mHGQJ%lT%(5haQZG@X-{Qz$ zm!F-Le{7o{^gc3aZF0w2#Ah1WTYXLcILE}8769(YQ*R3%-!i1{(gqqclJpV8**)f*4&$2r^?PF$`@g*sV0I`!i zp)oPB374Ex?b{-{y38sg4z1jDH*9As;ziIwV*~=x&!xo=x<(|fyRnqu&v6SzI2?Wj z^SX}gTe;so+k&~P zKlI4Cc&w^J-Q%@gdeYIXDSiAT++x$nn`CBYX3u!r(P38M=;$n3wQg_H(NKX4#%Hb} zxDjg+)pVk_`g;0SJ-D0>I``+CH;q!*Re4`_TNi}lvSr`>U=GM?;xr?^!J#Au4+ z)o3IBMf2SXqaHQ~8g_oZHW9RCX~VNQ0j{=_ypOH`3fqarbf!O6aE!FF2zfibPUf+6 z&(iwrTB(Ph86WWtiEd&A7e=WZ)Z1pznu${>#Uj*{3cp-*Bl&<(6-7f#z zEC6yZ69R!1no?6W9SDiuG{l+_EtvP)4AT#}DH=NPc3SU3b$+onvG7;RFP!1orzJuB zn@*Phdp+uZ2psUD?P(-RZ$P*+VV9}9f?hyXSIoQI!nUYR?H-z+_UdL=$*ZTrpQ4{5 zT>rRMHr|Rp1rS_3Wp3?{tNw-%#cyWL_iNZ15-Z*lV=knB4a`5;)2Sbx{euXtq!SAbOM}OWA~>K_=A*s5 zX5_96LnoWAR8>+(+B&ZMzCuU@+HpWA6U`!hofv@M+ZH3dGXFF%e|rpE%sQ5_d-njN zkmcZBK?$gEzL&TsH)RYzAwd2X;&bb0?$D^9EamvV&DD~(BD=a?9!w9$ zoo1mE$2VR&A{Wy$DH_F_V#ht`;CXv$YI{!Zy`gy9a|59~rX7zEyf+LH1YdVg1{hiP zxyQ6Ul1YR8-Y}yL-@d(bK0hai-dwRxmbdq)f%i)d?<#~HRG|o!=|f+y)7@Y((4h6> zob~_n|HWkudIY9xRzpsx^=w6EFkP-H{$7i-QoWi{ONQ%saG{GvacIXS+2;M4osu8a zLtaScA_x`#?wky-GF_HRZJ;y+y_uAWTGQWzein4DBRx4#%>T5?%1sg42!aw`tpK#c zqIpJPsS?hU5}899BMI*h7U{X<-un1;U!2U-Am*#?d-N@Y{qA;$<#oErFR`SDEZsFs z+c-K8R(D^9+4cL2Z{p;0?k9h_EiS_luAE*vI2{BrPg_8A<6*Go+C5`&VVN8bbif8G z#XybP*q?aZCeZiDbJxkB-IDeU)Gw$n06hX7H#_wcIozvsKn@4}$PIS^e*&z@3|po) z^^>moh7ij2TDL?i=uZOWbRMF1ys^V4}_XUGJQ==I4jQp)r1!Lzhap? 
zrUUUV=o;L;l8tmsB~HLSu};6~s#Kon=QO3Zn#h=k-nV|he$vqrTg$RY-l|^1yY6`N z36t*H+$B`V^~;6^0kc7!u4`0u#PXcl+xEp_jplTUB}e|ZG5LS?xqq;G==Vwe|E}-C z4SHkkPyqwf)H0M9UgO-+>~RnA-r|>$k1TIrN^&kYKEed7Lnaf3v^ll`Xu%a{IytyV z=a#&Qf87-sGjp}ICo4bu_90BamRk~%Z1xXUz;Vy87#F`G?!N^6&mEKj6lc?mle`oE zHo4Ockj)0_`+!M;OH~8l=wbvxlg$UZuT7qC_CRw__1V$)NP`Y0FyLBjKE<)7WU!-{ zI2Hs7{l}-}$~C`!@0OUyGG&z6k9n)UAy{1Y#E+^R2Y<|sr%o_ok9QUTx5!3>tm zH-up+M>NBosg;COWpe0+6~yeC3i%^cV$fW%!O51K(btqHGWJ~8Ed~9|v=utWrMuUR zJV#i=^_ZLnPyoJySw(hlDQ)wJx$EJS)aCjm_F3lZN~cZY=K^~H<8lAU%eRwS5`|hQ zbOtDbQ&xf=UoFm~n;h;+xA@w2Kz*xve_waPrkfku2kS_yDG}_)XZl4EOFqjpJ>2{i zuo5ji7_H=msnC>cnKyX6f}X=UEoa)MlU-nysn6Scy+^{1+iCg5p!Ggsx!iFyBl_4?Jl64khbh155?+(C$?!#{q6@DX~(-6F%6+PK3-n=QlEEc<%(-wJGrm?^dbM*OZPwT#UISA_pcDSu-Tj- z$hLv01vxf^n^opn8OQlOKVWt-i*j1y1TMEKzt?fVShLx)k@u_kKd9t!Eqbx`K_88s zK?lw?dN>@|x|5snZ!ce0fr6l!<9HlcS|)zv8$z$o8-pUO0#;tG5st$wosuT#Rg!0v z(!U{`K*3KH5*S6=;TA(d-Ta<;@c?z=1o(t6^&5iYozpNRPROl5&B#6n z!w1mlaTp8s8tOkX;S_vBr1TR)O^Wc_PV#=l(TvE83Ju_vx5IH@J4fV6-p?d(XGqz~ zlb0^h$|vhdegs8#HG_7=*Oh+4;a6l%cp#)$H@CU3aa!rS)OX0VU6;;Kwqj(0z% zCXP6kV7;1>;xx0B5QIcRD=sJXF4&zq?SI34q4L)0F_SojU~w4(f30xOo5l}zSwD_^ zaoUu2O?AbwmE6Oi=Pew#>eG|_<)`shUqD$6G}z1dJ5Gjjs)JHarD5n~9-#ac*TMymW?DMb*2u*OYL!$6}QV zVllM_gM#{>6W8F;%2iktTZHMX>_0bTnrpsZ)5J5_03Z5UzgI=%t0}&e4d7-DIl zKcgCn#5*vLfVyLdsQTra6Kxam!^cu1pr^H46N#m7 zy9UwmF6W8cG-5@*#75d2wGX?|J>s449^@))g6JE{hY;Rt&rO4z5D0hWNlP$9@(mHD zxeYJPyaXxNwyNj zv*Dgt442fd-kKg@!Ru%(jBtAXBsXh6GMKnh zoj(OTpd!`*NXM_VSIM6*3Nto!(?1Y*XzT5lnx_vLOMzL|N>XNi2+1*xtw(n|3mRCF zi1GKa&#AZ^W(szrq3ojD#B9{t+isiK5BIzFHeIvhp5vo}g~O`u!u^_$sgsfv?bKjc zQYrVYS*&l0MbRy~i7><$u15s>X_|x$^M!Eb#ZhB zS2A$M%!JBQcF)El6f?P`9CO&ORxkPt&Q`m>N^c93xSx)-OCi@0Ad_boXhfK8(F7M?BGzFFz)As5 zX21kI&HMrM1RY_(0DqH!@mQb^htH4ihD|wj0Inb>Lol`rP#KbGhRR=HbJ#N95F}wm zD8tltreO*P76@Fuz`_*5tD*8n4v91}4dyzT1<4$bI!r7CK+vEs%Np6qA=C6#N2;Oj zmWri4Pcm?pcsU2h18wF7cW*67f7J0yusc{=dT==RN%g~9+h5sh>(AKB3tOtitZTbF z_%ynGY068DrHDUD=dB{%N#th+P@h~;vb@q|`bPVe&cXcMGuyhJMIAiUSMqMI0mI2G zQO2KQYLy^b2!iYRS=egP7JI3&w~CP~4zI+>tw?s6xjm43;a=sDH9D?ijso5JHgB}` zUS4zYl*jG`@yHVv1T~OBizVF!C-YZX=_dRqg>?iL3Yg>oQhjI*Xn?4yM~%0cva?yh z%}AlTMF_|@Dbwag#Jc~zmHE@H* z(NwlPvYQ1HV1BSBH#nZz@Ut4JXZ&7z^jk!O`o*2F1M8)-8it5HU^52XQNe)R*r3q- zx96<8G#?A?hKB%j;AK>!vRR~lu%bc?Z%v14a6Yjm82bB;J!uD?xLqm#$Sq(ld0u#E zchBrN?y^VfyOX?Ss_01>mJM60y~v_uMI5Ea%p{;l+w*BstD}r5HVQB(tLgC$$u)?L z1kn_%B+CZpjl~Yfbd}uG*wgOXL9HA}yI}J~E#X%e8C<@b!{s;!lx$#MaOSDCeKl(05j!c`b+XJ_??C93yIS(#d3s+lJ5{-2 zMDe-#hR4x)FMH9w3u@OoUR-=Sv>~8VX-u5ARC5F8Imr=q&*Alz623S}rH9kQSH1T7 zK%{yun|Lw1&g-;WxqzU*;c}6I??<_pB7SnYUYk9x6uzkul0+HsY3lWj0Py!q16D-- zS@l%DA^w}}pkc!uR1WW?a5|ysEQzONzw%uPz6@l;t8}}(X5X2T%B?+1={AmY!zI45 z^>(%bmUz`$C%Q&oM^CXHzq@*T-^mgGZq)hql$$7|doUw}^Gf{?=xs#bAqds>ADknt z8VkOhdN%ig{;7(qixCku3y#rux{55w8s&pU!J89?i(f@|^*I#O-m5k$H&0w*Nd596 zSSkN}6U0b=u{6KKb^7Nj^-p}%c6TF25GP#`8%o{)qwvPjrCIg_MN(pKSfsieTxAJFWmlmVJJj~ztL689Z%O(Edw>rxI&?Qf- zVu`SA=_55Lf7P0MCEMRTP;ykLjPt;-4U+7h?yC}5ckr^*<7SNJ18Pb6qc6vHy-rL` z6aEyf4c5Q#0bCnQ;9+lS1>4u9ubQ99E6&M3_dvin5OPj2isu8aATA5;64uS5TQb$>VT@w>>BzYDU)g(NC4M`<^i zfzBk zD5)O5sBW_55>tQn$tiuhqnh;lmV)8d`kh7zz9+`9<3>im65oZtLYAZdZ5o%r6306+ z)95##Kvhf?G=Z5!^ImBj4Z4}&7PkHRj@j6Y=PvPV;B79+;JfF)|L#5kMwB7$(_C7; zvCpGhw#|Z#rZgR2mLA-R+n7rf%=aE*x(UT4x%)3@A9~SsB8w;u8_h%^Ki2(?r;YhL zo|f$5@GF_!hJMCZAd~hR5$*4yRR0`_#FIcu!w%hbVwMrw_@lGs)w53@6*fzm6zV!z zs*`nfYY)v$=f+<5X)D+$fifF(B2QdR#?kn_%#Y6Y!B5UsaM{^P@GU!A%)Pf`OEtdI zOlP5l)UgHCi^1c1%@&oq2TzxV?4M06pq6+lQY|D-Sze&8HMz1RwLprWxPEcp+}oDH z$5S_PzW`Sb^*@`qe+j2t9_`!1DY*IC#JFvOpCV?`)stxm?e!VIIVona+4gH7;_t)| 
zhUkAc3Na*mlTbjMuWcDY0sZ7^9a?Z1CLpCPCdb@ffWD)0RnJnuhwjmOuMNiWjh}a_ zGH*^i&TX+ox%Es|P%i`ps2}KHttggEV!Git$ruOJMz%FGxF7OscHf|ZF7tSsiJNpV zz55(RXn4409YLqIbkb^%Gx5R=lU4T&XRX@wCY_af1}g`4QRExEJU9Q+S!8VeIcf^S z{deJITKJ?O%lIoo}le(mcP9JZqVNPS(U9PO!t*dTeY1^ypa9I z&EN1w*Mgoiq}(xHVZDnwJu#i?GTFsAkpx(~Flf?c^rHT2kZcQ+*av2hMEwwi@_)NB zhNfd-z3^a=VdQ=RjjA>B4osJl7NtE<%_H9s`cfIS2NPhOTxgQakw&#?iUO>U_5(Yw zz;2_#&of1mQ$MDvLG~GxiF{n3+Q#L);#;AI?BtBLch<1>8Peo5XBV*+mIhB6fnrrY zX08X>^l~BmE?5<70I&4~3;k|WiC}228N6qfEP@Voqf14La9MGs#L|zTGLzP0yRypb^f|?(%Z}(1d0~Cc_1p4&M5=`jHV?&|*-(n$rD%ze zXlFK2K@<^&%kGP$kSFE};*1{HW7BLp6BdfHPK|gYr^+@?^|PPRwIt$~z;x^3qjrOI zblF+mRi?*{A`Xoqr|h_sfG`L!m_-u+b9t-W)62o+jq{vA_@viqIhJ(u)U$^s8}CMB zy0@&HJn?W8H2Q*U14!RTVyO9?Fy0^OonFIjnB3v#G`UJE5vCEXyA%sX?qJ2T7RBNF zH7hq|KWNrlJ+z?ucI{g4O`VUmwjgGZp>M%>nvN^>4y@LuDS3s+lY?3h?#jLwC7CIv z&s%yEjIP30Pwg}1ub0(No89*=%(Ic3bUESi;cj@10ToOtUM~nPhxhD@xQ$DkmGZn3 zrSX-!5$cb-C2CBZ@TIUqHFvgAS!Y^DN@SCSA6ev_&rh344s_{2>LX^zOj-HZcugtz z^HSI*=k7}7y(dkCns{` zC3dad{mPHKFe_b*CsulOvP-u2zB&YQPircW<0EE5d0bat4q>hP{!bnCN}q|>_B>DA znYa04)zQ&HCzpUK#7Xe7!%@&>NBrg!>W&_~`AdSK%URhnG&HD2pM`6HD`=d0!QDY} zaPFcGFz`jR8gDkT|F*g{ox3_~QB^Z;ScIx&p_6q?tEcu5qI|jU>kb(E6Tn9d=2IAS4}i^fY|(@4;YZd;M|u(a#p4LDt<#;e+{o89Ix5L z-m--GFe#kLlXkcOKzs+IyC`l*;K85& z$&_(bvB8{Ll-fWBOK2X=*%4K&9tZ23gXlHD9O>!^GimdrCeEEZBZg zqx{`F}V2SY=1WOeIDFxBCyV+~Fj{_5V}Wl+af`-YI2rF}!d zRdU}DLoT@VYr?EhEFBJ9Qcqy+pXVS89Hzq0vjK*O1`X*DC+s~D_FYfV1v|(U!kNM& zlEW_H3w~co2RxtSO^3dBSW7t27GRKy)cuCY@=}KH8{l5kWZ1#BFlYvSR=H$BWFp(= zbi3ynsju+-pdNF=oB?u-Aah6{&giNG>u)`k21x!L z)0brU*sDFzA6P-S6?15)Ob;4tQwnFTwI?qfJUbNYNE+xuF9F5sV z>)njf*@nd}R7?%Jp136#R_EY`-2{goQGRNb<&|Ucd_#O#_|YK1F74B|h1tQ&Up#!! zD;CM=lL%abi;bSKS)90%=}J&I-l>`aw`9~Z7hoMwt|}$XrJCtJBfziK)9CRK$b^I z<2=@fum!69Kxl**`?{gYDuM$QS+%SXq`%Pzm7n#Y2$U0L%vl(y!vK8Erc=Mv2Td|i zhnZz{SgtA8O?L*vr_&*DssSO;oX!(1)Z6?2X6rdu9UJIjlr^~gEW{XU>BJltlLc10 z>RAQMfm&hnJ-L|^JHT&!jbpii@}%y3xIbyefC5bdGo_+Xl(Cxi7lZT#)BlJ(a}y;D z(95b6Xy^F+8xioJF`5NXg}&_!bPt+2LnTc%aFn^{t`rYYR(NZAaSBU>+yNEoM0W<0 zmy+kl$;`0o?J!uC+Vv#AFS94PfUNLzoosa7x`8k|?))ZLsEUg2tUJzB9;idhOeSu~ zqrqvbU0v1@tX)V$MCgv3JEW zi;PccJ6fgVq|FqsiV1F$kbAnk-Pq^(@>E5+N5jN4#aS5FrSn68rE1K3&WjBzqwc=f z-VODJYgFG(LZ*OXrhN&<(w)bAy6bTJ(Hy#E+mqc*oujLcABepqIYX5;SW$NOp&Hpo z)4GFufVhG!%gH8L9q}0d>zOP}yPV3YpguuRss4Ag{;E(5i;AHPhY1mn zlJB@xukP)~y}WsF6`2QbL>p%qHNl(kI9lZ`GvD>6*JF=+K5!CDdROpZ63v&jvFU^A zzHwhL_NU>_O_7-_X7M|`2Tk~ztKB7ana^otlM)A($dvQb{KRG*F2eblojiy_r6KEnX)+Yj;+!dNVe&_o0?- zsHE7Qy#hNC3%KqE#uX-!?t??HjhIr58_X6;Nm6@Wp77~shbNCm>idZv=J(KB5%I?O zoZ6*3h>aUCMkU}Z!{A1On6qDrH1h%l=VzvqMZD0?))OByRPH=?@lxI$C#h?ud|d4+ zH4ljH2oGU4e}2DPWq2o0&b;FO@F3C6qaKg?L&h?T+(uzpSe67>v6iPAh!!VT3Dt zTnnY$__u=s|EBT!UpOf%fo9|@As<|{3LPpXCbPp* zNZguxzxy02tOV7H)7v@09&vEAo>ij3H-zMF408Z9YDgyYvE4l~J~5z<1=ui{LgIKo zfiIK1G(XKcGj}<&6T}-h0XKn-UX!MAC;kh&AGCXX7d#J+&5wl*Z&kn*psCAWZb^%k zJr<7u9WoC{?aR&}%P$>L%=mb4fddnn!f=e_@GNmaw`JZo-=so9>$?DkU~zy77MZ{q zU5o*XZ3{BKyyN3kr!oN1p`I96EG8VVg zu=!>qXH{xg<#V_MjpmZwGd|xtr<9n$U&w&gO2A8{nE+%bozceRhqbc-GZJt}7rpji z*}UURl9Nj;qc+VoOb764dDda5qs*55MsC)5qgw???*eyb-#=nrF|s64fqNwJ7dSJh z3C=z`l|#f#$OB;X^_|=t5t&yBbrV_Fr)`KcvqMY7+wEUR-s8!DZ~_Tlt4ELH+-kwh zQJl3z;4Q$RIqQ8xWQbtq%-U!3WSFz_&_mk1rt+A5+sg0oQ3RqHHY`deq5&pcSa#|M zzHs>c;FTJ}gI-nWJ08M-bA&YGAX>xmQ=I0A$=#?sK&DR$vx3>_i`6)OO~I(v!0#&X z0bE7^z-!;t0hQSSB$6|}q>5UrIeOJ2U=qZsJL>5&aMgGQhv$WD03PU7ltMqm5}v@! 
z9X9E|J;Dy5qXy!J8$zl8P$Fc^ss+t_zV;o7#&CS0K~Lr)cw?V$2$k`LqF`A20YuZC z(X8Wzr;$g!Agi=yf)`vzcB4X^`{E|c;BSx(4o7k=)Q?*_6Q1xH3P)kTB$9C(Zhu!U z-Usg1Fcu^&7&16WEuMq(CiTcd;ed@)x7mhV7MJlgyg+UWYr&r9L_mYubX`uEYTQf^ zY5T8rB5f*{9&rDAZ`1Akitt4YM-8r>BQKqvUm95feaOQNK=-jJwP0T&2RiCijI;Ad zEyJuM4Gv=LFf5Tt5`#)oqVrOhm4W=Xx{&x+U9bXpKKeC>NScs-34MK${tH!bUe*K? zpb6kR@D43Z01@soCV?ZZsKzWwo=ie^=>LOmOvGP;obQnIAO0DFcY(9r2f`9>5CqA1 zAlP`O$e53`||dQ0olyH+AJ4NeVYd}ci$FG!X4s-a*w z{8}H8jQbqC_n-aCWf`VDLv=yaMP5l3-*Na<`?$IS2rFLV7z^~~Jf`X} zWIIq;z{DQ_2tiaLeO6;D1K`W()64MF3V6iPGha9>z0!us`8DiNA{S}G4ZxVg{zXO8 zu=cxfn!${%CYs4V2Sv8F_uE!~$9`hC>WoFn5>D10dM}Rlw!RK2Sfd8BVF~bujnYRjNq(?|loB1w& zfC7E&!IN(~pbp!U;8#~c$56MtjI>ZhVzw}-+5|z2lex~rF^jt314u1^&m4S*XZSuB zw%rop6=RHP{o(tYsgo*5GY|&IZ{jowHc?v22Z^#)kSO~ap8R8;tdSL4@|A&XaPAa_ zkCH)Br(-+~0Oq~zgUd{Fk_SPum~cbW7W{R$3ZAunZ&H=`m*8lagoFkGLV;K^Ezs!E z+$HidUV{dprQ_J=wci0733tV#rVO(l8XO|oVOb)(zae6(2#(YZfZPKF6^`m+xx zh$9^iv%ckj`O*P3#2FCPWRGzCaxom?RfK2?qa%EIa>PkvA{w^BlZ>WSIbsXg1Q1|9 z0Y`4HxgX4Wb=ks?!`NnS!R*n^knRzRZm)d!NzNP0U}uRGkZLiIX402+wCOvf z867-0kBgiDI8dLB1c6Fqq9pqUyOaKaG|=z`w!Lm;`uRHyyFq&E%npZ8n2Bq+5e>pZ zB=jL039p8AJI{YZ^isZtXN-SQ#Ws*SJ*So<)hyqa-w>0kY+^VxcmcaS!=oQLd%J-? zFuIi8Jx*!Z_Fchdk?WvZ4hZ@x$EQa?Hk~HS^25@>qc4_@|CmqTv>@$OJ5Ed_Wg*#t zB|Cwd*PMn=nG9O|Po0?8Xe-(K*Qo9Q6ND=us!N4_3hJy#7X8>op*ptn%)Plz3=IV$ zgZ%VKod^a=*cn$+_5$Z_E|w!G z_)DFbWCN9u1uAhEgqQ3R5aSNX=7WTo+P*mw8YlI;;kODroR*B54aY?F~mQ(ByItLiAqMj7$xZ@O}zZ{dp!`4fHL?Z*}x z_C`TxaYJKn5bd*SK1a-?ip<^x;NHCqIgS_b1P$+?LGrK*bhQmO1}V%3P{qM9Fb0kj zhG@^6;T_@g3#e}hO)zE^4L$gSUi9!Q?yW2{p@N)~Ulkq@Ppm`=xW^ebDOw&dFnT?9 z!%{Wpsfq|CtntR64Ogk)(gxk|wD|M8sAJEb%T4A*Im~GkN!wrCS^X^Ejom;>Z&O`% zh<_BU|II0~o&P8R0HFN}v2)gnlf>4byjHGARLG}&0d&Fh&)O~&#((Nr@ipY-iYe|N z0@VkvH$Q!Su+2AB>E;(=vcS`4kE}k;V3{^pKvW!7-tY~v<^fb)gqk-3=!7UY9k=)h z*ZB@N$i^)Ou=mSM8iPK73$~yD^$9beyOb6UbR}k?j$Of0=3^6szac=)+W#o$CE$Vr z2$uFRfDNJw#Et!H#Ee@gAumI|96P0#>iQ}lw?8?K^0f87tCcgm;ko7t z{JsLF5-qp_TK8t&eO@m|?)DE3@2LBB7pC0aFA$|muCe+uEgpbm8Nr`Rq?jDuyVA6b zFh@4}g}Z@~*9pF~SD~M6$|rtsz!P-Y>r~~p3hoE2m zTzBZ_-wOhU>}#Z2CJ9g){PGOKe?Ec$hLi9jMi{;TmTBB$8ADASreO9^0m*EFU)p>_ z%no_Y0LjDke8B1z>81@eEj40;IaUzV*D8UV*{*|-bpOVQEf?HJK zXwN4g?VhU9zf+r!&^wc`y~o=vVw1rN0Xqx0=9v}@*YQGcyB_=TS}@H6UP(HvmWwiCYK z=57f=%;OcXBJ}bhhP{ZxlRjboh~r44u=cQ##hk!>pkT4``$YUv8S%bCcO`xXp|dCwmaFKkuNeOoXAw1F=HB*s_-LiWp_D?)Kr% zCLKR@)uEJE;puWY9n4^DjvuVye%SIfawVbpfT|`j?TU$Lv^hGU%|dV27U{?LKCcKr zp=uHnTDx~kDAqA3wO-C_0f^7?^OX*LrJ?%M%8-J=udUM?%+I&=;qPtc{<(C1gYB?2 zq#a=xFym;f0SlB!$#(XI9J*_LN%(NEH4;AM+crhF@{jMtQ)-HZp97O(=QM~;P@v~L@{cYLr z<(KLb*KtBh_IBb;X}C&aAW3338!POaA@x2#S9E)K=H{IyCqIB%Nt-*=UZO9mJR93HJL@_aL^nw1VeA@dOi-2PZXFW~KYtJj} zCKy(_h$U@%6i9C`M&MbFED1tuY%JZ(9R3n(FR(MSnnVk!y!AqaJNm0!X&m1r{^|LdU6KequI`3!UB7Ko+JMuQ1JuQd0l^*3D7EN}_$96zna_mcKB*QE3nOyjg- zVj_E7i6ZP(m@1TCi44=tF-eN zM&D0PpWpK@r#Z10TMR#bg~a|sx^37b`_v02>16P! 
z#Nn;z?2FpScQHMOdT{FJ*DmZlmFk}=su+_jXj2r~xmxx*(8q=N)?{k1bBEhNMu01E zW1l-eBP6qFw-V>__&Lu*NxQpke)7^MfA-Qb+9rb7cs=u5Ig3g?Bd_l!Th5*d?R5Dc zUIzWMmwE6F@jIjGf39M`#myX|xiGYuPx^_WUD#?&AZg2Mb@Kx;sTTH4rn; zh)Y+`D`XDZ#?S9nxwTc^H`Omd)`??9I`BDOaTldsUdhJG#4;tdQWZ5JPaiL zos_nIX5ZzXtohV2uA**_)cughpDRbWbW;U~tZ%6ESQ~X;OqM zhiYzPt1%ZYQC_H9r+Jll9(nS*Vx3dR^T%P=3kndf8EUBe{j?z$z-WWtv2Q$Z#BYD%-UnbWpZ_ac;={3oI0 zqvp4^AMG03BC_FHY6M}@)-m9`!G_-@r5b+CiuK@D4w*#Ep}6YdIW4J^~y@kuaha5T?PMEUTVcL*P(wH8X3 zy_%Dvi6k^FX?#wn6(>1QX3H(`oGAstRrt` zMjxR;0l8>F_k_~{*yfwTrf9TN%2lbo)z2S>H*$k{B`}j=i|OJ&%z3UZ>^krbv6iV@ zsd^w-d@lBEw!O`ub z&Nr%lhHL5G@Al^Zu}-T0S1#~O99v)kK0aH8nFp<};5=;3aE+6)75DbQpNMN6r?%@| zj4O@btg7mN)+n`n^)}j}BxEqW@oS$b4Uql2RcWG_8jh>-13(>g50v-%CY#UPe*G)# zenhI>DYv0Z9&%Ah>&_#*Ff3g>mh)0wp6a!=jGP@;%2BA2G0IU^7M)6WKl`!1?eOb$ z_e!>H=nX+8#P(<%$rS0Z;K!601%-kMU$c?AnIkGi8P-5jpCh9lLcaxUt?_)e~! z&q%Q!o|mdw^!Ia*hIb`{5oqWVNGL9#HqCp<&zq;cNp8t0R*{~8+ugQj%VzgaXhnV) zdh2en;aCzfQ7Jr_L^BGNqYTwlZy^Zd!c|#0Cfvc;dGv*lj&({%&5nb$g6})Vq6p7}>P-23mM1f4bF0v(EqTmOiQ^&VNZ<>tS%QNVDc6Zgs%rjku zfvbpqP#&fl!-sbQbE38xU9*(W;S?|D$_qTcukB`tZ+i=EbVA?jam|WdUH5ABjh82# zJ|I|oyfr~nqt8p`thp~G!?%Rj-3rHih&O0I-FK`^CAP13V7o%toz(aa)`Vo6w`MjC zeU~jy!%{9sQntzkUd=)Cj;`VE`FL2iYTfOq+eobwTRN7a?6?O)l>x8WYsik{W&(Bc z^~VOm>zm(b)^duemv^=t<8lAg^fYSq4yDam{->+lK&ST+xL#r}CpDL&P)VeZ5G($0)GHRsoVqFd`D;(Gp!;~;k#c9O`_Cp20qw&O9ZEqKW$CFD9L z!NuN%3T;+7XB)AZyXu~`@w&YqiA7$|6l84?+zyoR$z-~3IYwlT=nck1gv%$@$^k`% z<@Uzh;_ZiTvL1>|YF}*ruz}H)5IOVa;}!ui?+q-*ASb2BoF&iXt&mWpC5Aq`qI-re zV0LNFEM$Leu+h3KmEgK4bv@Qwu8Sto5q8{lWJVwm=pDR4lQ|$5#dWgE^9Vc-!MKA% z#&MkSm))87<1Y4W+uFYFCE~MATg1j=Ji@+E|5K6>tY#;Q_0dG&k-;aK={w^MZ^$1$ ztQfkd?*QM(^nS^WRdGYNEML#Sw8)6{@+d|nXF02RnHE=p>KdVB`0t*BwpzSSeEj`6HplLPbyeN7Aqg;%)a3~9&T)IYDIdOl(F8g3l8 zk+Y^Zf$aLE@C*Fz5}IRT!PD^d6}O*tb(+_6QoYhYmtDx8 z<-UKe_Ve{kP6vCgJ>$;eB;xH^x0r=Bjbyev)0%==%aoY;=;;^K^d;j-^w5mt1xDQY zV^Vo0q0cr31Rclg@9L;T93WJ5$!x1~O_zxNoSY z3tb#K_tMd_az#TqsT(x_0#jvsD7$^~Hf=wBmz=Qg&(sW(efr&r2*G3)@( z*_A5BJC$t^pamvZdD42I!5_D|2+hLa*D^irsSWou<#UwV=wqAj=j8eBN1Wp^91N7P zezKC>Ca6EPPq;0o4U9-*109Djnf^FF z#Rvz&wzh|%&rT=b_wus|6*Mk8Y5L&95y#_us9duRlkM~xM$kK^E@(IFa-keZNE#P@ z|2!}K>3ICQlFN_1#JJk*Iuyf3C}@b6#ScAVESsF~`{--Xe47HC~reY+WIj zRwNYh(bYb8&7~s$K9O^JLwD`Zdv6-bzR(j|bET12X#g1pc1L2@xL+BkCZ^IPDf9UO zH(4%TZqIgSXiNwNDXYCRe_a*h@;F(NcUKvho+Dj`!p+c=(zF0QX7V%pj1;|FGhJl+ zX_*I(d!AlZzd%#Y=)H7xUHTg>6$JMPNfhpaww6e?NdQ9CM}x z?Dq59MG37nyIQwhaMc~Wz)gz-)hkb~1dIkaeztRS3EveDle=^BTQtSV=AEg5N&$r! 
zXMT+iK4$?f?~O45F6_BV~5_!4Zq#wBw?lE#Cv{&FeB>-O9k#Ftp+wvo?j?WLd~_2eH2l7Lss$Z?6E;t z&7-upyv)~k=1Y0M8vX;e+kbhqii=4fXjs$-b=i(l5G+2UpX42r{=MG(U;Z}#+Xwz* zB)@7T8|gAPA8A-q6ejrDxahhFROb+_?bvzt8I%#@@~e!`R~jaXfhB3+2fI z>TBK#Gm`(!x^zRy6WNamwf6F_uErKdjj6wsT={vg)@YZvW)deJV}tV}XY&y8eVbUf z(lf@!($mJ@C#3gUAl$B6CaoUmaml%HY*nz}O2lpDN_6+u0p$=_#Q}J-iHEcb-S&Bi zPeJ$1c;B|iZPj0zeDt4eilB(y?Xk$-do44fFp#$%M>|=JVN~Wa)t*?uO@tjx2iGU{ z)N_OyYs!nMRjqE&rGg3t1JUr^G7na-yKTcpXvPC;6T+7GLfB3CnAi0t6tZ7M_IU<2 z=DqGMFOTkiURbj~;D*4#%{*JwC%RNFXo)$7vjvb;WWfgR%)e&Nwp~B0pnzE z*J#F`dn4~3#P?fpMQ^;C*NhO6wK$)P@R**kr^!@9*|rmf%6_?n;ef(-xTM|5QS!mD zy!O*a)5W;jWvkALR0w{YlK0VBgJ5RBYZg-o9ql+i_7RAsJO;YFCvQGf*P@Ssg_*DX z=~JlHWa z5qt189PtAoA!18_QaBVrfZ$%HbN3O3RTyB^w1&w!yJ{qC8ISW^CL<-hUT}{hyFCU@ z=5nG5I}K8H*(Yel9zc9U)X7{j#d26^EfEy$ME81$8ipFvRH@9*C zGZnPVinHn*mm=BMWaqx!R(s12juo71;2q-~0$Zfng3+*{8JZ=k2%5o#H4t9=Xqx2X zIZXNft&W0%S0!oHv3XClxIeOXd%8<>Bq2kU>@Ku`ubJ`w2in{|-xu!oWU~&$A}L#@ z?5gX*u)WMLM44Z%UI4AQBu=V~R_2B#k7VjF}?|x^X6{7rL zx_+QnzDR1L@zw=T@#_tJvpc!l;2nfIR6eN&Ddx^wMG{ayL@P)rjek&*<+cBI|20uN zsc6i*#qcFn$1Nf%^GV1XoYy29anAv_du$bY00kZ1T}I?r|3Nr1m0U8teR$*NQLzy3 z9Cv%EMW1(alIfBw_Hqp_nM`J|vndCR-p$a%7aSZ1G&Y%hgqG~a?pDbx zJT&^~YK6~o-zL$(GKXtt2+bSaZL9lM(my5g)5*R)Jzvd=E`^+UWn5l>eSGcXtE@e@ zhDy%%mhuM5O!~6U5YM!ah3B&A{! zx?+dZY6O27BWZ&Syqe=+EmQ7{8o%)}FZ(t%QgHB9@+bGGt^?uM#}|ek4YUF-olano zicIaZ3H9;o=Nq-(=h6M1c9k@}i_0nO^Jc8vGXVDk9GCGc@W%^oaoHhfrLCxraq5{R zGogpC950;5jD*C!l}e?BK5AS=>d>sy%j86si0UTmZa%Uj;&NOa!s+pFWQvS9_5n_j zYqp&&a9QLP$sXAmVA^y6JISE`5GZ)6xLIvdBk8Qh^%A{nm@ena6)2{xMDPVFHi(>U z65x95thu~Vr82{0-`fxF&kLHH$|NYjy4e>}LdgE~ z-4;R5Mnm2?9DCi@9#m_>yO7}eWW)REuu>T4PCyhxOhAuyaAltW2~E6~v&Ri6lEv?j zojuLI%x!J+AWYjRd@U(!sh^<_-Dq|7W*m0dVx^L0M$Iy7-^(B-Cz0F5vvRw%%Z2Xc z=RvN}hpUA%CjqVZOzpt^7r95{rrZ=7wf7%#w?j`_XG5RhoS|DsvA1y~Nmspb(Ua_yaHyG<>7o1s+A#)l*C|mxXBBhUfFgB3CQ0n}vU7df6Q}@^8 zvz6(r0?)~*ADR&C8#s1ywGDyd5u#MxckSBE!r?=CM+`*@MDE7V9D{tm!aCb(8E{qG zsK?4X4t?WwrghEr`-3ODjsVQ!22qA4ift*O9*`;l1>V6WFX}dMnQ;lDm0N9D0W#4? 
z2heeldyiZRs=7Pxqf3S+x4r=noFV(L+yAz-`75IsQAdG~?u5M(KCec^}k z4XG3P1$hVEbS&T=J<()(kj~3}j*fFg&-MSXrddwMy~lC>{sMAoA%K_P^^A^~g^jAB zQ$qHe#L;&Jlr+`z4AdR>1f@31R8Qrzuj>+|h&wZ^b7LMcMm*xK?72M??<~o6Hol;> zH`)GHt-nfM#-~U9c9uokwIrnh)nGKqksw}L;+~eCLhHQ0tomteS*170=fJMa7-x0J zja#fVmCY{DM-v{pAim!gdpk>%uo+)D$&r5tb@t|BZGyzMvD;}gmci#-wIVT)&(5#b z_ndqgVD#!v;j0$hYrFRq=QFhC=t3Wwu^Ttg{5V2(GVbVagOLUOH;l$UKy3Rm zZL)m~5r-t38tQM;5&-+~+z;!ZD!@FGrRx6f_c7ln0Ox?`{zOf^`U!AUAHWf0nd!;h z0RqhqvSqZ~5ZO-eTsJF!qQ2rC>7iBL(^IxPMY&(8zo=stxl9bGxe#a`YxdKKs^(@` z6qp_MwyD6B0-Ysxb8~>TxWJ-Es>+{@BPZ=$U$$#^_IJvcmCDF}Y)*xL9{j3b7r090 z)ol*hN5Cz8@p(>mbm+es)KfIns&G{yYr?dsGiBnosQ8S^VJ*ksT{lklw{gQB`@b*^ z|L(w>01P!eAOO=0KS;~C>g&a`IFeX`8mU|p$)M!Fy8p~O)V^I^EXw1X`Fz*Y1B~m6 z4dQ!vM-k4k6iNR34l?qXB;3J|8n_tDJ@d&GA zr+e2|uMFyc4!l4`5mI<1?$x{oyU3rfFRQLUtd!Pt<9Zo-X9Hqx1}(_IND<2-i<7!H@t?qQXBQ;B8v z+!{~C%tcAbVKG}GU%^hZLYEEr&^%(%bz4F8 zXq?TbyjSNArrbr{UdHPk_RcyJT+QM9&F7I+Dc|LLW*#O|Sfu$t!?aWE(>ZC+tHH|- zTO%Rf3vgi~wgodHGqeApS}PVd&+i~IC=vT{&~P9xR@(67L(|)LvjZI=4E=-ey^o@i zk~dQX3w+kSLQnwJZ)`KFFd@IaQNsICgzS~JH^B6OQL+XhvzCuJ?B1usVo_w z(V)hPtz8109m$nA5o|8sH~q&b3gwo^$Xpc83CF((7aj1bC_b_K<`my}N>iiCrIKvR zk$K%c;l~?4b01)wvzH`=5aBs_*2oVO1L_5<*VT#67Q~$ocHQS@oKi`Ine%plsZTY9b{dX1_l8F)ec9^COOcu3I!Sg|EuM`btoJ#2re{w70v`Z@t3TF$bZG|%uHsch5Ay4c} zI}c%}%8k;N>D_*k?ZTyO$!8g~7uT;gC3pvD+1qKkwLEKo@kXpRiybvmESnr2+f#89 z@n6DT+n+{H>F0R;AD&8oJ00Zd2ILI3RUUS$>CGsV?u>R&vo|@Y`S6f+SK(y4k8JDX zON}@D=3He2g-UvrmQI&GFbaq%Ja!84M*JIVV)22_Z~B9uM%+W3P?`jn1iIoP96sQy zh=FmdBs~piGV8lo)%R^URXRpg3D51FKyH|UShWykcrGj1opv<~cWd3AthI+FeQ_kN zzddgnYhs+4|GUi%5b6JvaZUXn7%qUEFjkZZP&H{~S_kpAm^g72qC(0u@{LW?R51_B z)G4io@&jADN4#HhB|RD(CQNu+X*3yCUU<@9oJWxv?EiLMu;4O&PTE6xbzxib6}5q^ zjYo^3kiFF7)1J07tIcnY8hdde-H44&C8s60axCqbWMC-FX$B-$v)#En?WG=2BQCcV zZLd%DUGLVshi?-q110Wtui}1xCXj3VEuVXY{3);hyFvV4%sT$gEC!wkpoSw50423n zkkNrelUBG)Z$81d^{L}PrS&ylBj+Lb2|of)MEsh^@++UCg^#AhC5p`4&$xZQx43X? 
zNc42+`LWN%#h5p8lbR+Uppv~5)!ovG{?q%AJLTG#FAVye?r}KsM zA(<;K(3OGF8o#(?zZd`7sMSDT`u+=#?Q@;|UcxWE+q=Q{4W*2=?BcVsk=1%si!7#W zi>Cq$0mY1mHUl)19{26s9?(ekzh^<|585Kuyu`V;k?5~izme1UfsE&DHLdoCHtUww z2igK!u^niBsbBx41z!+Z>QBwWg3~C4!F6(bKP#>LHV}i=r%wR(+2>$qYx*J#JKOcf z<0}Ut8;V6=TaC=pB6m`bF47suo;m2N9)s0>+>LIeXP%oy{s&h4JLV%;pR6^h&!j^( zFk#}t19d_K#ME2_)!2!ojXXhkn)GympMKf!^H`G$?~SJ$(zYB$N>3ucYTF-nft-r`| zn4z8Ci!H}3FH$a$?NL@UldSU^qV=#$o)ZLxi~=18FO@Lu{y`|>-s|%UW3ScacjRj} zXy*b6*%=e4VgpT$*cRlgVJmaVD5I`_* z0u7XcT27!5D|}Lf%Iq@bbtjAUxc1ml`H(Ix&9_Hb?MW$$0v#{rr*+=z;6npXtu75x z&WE%k`KYI6a6^={-Bc?)j{oB|FO>8%V^Nsn#S70%9!dr7e$#w4nQ6qY>}ak6i3L9k$@2=xzR57uFaPTD%awNcHQ62BY7i5ei*XtLJI5$U3(^XG-P7CmQ4ujO%uYE{aUs5 zat3{0tc`7z464Ngt@A!EWeZ)RA`bxfP%UYKBk3b8M$52d9#=jXc@q=!9m1~qVGdW- zSE4WzzVKF$;kak7-h}g~lk&Bn-ZIJF#zQ<@k>zR2$0+Ae?u4!u^vJ3~kJiB{4ln6I~tN=MW?2dX_R500a3bzL(O{gZVGLGJwfB*{YT{@zeCQ#NTAMk20i5^i4)#4%G z-cIsTkL-$PZmQ`|EAY_b@6{7O;1ZLc@G`7wMiAoIA<4@yMsGuxRkt8mQN~35WmSsd z0>2fK3w1tW=TXa*q!@4>tJ{)jyfD(B)yIFu^MuXR!3DO}m$!&iSXE|GLIpR&Bnm+WPjnYQakUu=wI!F}OKzif z;pcr>qz6{6)FuW!EM~+URd2)Y+w-ATFy06Az?9jRa*|w48YGxBYl=DIp-m+m0W*zC zU%lIc@*PZg)l4U1q!lHUjd?ES+};xy@x|-DZ#I;fTmbY|@wBRYC&<1;-6^YRIv<(4 z%{P0x!&1jv8p4!i|Eyat?pWmo1!3ZXUUTl0Y?bVT~iyi zXz@v*^1zCd*TIMb7KTDZ?1KZxle2|*UG~BZP=2&~b#lZE(#@CTJ-+;9UL3>2r?TCX z+h(fex`c(o)vxZM1#+)5CW7bppecQNU&-ez_PCZ4^Ee9=@%Hm(!R>cWrMo* zoOn-6hgXGO$Bs}&L*Zkhr84_N;mNdgFR<*!9ZwEaVn)195lC^*#>e$WEYqGSy{f2Y z@h^%m*(+6r_>vnUc7So#wq%$>dQMBCtB?ypL@UEa0-95RNNb-+?JE1YUE4L1$?M_s z%*o1YLZW&-E2zA!YP5J4-_4gEiHRv zb;O5MkJ0a4(K`IRUWAZ^l`s)2%sIpx|8C_u*VsF_zu(;*M$cvMa@-@0lDUW=n{V+7 z4Ur}XPI*87TF*-4-5gn(yjB&lbXWCb))eW3Lkh7i zq|~o9*2{Sb0(+feIhjme1BN!xFNE!c%TRZc#fr&@>9#1n=)#GB7jx!OqsvvBw#8Y! zy{f?kg9RnRyTfM#zJ?!Q{4h()0`21HVP?RQs`I4CxC$c@>UPcSrhf{Y@)*4zZhTNp z6Ou6j=LwS4I3{iDkuxM5Dlr(cGh`4~m6lKkny=W8O%;nw4K{hPkLPW5FL`lK&zwU| zg&ldnBzCvV%kNBV^Kf+V%irUPZ)JE)y^L8_fw7 zV$WLl-VXIEesS4?6lW1S7k{jCz${)Ws0V&^Y3ebfzsHo}pbe3`U`rxVFEQ{OP0-f1z;5%^Ip89E z;@LmmWdFC{32noOqu*P?=xxR@noMjc;v9fDph-A<4t5Pn2I1L>6|g>zyW=|%V;(=m zY>zafnL;pY8b8msWyI&O=98quiO{)mjjoAl82&5>bP z!CxrV-e_Um=S1=%z2yzSmJOejm`DQ=rBoQz8>D(kJ&|-RZL8TJ#8MDYPHiDf(i=>H zcs{Zo>l|23<*lZlNt~lM)u8FO`?DH4-RalioWEi@jEeN#`QSsfm%c+TW$FSvoR79Q zc=+2HNJt21l34s93iOwbysXCxkCtT+!f>&&kqY1#~0wovzw4f#6fEO41L$u zyRAnvw5eN)ompRz{DH@RrlvEeQvsds9uS-bqF&fDU<|nj7g4)`wkIepgDv&z5qO#m zo$(uui$n`9ScL7Qs)Dzt(sB?>7t85Zd|TCbG^YSN5Mlp!a|*z4{3#hsDFT4|!~j@=yb}aUJAaSdniki#vX0xgf~<=cq|tKbk`*K$11X zogM>P{C9}Qt-+_$8s4x0TM6WrYJwyEZMr`_k?*xrh8LL>M{A*5P-hflu&nLwCx4j= zSI~{B=+9$-XI*;l+;_-`FHH;@3AqwASf*0T|+kb(hx5z+M_*wUyZt=%W6M^6`6mBW_Kb3be^e8_|!xQ6^4-5$-^c zCAVNs5#J&H(U0M{lJj2?@E;!5wL}E{JxAdW;oh|n@N|+Q|Kw)T_QTD>t0ekQZWbL{ zM}VBcCLT%QDZyoShLkiQSNITZfMb^XRmltg0eDV~rEJQ)2)RXnhgg#!=hEB6jJHl9 zlf3`StmxmgAveAMRWogL3K$~LGeM@l?L|9yi{=xC_-w04#J!j4136Yh2<|a_qNGCG z9FL7cuHArDz-aM)lN4`cIyr&}raN#9`m@~|B{gaL-o%a=JZXVuD!4}ezVQ6p&JK_g}@H=|wn-@qe(~{c^$~Abq?TFnDlZw%%C>;SEA67<%aMU}dY#h{%b< zL~I)(F-zv=Q z11_gT=ZE|`xr>%%))S+Ky#|ji#(!geC3#mza;GVC7QGWuPDPZvgyxaG^T=~nMDfJ5 zGW)NK1^U8oV%tA7M?!WPJ8eTBJ^)ITHot2qWBf8nKG5~HVL55NekdgTO1nMQ?iPh$jv^bQ@jB!-zrgi^J!j;(lp38fT z9!Ur^?)5Y1hwz#*zy2To%3tsP`9qQI4@I^=6xsezWcz=w$d+H^W@)o>FX*xRk+h2b zLr*F*En-s>PsuiLy~Ag@GJe64eK(haff}wJ$mJszadd?rI>rCVm$cRJE=y?y6Ozv- zUu};mhp?dhaS#5l9)5$u@q_zvEyp0-M;NoFL$wos*Khkzk+gnI`v3N+=wJNmKMqv~ z{y0>zLjE{Z{gjn4{BfxI5l`0sUmvPe$0hZW^;u)tl#et(e5JaXOsMtbt2^xPBNwt93z8u)Fx0ujYv?{A}ZMFAY;XaYu zzbwPc;P*!~|D#0qM~Uo@64}p10>(c|WWSIa{-HlNfaNvt_k94o*BXG;+r2lC5G^V@ znSoHe^+T#+z^OMjUVa}}=|V_6md3gXBOd{k)#=3^dP4*HCso!h 
z1Ua_O96$qX-yslC5te`v0LOGEaFMVA?`%u}rA+~S5)yw~lCv-S^Lv9R>Zw2p-%cU?f{3_pjr?3UF!HFZVpx0zTn~OSC?s0$&DU1AO3d3Pj zS(WLMUG65e4^q zbG2o*XAND8stjl%CF;R%e)4H)XPd@_e!dGK)&BTkK6er7?bc1UmgSL4q~nXz1f|cLSl?3 z_i4%CHxqRp?uLu@zf^;6k$^XLsv^Z0nm>x6-%lXm+M=IRyWDBOabu_+)XIlgVY42a zO-o%z0XzZ-VeO@WB{t-50`j2rdqYBbzkYkkoVe8x*_3h;wmyQO%%6KtfdoH>ZY{%T z8wb7whyFU$yf^9_{>Cs10UFO4#FpwPl(Gl7`U;lJudiU9yE3lNk(^Rhu+t9$`An$t zVpiea<8o_>T4!dyj&A+1dYn`K+3K;1_RDX|O>H3^-~Drg)KRB04`jUBn${8p(Ql>E z-E)5<#kP$~bCYoChTT#8t{N|I-G@I6ynaUqdH9I_~mT7+mmO1=-9aeK%C8bl* zzg^~ko#}r*l_0-<3Os3s&mkJgYBX^FE}eK@2@{+DOysx@7GBOru3g* z$6uXsjv#ES9V4qvuOa4x0qk?85`FR#It+Sh3+82vcZggiA5^5Kx0V8w^bA`7Qz}`}~7xySnpko8h_t@mOpkN}m^w-AYuU?1q&*&a_O249eA^_dH@t4rO zw{&I`7P2b-y0eac^pX-AbPJzC-tR9!%e&LJ`~%_xw6&igKHP85tFGQ^Xzrhz^ABke zEM8(Llrah2OU~a~*$zsVDgiChL&;tj*hz4YI9Q=7D z{xDr_`(e8J>+1h$CAR-_SJhD`|6?T%h8d{?NRP#LQ#Wh#hd28EmwbvKW>egML7q z@e99R?EfHCiF*@1f&IX1Sv{8tMksmuSrjy_e*nleYx)N#^c*nx{F7%?;}4V12EfLU zNkb~o7T5sElx#Vw;^5E>H2j{rDc zI1&=__FsjGm=%d8bVCd{Krb;qLx0umX8yAJ!<1A?tqEb=MUzX%>AgdLMw-k4++TT% ztr;*2WQl0f96cj!!D11+;}_h?jot|ClKgSPh^1Xm(DSc6){7`~Jw7;c+Xzvjr2rgc z%|W*2joA3y)onxU!U6tH-;CO_i$B4Olt}g`m|GF#&0W!)JGpeH8K3bUwrkha{jvqO zW{*0|?Vl)30);;6{FFF z+ltYnXUguG_0Fly`qft(rXC%8s=79`ThjCOwPM;>H*+mDm|X2hki&1r!sRB1iWdSx zI?nJFp83KSGd`gHcrv2;W@6kUQJ0)-R&+VKCDU;R+lb|$S`JY6XNhDq?^YpxUG%%= zccNU&yM8Sub0MrvOwg41DR>qz8G0)MRxg8cB$p5u-`2wivg`Zm*rpHMq;ab+P2X;0 zv|C`Yc*rZT0*P;Shd z`ttP0%NJno1+OeV&WsN7GDHKyYIQ3D`pwt?;p#Q4+*;4r`(l>!X@0%o`Rd?870#ju zqIb)9;)y!2s{tO(=>qX6?fihkCgDfIiT4U^^#h%J*_`%A_A9;Ij_7+f@5*@gzZFEb z5TF69US@uhU^>YV(zP8@xg$Wvp0F_z`7NnD=1H<|Wxv#({c^L4on8WvnJ@6Cq41ZW zv3*9r1;>VBC%e3$ni6jpsnGbx-t9Y~^`YRwaxBX__1c&7enRaHJG+^nOFcx#R)i>= z#|_Ivb;Jvz_WHKEZH)5bJf5buF@DHwPdK`&85=M9w6&`767$x?HBRgaL*2gY(Gk@y;OJ8{d#z&VU!|?fGB|zK# zM^=rBG!uP!7L>dN@Iwze)!FP(BCT%<6+((USWmM-s& zcjiY5Uz(jWY>;u=EBd2BpN*D+bEkNXh$Hy|teUf+Xr_R<#cEjwc= zR(@jkf(&bS>!wme;bH$wVvi8l3ItZdz|l+$Xh3DjLTD$!*J!QJm^64rBViSqP!6ho zX>Cne-!=N^wW`50jJy4muB0p+Zi|HwSIK@|OXUD~S$PeBmxs7kP;L89y#fyd#6l=L zkl=Gj!^2n_RW$T4BBwk=dmq|cWIp(Dc%}~YNZ)<8C^*k1 zZa7yAxq`AIpTd_UOhD0$0o%Q^bB4*&a!-weq9Up9+GYI})jtz;#9@G@HB3uI&)W|o z2tI1$Zcyj^_rJjWiYiSu#N)z4IN}=WcT~Kt?!A;{n_?Un^>FjujW!>jkx)$y$X31` zI5Py$JLnPv-=LiCx0uQ{(=+?4)ArQVJ%4H~TFc3NJXOnd7wkVa&YI9jM5%uRHbPvb zlDHXmGuUj>syz0><>X)^!@Kf#&XpB6;?p6UMvyJcIdG{y^bU=uIX(sBjG{@^BZd^uV0q?m)z!SHyfJ9y#a|%r z+N);9dz~53ftl%IaMQ(W-p)s#n1|Hc&F$FY-g{Xxgm9t#0NhbW;`D3A-)e+eH=aq^m&w z&HpnjPqKtS$ufBKZT&DRgPV_M9@|3&lAWDO*Q{XzSXG$&-Z~-fZZ7y-`Q)F`5?i^f(nN9S+`2S# zNo^K=bP>1Bs+WsOEvntGYXd*1TnfimpB3z8&g*9G6b9lZO(bU`Q#rZ>blwkSt(w)O zu1%#`r5sPzmv3h1zLIrdM;cYrg=7tiV*Vg%GCDyZ`IEen#{h-NyBjN0*1w}qJXhl_MzsaF@V@Ionw)9mP zmirmj+Y-?iyM{-nW#YbhfdiNBr!Sv;3sY-r11hZ@^^$7t=7wrNV`Td`toHvFF*u^9 zK>992(MBl~Q&VQTSY2;bXjCPrjT{oUH?s!qnGWX(0YGB>c=r!f^nXph`NQOJ9NoYx zDkKfNZKWELa2IfO>#=}@ZRD~h5Ce`27T}Bwc)n_k{e^?@oxkuS`e+fLP`etUf*xWs z0H>VmgLBrK)OcAGZsR`MA;2CSO`ZzSr_o>AvDuX~*brU@0uWcnG*{L;|xM6V*D|*s@FaRz0;+8NGKUD(k?GEWmby?|n9?A>9@yLb^*x zsjuAQ>CGIyv{CE0djW-2E%r-YQoAO!K;C-B_z8VUam2hCR)K45r|Qlai+FhPxC%c? zhd=eqU7LC3bTVl_SF#^>*{(}{H;zDdKp;dcS%#UKh6PS_t&3>dp-<0tlr()P^e`dI zVOow|dm#}Cjm%+qay=Jush8ZI4gInAMx!jqr=Em#OdKYp#b}+%byyVsQfBL>VK%0J z=)?GK({~92mPIYIhBVNRVO_SjV=3-ubb^^mEJk`2v--?nV&Vy zH#+E_ObmT<;{+ISUUJ|Ja&(c@T3ScWCk(c2GrvXdF!Ytu5PMX0WZWe1%Mr6yy~yDP zC)v&vv&PU9yCDA9vg&13ViH|>ak4CJ`M6hp`%JO;c!|om`&;S6LuQ=MKl7bBFV*_? 
zl+#HOi`!3RAO1$0|-|zHYRP zk4R+cgt*ZMag|oh2*vff{m&H!=S$mh;d?Kw?f2IauYD)lII1-Up>#JW9{Q4fGQ3FuOX2 z>A=iFI$>) zAV%+zG6-Wh))3)^8pnXDiEWQ%$JUz6l!~L<*B-PBkG?H!*==&5j&YgkbkVO#U1_yd;gm0}(G{zFAi>+D@}+Fl*7crNWCMh- z8hH=hXozrAceP7o2Dzvj^H|&G@+lu%O?^g$Q8A8?- zqr{Jr%V(hEgR4!Qh?%G`$2{`j+ZqdcL_uZTEL(57bB>C?fyRO4V?8g`S0g{RKxih} z&`=yc&uS#P85fJ#iFTsF7hIQ77R7n==SiZlLr;wZo^(Byv@c!k7Vl>uMJJ%Fr=a2J zJ-!J3h4-oAVuZx|PHQ=W!+Q?!t2>J4v&L#B?}DI?6INS+W+bS)u${q9M>kE1kzJ-t z!mi~fYwsU0cAZPb>$U8*a+OMIaB_0luPV>c-v(j)iVZhM`H@Ws)h!wzl=%|{P7KU- zKYdk?0ZIZV6SWlt`GzO2oV#{425$fGZK5HNRzrru_tVEGtL~@gV?sCsaL37K2n%fa z<$WQ2ZepFEW@OA?KTdiOAKLL$h53tV%Y|-ceZmJ)8trjCGZo!~b1@lZ3Kp@6Zuw%> zRIkuCZ6KMfCoQSZa;HdGr}VC)zP^*pjl=Gcc{Dy}vZAE9Yq$U|NRF7IKP$CCE?P0$ zyz~eVb+>%ErQq3-thdZ^@nh{w6Z`Pe0mg@)P@rvNmktpTB~32Pr&v^k3yt-z{xRP;=m+6Lb>$hWj857C+6bt$gx+( z;ZD(V&BKw0rd-GL?9V+&DDtV@6UyEok|jnSq}YUX;41RN=g|dnBiWF+X+_`I<(d(Ml#|kg$=I6| zr`txIuk=Ok^zfymD~q!-TS-c%L_k25t|F~%bp(o}rtG$oalFY>Zn2(<%EvpDjBxd<)K?6FBu$KoQUMM>i#eKUf_71Z+* z`)jsJgo{`^PF!o?E>WKmg9wl+X^-J*Unj*-JQP`aSDMYyh-oju3urgvX?7dcWX*sN zXLsnmc#$1hc|ZAFyL@c!v250(lPq9Q%$wjIs2JBhOUpySrfY>1a_vv_ne|HCkw4v_ z`HXW%<8c}H4?^kI6reQPtJv+`wu}{wW(f??}|W zhPwm<=5AP-*Rtl5kY2>jzG=^Il~tH$ZZ|rerrDf)KHH3)3>)srz#huIa`#GRZEUhp zvJTvt5Z&4}sz@3krp5Rw4$rh|=r@@})x+wJa=Hsxo*Of{ga?~2S9;HM~f zQHOc46XFdMu9pw+$NRrs<)oSs^&HxcS17aaRlVuxIH9{Et7A=RuQtcxOu zL8Cs-9R^iwndCs72aghH$Su4W6y`|2IX@eCX=kM0p`DnQGpm}Yv6t}&?Lz%wd`^*uj*RPU7UF-Qxo67A6y zizr7gbK$imo0#manrK0Z zf~mIgG%T7fLA_3{o~niJ`M751p2hL98q*QJKa%>2QGVw&>(ha>by3FkQhaDIXru>j zl!!{qvH)R_)QrKmy)Cpwavd*mJz7Q zX>>FW{}j8sj;{daR71Pw1eRr z^U@TN)HLI@M9HCE9k>W~%tg)UPkny-D$|F@6-2oWxn__0wo6Iwc?4QrtO#kXB|v0% z2nheEVu$CNM--c&Q8kAazYc7tEI;uVuv&TLH4`&ra4zZnvjkI(7jws?7|ok-75<>} zUXkif7<}tBLOI`~Wj=)#Ien-CBXHqFD*fa`w)NYIhu@b-8SlNhOE;RMfm7!gNaI2> zh_RE6=?<@bXA)FU>aPZtmOM9D4zZ4_5-VTb>q@^2_Z2@MdSEZ=y@AvfNH8A-LDc{f zF`Cl}Y8S}uloPISc|sZ~=wR<56%h|>RBL?Tnb{{SlLr^qeL!C}4>4SkYziZomqgbi zdev+|v!JI$udfBNR=cB^bmEC0!@yhf#G>8WYYf|5KQ|Hny(XDoV1!@({Na=HTf^3W zC3gG;qpSa|gVrppVy>x#Y>O!0#Gn|&US-Xh_b9L@b3K|kskMlvCWzi-*m9Yg6db4b z!gk}FF~c^Rt*;N03r(Z#PNB0jFF2X=5Z?{EdHZ0l@G@*hQ+YF{>>LoZ`>G!jG}_Nh zhUY~z^Sj0+Bt7SN;2q_;He$gFp;8{UKom@Z5n+x=Wu zDBXp64gT&G{653C>g7g?nFq2Q5vF2}FJ>LIB%sqFmxw2)YS%`nqP}Zx>Y&HbpEObi|0lc^q2EIeKZ3|7AAChbF zGGPeUF7m<6=2ZwvSL1H)RL>z*KFY3!D3hbwh_eAeQFC?4rJD>KLnDkDFJ$5*rP?Hj6+rRXoU~e#hNQ*8_bxc_ zyos6>>D>QuP2`R(%Evn6nN-sKL>5>7ySD()(TOO_Ujk_CnSez+Gmw)3fw8NWFc-bz zdBxUKoSjR?mdQHvN;EoMNjK`F4%j)12_+-}a^aN82m;u`TQy@x z*w~wGq;tnp;t_~qX6;HAHUZ*|+?gNMMrUmzhL1KY>nW~KHQMu6_g?F`b@k43sC50u zC&Xc!P=pTPiEMvFFRha0RrAx1|W#u0k=g^*KTs`Bye!8#1 zgQjbfdZ2>h;pSW$Xu$IFIro0QXz00h3lelJG30He~uH2;W{ zhjgh;Uoyjzm*fEXfDdI->87yBufMArQ-(;8s``!5*TpG!!-=VW~kIiiQ zrlZ*9bwMcgj$<|yP0L!T$mj~iaa_B(UpK!xQYFdjWjNoiCl?jPS`P;U1Bk>r4|t{` zl^@!0WT@t04q>A`fMI9RP(Nxbo=F`V2iAN zt}QSXCqm&MHw!;$dmPpTy;I&MTv#2aRi;Ddt3HuS7@N$5Xwq3#Ahc|>lcgAXN+APF z3GRKiBcIB@!#JQ`n0hRAjYod>w6*Ys#>SicbCU2G6^706yw1umMQVQL`l@xrq9jA1 zUG{=b7r6R4frlGGph51AstH-Px-|pfCuc<52mv7%6or+9PQSZ3^u;bwZFkR_w#Bm@ z=x{&FK_tNv5r40X)4!Zhn#GL{d+%ld3QM82(AH3RGkK1wz?Wb$&0ZBX>MMjgR4Dy$ z=3y-UB<-);b|Zz$~A>IVnzNMAq5E`&ulousu*}aC#)0>SX>VH%0C6$97w{Z0KXO zyVuZSBdlV{=4_>Ji94L17#)rN%r(ufIh)K?d+SL|%Ea^U5RK(4zCg$$btmI&r^}4V zat39$yUhtI`b{Y)%CwS*@;vsDrD&$(`%d4M+p4v|UVW=WF^bVQWzeUy-mFP2C1OZ^ zZ~9Oc;t`I|9Nx*RK8@G-+Nb&{NzLg(=n1(KWGNRHV6lXx3+a-;ZRdXL2Z+rhBwbsF zgM5Y}Nyd;YX%Fct0cL}3#DL`WW*h!o-^)a|W%omK_V5dB@2>mDEQ$UZ?LPC6`p& zzJp<3o87e|rp)uw5EQ5~?m8E3o<=eRvJ6MU#eB{LjqM48J~;2v9at2M_-~@-A+K|!}(4XIeLk)-aRgMMXlx~4y)1M{5UQ`6z&9 z({uhFK=w4!zC$E`0I^V3S`vExxgoegF=N2)od;Fk=*WpJ2)R`f*$008S6ES_1ce 
z%FKkyxqzf%0C@7pMg1{Sf6S>rR^J~X;g8_?|D$aNmCWocE(5A30=k*~X!A z8o}qzue^0N%d>8vdQFdVfyu6rF`Pl@Goucytl}^6@P1zq`kVhTe4v^Gf~g}wlPM6; zm2h^0VJnDo1l`U~=;~buaPw?QPZ?}Wpl=I^Rh_1v0NP1Lbx0bZx;@3;O1U-H5c#v8 zoG|(&nwq)VilH#L!L}8Isf$2sDJjw8I>|(=xz7I;HVQN8iAM5NjM`&MdABhb*FGGlb8vPUX;wur+pXddk$4E zqz&A@G_%vTHCyR3&+FY?`@;5y-tu2E0sgUZBpovGgbk4q>DuL?l6<<# zC__R($B;eVDUSVJ0Ju|V7P1rq^QnV8LrD&CBHr#i0?UfLt z$o;YnM;-AX$^jIVTu2zjTMRa=jsU*c+X-lzLc17uS?{r5#?vGNAl%Mj( zCWR9?uP`=kpob<&Z)h2n31)*NL-OQfLwu)^&kYp>??IPv-8R_kB({r?E9P7mAx@g7 zAVJJaTZB`Dpsvmdon5{rN02oVH&en1{$ zmpqP`9}R8BCW!B-B-E!pw;5Yl%(GP-@s+!fXPL|uXVLZ`yP!#b6R@n8&>qO4?-05G zjUGW1X@D!DnNY@rRHl?-J&`M=g=q$P2iIQ=%D-RCtR3UKI0gxX#h_if$OkrC?HTC8 zR7>(Q-d^Tyb++xopoR)Nsdmw)=>_+Mc0>FZwbmjIj^p46R85v6px+|E+vg|uXz}sp zp6Qjm=!|9Zc$gVysB#;^R?ZzD$OyRzw5Qr|fFW}aXs1p#VK@U2Z6oy&&m7!qX538; zbGuYZ<@H!{n}jaMM|F5M>~s|E1~f)fviDR;91dE!8dZO|I`Afz)A~(bTUwaZm9^xY zy`l3{_J%u|Os@mGW|JNCP9O%04jq6JCh>!9upJBHyA$0w9&9gh3oSW1OHHxYWhkBX zldHLs3~7JIgaM=@pfuaGIs!(%!Xzd{_5!(Pn$30*BVg~)F6HkpoX}F-&qgvBkjOew z^zs15f{xxkosm$+6%7d4n9dr99}hZNO7;Y0HM7CaH)OXL&4ZyFNm@O|JL7|7yI4$$ z8?vDK+9^G*uf^nT^4bAl#Ff1}FutS@qDq4mw%>g>hGs6)| z-#wsUvr1uZVeRmg+6y_qWhTc7VLEjrl{F`&DndsL-Don@G<^jpyqA`Xc3+LDa!@6v z-p^67W!F$rq3mH|ZMx}=xm(M7DnSSRDAn;U!zcPMtQ?VubM>^EirftU7R>Y>vCE_L zxjW?LlgYX&!rX>^b!%wbf6ndozNgBgIr zE)kfQaQTSDrH)cPT_+O=SG?KC^Bj*sTS|_+ZZHQ>XqJ@{zvkJxYw0b_M zobh(r%gHi*y|ZfItYCJ#L!cBS5K85%c|U1JPz#qD=!+WO3gYX;_K~R%=x)Ez8?A<77Ez~jRK`L zwM42I#S?Xk_M}qV_IO$oT%j9RE?>Mi%MmS#n-_hbz!|T&E%m4@LW-dZB8<^7s?Z4^y2XNweclbDK1F&M@0jvNjpJcsVK9KGgouBtZxflGwv~f9hMy_w0Lry4K78d)GSa-o5wP z`#azFoyG2p(OPMbeEQZrN>bmQG4?djhSsrg{^})L=p5_KI}Z3X%HmVo@`U|dCzOM5 z=e(ppZ;ZKNz#x;Y9@MUS(}WTrAfb-WW^*=LQ{RLxc>&izbic%9Nk_6@Fip;AaJR0z zCwNK9Z@8bl#m%iblbvsqanx5d;VlUx9v@dHaEixkUbffOkV(brvTfqzVBz!ZTe0O4 zt80hsvU5*g-IE?(_k9s%7I!MFZ((yN;`VXURltH{S}DZf5cu#=(GwkP%@2PKB zFk)hH(PypjhXn7K$hE!UY37abE=|3alZ83j?zRZ+i)JfTSFPr(&u%9=QAKv*Rb2Am z495IMG6^($VtQVGhH=p#^Zm2Cp;|==Q|s z8Lc;hEZX0xhMBG`h^qidpA#`<#G7ygBQ|{;SejS+x)rBTQD9mH9aX=0wC2aSoiJ(D zVddY3@`j0so-W9r&8W;qj+{)YX!3d@ZP=$c;yVdB{hI6%ZGU^saLl3w*WI|O?7P=0 zqXuJ2Z2s}#u`B07AW7Nu8~lA*u}LbMa$G%b9VZ9xONcpY;}luCi4wlzVdbVm+M|2b zwXlSF-R!=^(X0Kilg6gn2FIQsfOnKHR5Vv2sij`YBCYs|T%N-Fs0mU>FpuKs7_8XT zmpE;`8#mSliLckYjp<3UHKE^DJX2Z9&w5fe#zc)bti7}MS<*FTWi#FF6SyJDd-)5d!l7~mg)P5rLBaX zJ|PfeD-{qVnE=|`u~8HTn9{en_A)r&`oCWX0Ki1_n}@)@#{WUQL>thkXN>_$I7JxD zlbxZG#>KYP07>)!NMaAzEH+EwxU~#HbR@tVe+GD?e0B0~CyzUMR>&)gyc)~*iod+i zCZiPqakLN^zROlfzJ7`_PqD`dDc!W|=T66O_>~Z6eE&obU^XA1yP}OMT(yp8H=4ls zM?PwiK268^L=&P4dA@Ak|Ik`UkP?#+(#rnn zx#*$7{neiJv?+$*UXwb3?JAg&tv(EtWb^160pXnpeG+9Q_2O<^TyYC99XuuJGEL1^ z@}kL@9%yFH<9K^TR}r$*j+tGuQXxVL{RX)38OwHkfUG( zN0rcpk`IvmDQ9Iwz!KoB`vB1gSQ}IDt1QbNi0wl)#cXLAI8866E%gw}8+LzyIDbPB ztR!3@v~tBjI^v?!#t>VdlSIjSTYze2KstqfHf8`!l*|}n1xDvvG^bd*JCBPPs^%f7 z*Z6+@qWx2W{`m><`nmQ)G}AUUH5EfX9FI)$)1dVh*(Xl_Hn(?P`E0-zS6o{_>^4ia z^}!40^Dj5L+a*ehSZwWCR_*8iE*mGzMA54DRJ~^no4Kd9>g{#!n+~gPy!*?tJpPuc z1mWAs-(I5T&!M*9l<+WthXf^n@k3p*_(3-f7>zM`F!wZmMiYO7Vr{m)4e|xiap*S$@fJl9jLx~!Ull*2$&?ZV2Oat`ni>RkVyUIhZ3=h7I zjDBlrtPjDuP7;-2C6AC5?vS~xkCyGUbg{)}rCV1O?2fw{VeAeZTfJLftEYj0PpyDd-gWD9- z`=4zy{mMHb`t%RnGD%wO2gqw4xX8fH7Nu`1IdLK(aw7BcXqWBzoC0M9Qzy`T^Y*)A z1BonUvA0Ham9#w5tP z0alMi@Z>9|2{9%8Yr23;s<|T7Ni;uh;7_t1ayCO?AK1Uzqf@@&hVu1(htIM*`?qHg z|Kric-|2Oz;&gr~B|M=FH%tM@CuTL>o(^jfOCvk%I@jGg>72_{T9>*3mFPLHdg)UhG_jqp_6DU?nfBlA$* zMemw`z^Il^Kf{~KmmzPX{*@Q|$wK_+QTOu;!+(5I?MR4{L_B7JKN0clD<8@DmyRm0 z=IzRMOg@Ju6)DLUn1K=1$^~)L&Ih5fUrqv=ZlHTt+>0Bjn)w@YqNzlZcj< zFek$JCX&00Z8Ju9N>ira%Sp$~E-)=;^=LV14&1DLKFQx66sD4sE2rWPg@vx0+55g* 
zNzv?>Z~lPx$*^@n(X|CniAa;RI>>9zb{3-}Y932%SOa#_rFpXILmdixlV`I2F>ef3 zg=JW3{*6P|wqF# Result<()> { Arg::new("init") .long("init") .takes_value(false) - .help("Initialize pageserver repo"), + .help("Initialize pageserver service: creates an initial config, tenant and timeline, if specified"), ) .arg( Arg::new("workdir") @@ -53,6 +59,13 @@ fn main() -> Result<()> { .help("Create tenant during init") .requires("init"), ) + .arg( + Arg::new("initial-timeline-id") + .long("initial-timeline-id") + .takes_value(true) + .help("Use a specific timeline id during init and tenant creation") + .requires("create-tenant"), + ) // See `settings.md` for more details on the extra configuration patameters pageserver can process .arg( Arg::new("config-override") @@ -61,7 +74,7 @@ fn main() -> Result<()> { .number_of_values(1) .multiple_occurrences(true) .help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). - Any option has to be a valid toml document, example: `-c \"foo='hey'\"` `-c \"foo={value=1}\"`"), + Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"), ) .get_matches(); @@ -72,7 +85,16 @@ fn main() -> Result<()> { let cfg_file_path = workdir.join("pageserver.toml"); let init = arg_matches.is_present("init"); - let create_tenant = arg_matches.value_of("create-tenant"); + let create_tenant = arg_matches + .value_of("create-tenant") + .map(ZTenantId::from_str) + .transpose() + .context("Failed to parse tenant id from the arguments")?; + let initial_timeline_id = arg_matches + .value_of("initial-timeline-id") + .map(ZTimelineId::from_str) + .transpose() + .context("Failed to parse timeline id from the arguments")?; // Set CWD to workdir for non-daemon modes env::set_current_dir(&workdir).with_context(|| { @@ -115,7 +137,14 @@ fn main() -> Result<()> { option_line ) })?; + for (key, item) in doc.iter() { + if key == "id" { + anyhow::ensure!( + init, + "node id can only be set during pageserver init and cannot be overridden" + ); + } toml.insert(key, item.clone()); } } @@ -136,7 +165,8 @@ fn main() -> Result<()> { // Create repo and exit if init was requested if init { - branches::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?; + timelines::init_pageserver(conf, create_tenant, initial_timeline_id) + .context("Failed to init pageserver")?; // write the config file std::fs::write(&cfg_file_path, toml.to_string()).with_context(|| { format!( diff --git a/pageserver/src/branches.rs b/pageserver/src/branches.rs deleted file mode 100644 index 8a411060de..0000000000 --- a/pageserver/src/branches.rs +++ /dev/null @@ -1,428 +0,0 @@ -//! -//! Branch management code -//! 
diff --git a/pageserver/src/branches.rs b/pageserver/src/branches.rs
deleted file mode 100644
index 8a411060de..0000000000
--- a/pageserver/src/branches.rs
+++ /dev/null
@@ -1,428 +0,0 @@
-//!
-//! Branch management code
-//!
-// TODO: move all paths construction to conf impl
-//
-
-use anyhow::{bail, Context, Result};
-use postgres_ffi::ControlFileData;
-use serde::{Deserialize, Serialize};
-use std::{
-    fs,
-    path::Path,
-    process::{Command, Stdio},
-    str::FromStr,
-    sync::Arc,
-};
-use tracing::*;
-
-use zenith_utils::crashsafe_dir;
-use zenith_utils::logging;
-use zenith_utils::lsn::Lsn;
-use zenith_utils::zid::{ZTenantId, ZTimelineId};
-
-use crate::walredo::WalRedoManager;
-use crate::CheckpointConfig;
-use crate::{config::PageServerConf, repository::Repository};
-use crate::{import_datadir, LOG_FILE_NAME};
-use crate::{repository::RepositoryTimeline, tenant_mgr};
-
-#[derive(Serialize, Deserialize, Clone)]
-pub struct BranchInfo {
-    pub name: String,
-    #[serde(with = "hex")]
-    pub timeline_id: ZTimelineId,
-    pub latest_valid_lsn: Lsn,
-    pub ancestor_id: Option<String>,
-    pub ancestor_lsn: Option<String>,
-    pub current_logical_size: usize,
-    pub current_logical_size_non_incremental: Option<usize>,
-}
-
-impl BranchInfo {
-    pub fn from_path<T: AsRef<Path>>(
-        path: T,
-        repo: &Arc<dyn Repository>,
-        include_non_incremental_logical_size: bool,
-    ) -> Result<Self> {
-        let path = path.as_ref();
-        let name = path.file_name().unwrap().to_string_lossy().to_string();
-        let timeline_id = std::fs::read_to_string(path)
-            .with_context(|| {
-                format!(
-                    "Failed to read branch file contents at path '{}'",
-                    path.display()
-                )
-            })?
-            .parse::<ZTimelineId>()?;
-
-        let timeline = match repo.get_timeline(timeline_id)? {
-            RepositoryTimeline::Local(local_entry) => local_entry,
-            RepositoryTimeline::Remote { .. } => {
-                bail!("Timeline {} is remote, no branches to display", timeline_id)
-            }
-        };
-
-        // we use ancestor lsn zero if we don't have an ancestor, so turn this into an option based on timeline id
-        let (ancestor_id, ancestor_lsn) = match timeline.get_ancestor_timeline_id() {
-            Some(ancestor_id) => (
-                Some(ancestor_id.to_string()),
-                Some(timeline.get_ancestor_lsn().to_string()),
-            ),
-            None => (None, None),
-        };
-
-        // non incremental size calculation can be heavy, so let it be optional
-        // needed for tests to check size calculation
-        let current_logical_size_non_incremental = include_non_incremental_logical_size
-            .then(|| {
-                timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())
-            })
-            .transpose()?;
-
-        Ok(BranchInfo {
-            name,
-            timeline_id,
-            latest_valid_lsn: timeline.get_last_record_lsn(),
-            ancestor_id,
-            ancestor_lsn,
-            current_logical_size: timeline.get_current_logical_size(),
-            current_logical_size_non_incremental,
-        })
-    }
-}
-
-#[derive(Debug, Clone, Copy)]
-pub struct PointInTime {
-    pub timelineid: ZTimelineId,
-    pub lsn: Lsn,
-}
-
-pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str>) -> Result<()> {
-    // Initialize logger
-    // use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages
-    let _log_file = logging::init(LOG_FILE_NAME, true)?;
-
-    // We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
-    // process during repository initialization.
-    //
-    // FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched
-    // initdb in the background, and it kept running even after the "zenith init" had exited.
-    // In tests, we started the page server immediately after that, so that initdb was still
-    // running in the background, and we failed to run initdb again in the same directory. This
-    // has been solved for the rapid init+start case now, but the general race condition remains
-    // if you restart the server quickly. The WAL redo manager doesn't use a separate thread
-    // anymore, but I think that could still happen.
-    let dummy_redo_mgr = Arc::new(crate::walredo::DummyRedoManager {});
-
-    if let Some(tenantid) = create_tenant {
-        let tenantid = ZTenantId::from_str(tenantid)?;
-        println!("initializing tenantid {}", tenantid);
-        create_repo(conf, tenantid, dummy_redo_mgr).context("failed to create repo")?;
-    }
-    crashsafe_dir::create_dir_all(conf.tenants_path())?;
-
-    println!("pageserver init succeeded");
-    Ok(())
-}
-
-pub fn create_repo(
-    conf: &'static PageServerConf,
-    tenantid: ZTenantId,
-    wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
-) -> Result<Arc<LayeredRepository>> {
-    let repo_dir = conf.tenant_path(&tenantid);
-    if repo_dir.exists() {
-        bail!("repo for {} already exists", tenantid)
-    }
-
-    // top-level dir may exist if we are creating it through CLI
-    crashsafe_dir::create_dir_all(&repo_dir)
-        .with_context(|| format!("could not create directory {}", repo_dir.display()))?;
-
-    crashsafe_dir::create_dir(conf.timelines_path(&tenantid))?;
-    crashsafe_dir::create_dir_all(conf.branches_path(&tenantid))?;
-    crashsafe_dir::create_dir_all(conf.tags_path(&tenantid))?;
-
-    info!("created directory structure in {}", repo_dir.display());
-
-    // create a new timeline directory
-    let timeline_id = ZTimelineId::generate();
-    let timelinedir = conf.timeline_path(&timeline_id, &tenantid);
-
-    crashsafe_dir::create_dir(&timelinedir)?;
-
-    let repo = Arc::new(crate::layered_repository::LayeredRepository::new(
-        conf,
-        wal_redo_manager,
-        tenantid,
-        conf.remote_storage_config.is_some(),
-    ));
-
-    // Load data into pageserver
-    // TODO To implement zenith import we need to
-    // move data loading out of create_repo()
-    bootstrap_timeline(conf, tenantid, timeline_id, repo.as_ref())?;
-
-    Ok(repo)
-}
-
-// Returns checkpoint LSN from controlfile
-fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
-    // Read control file to extract the LSN
-    let controlfile_path = path.join("global").join("pg_control");
-    let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?;
-    let lsn = controlfile.checkPoint;
-
-    Ok(Lsn(lsn))
-}
-
-// Create the cluster temporarily in 'initdbpath' directory inside the repository
-// to get bootstrap data for timeline initialization.
-//
-fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
-    info!("running initdb in {}... ", initdbpath.display());
-
-    let initdb_path = conf.pg_bin_dir().join("initdb");
-    let initdb_output = Command::new(initdb_path)
-        .args(&["-D", initdbpath.to_str().unwrap()])
-        .args(&["-U", &conf.superuser])
-        .args(&["-E", "utf8"])
-        .arg("--no-instructions")
-        // This is only used for a temporary installation that is deleted shortly after,
-        // so no need to fsync it
-        .arg("--no-sync")
-        .env_clear()
-        .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
-        .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
-        .stdout(Stdio::null())
-        .output()
-        .context("failed to execute initdb")?;
-    if !initdb_output.status.success() {
-        anyhow::bail!(
-            "initdb failed: '{}'",
-            String::from_utf8_lossy(&initdb_output.stderr)
-        );
-    }
-
-    Ok(())
-}
-
-//
-// - run initdb to init temporary instance and get bootstrap data
-// - after initialization complete, remove the temp dir.
-//
-fn bootstrap_timeline(
-    conf: &'static PageServerConf,
-    tenantid: ZTenantId,
-    tli: ZTimelineId,
-    repo: &dyn Repository,
-) -> Result<()> {
-    let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
-
-    let initdb_path = conf.tenant_path(&tenantid).join("tmp");
-
-    // Init temporary repo to get bootstrap data
-    run_initdb(conf, &initdb_path)?;
-    let pgdata_path = initdb_path;
-
-    let lsn = get_lsn_from_controlfile(&pgdata_path)?.align();
-
-    // Import the contents of the data directory at the initial checkpoint
-    // LSN, and any WAL after that.
-    // Initdb lsn will be equal to last_record_lsn which will be set after import.
-    // Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline.
-    let timeline = repo.create_empty_timeline(tli, lsn)?;
-    import_datadir::import_timeline_from_postgres_datadir(
-        &pgdata_path,
-        timeline.writer().as_ref(),
-        lsn,
-    )?;
-    timeline.checkpoint(CheckpointConfig::Forced)?;
-
-    println!(
-        "created initial timeline {} timeline.lsn {}",
-        tli,
-        timeline.get_last_record_lsn()
-    );
-
-    let data = tli.to_string();
-    fs::write(conf.branch_path("main", &tenantid), data)?;
-    println!("created main branch");
-
-    // Remove temp dir. We don't need it anymore
-    fs::remove_dir_all(pgdata_path)?;
-
-    Ok(())
-}
-
-pub(crate) fn get_branches(
-    conf: &PageServerConf,
-    tenantid: &ZTenantId,
-    include_non_incremental_logical_size: bool,
-) -> Result<Vec<BranchInfo>> {
-    let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?;
-
-    // Each branch has a corresponding record (text file) in the refs/branches
-    // with timeline_id.
-    let branches_dir = conf.branches_path(tenantid);
-
-    std::fs::read_dir(&branches_dir)
-        .with_context(|| {
-            format!(
-                "Found no branches directory '{}' for tenant {}",
-                branches_dir.display(),
-                tenantid
-            )
-        })?
-        .map(|dir_entry_res| {
-            let dir_entry = dir_entry_res.with_context(|| {
-                format!(
-                    "Failed to list branches directory '{}' content for tenant {}",
-                    branches_dir.display(),
-                    tenantid
-                )
-            })?;
-            BranchInfo::from_path(
-                dir_entry.path(),
-                &repo,
-                include_non_incremental_logical_size,
-            )
-        })
-        .collect()
-}
-
-pub(crate) fn create_branch(
-    conf: &PageServerConf,
-    branchname: &str,
-    startpoint_str: &str,
-    tenantid: &ZTenantId,
-) -> Result<BranchInfo> {
-    let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?;
-
-    if conf.branch_path(branchname, tenantid).exists() {
-        anyhow::bail!("branch {} already exists", branchname);
-    }
-
-    let mut startpoint = parse_point_in_time(conf, startpoint_str, tenantid)?;
-    let timeline = repo
-        .get_timeline(startpoint.timelineid)?
-        .local_timeline()
-        .context("Cannot branch off the timeline that's not present locally")?;
-    if startpoint.lsn == Lsn(0) {
-        // Find end of WAL on the old timeline
-        let end_of_wal = timeline.get_last_record_lsn();
-        info!("branching at end of WAL: {}", end_of_wal);
-        startpoint.lsn = end_of_wal;
-    } else {
-        // Wait for the WAL to arrive and be processed on the parent branch up
-        // to the requested branch point. The repository code itself doesn't
-        // require it, but if we start to receive WAL on the new timeline,
-        // decoding the new WAL might need to look up previous pages, relation
-        // sizes etc. and that would get confused if the previous page versions
-        // are not in the repository yet.
-        timeline.wait_lsn(startpoint.lsn)?;
-    }
-    startpoint.lsn = startpoint.lsn.align();
-    if timeline.get_ancestor_lsn() > startpoint.lsn {
-        // can we safely just branch from the ancestor instead?
-        anyhow::bail!(
-            "invalid startpoint {} for the branch {}: less than timeline ancestor lsn {:?}",
-            startpoint.lsn,
-            branchname,
-            timeline.get_ancestor_lsn()
-        );
-    }
-
-    let new_timeline_id = ZTimelineId::generate();
-
-    // Forward entire timeline creation routine to repository
-    // backend, so it can do all needed initialization
-    repo.branch_timeline(startpoint.timelineid, new_timeline_id, startpoint.lsn)?;
-
-    // Remember the human-readable branch name for the new timeline.
-    // FIXME: there's a race condition, if you create a branch with the same
-    // name concurrently.
-    let data = new_timeline_id.to_string();
-    fs::write(conf.branch_path(branchname, tenantid), data)?;
-
-    Ok(BranchInfo {
-        name: branchname.to_string(),
-        timeline_id: new_timeline_id,
-        latest_valid_lsn: startpoint.lsn,
-        ancestor_id: Some(startpoint.timelineid.to_string()),
-        ancestor_lsn: Some(startpoint.lsn.to_string()),
-        current_logical_size: 0,
-        current_logical_size_non_incremental: Some(0),
-    })
-}
-
-//
-// Parse user-given string that represents a point-in-time.
-//
-// We support multiple variants:
-//
-// Raw timeline id in hex, meaning the end of that timeline:
-//    bc62e7d612d0e6fe8f99a6dd2f281f9d
-//
-// A specific LSN on a timeline:
-//    bc62e7d612d0e6fe8f99a6dd2f281f9d@2/15D3DD8
-//
-// Same, with a human-friendly branch name:
-//    main
-//    main@2/15D3DD8
-//
-// Human-friendly tag name:
-//    mytag
-//
-//
-fn parse_point_in_time(
-    conf: &PageServerConf,
-    s: &str,
-    tenantid: &ZTenantId,
-) -> Result<PointInTime> {
-    let mut strings = s.split('@');
-    let name = strings.next().unwrap();
-
-    let lsn = strings
-        .next()
-        .map(Lsn::from_str)
-        .transpose()
-        .context("invalid LSN in point-in-time specification")?;
-
-    // Check if it's a tag
-    if lsn.is_none() {
-        let tagpath = conf.tag_path(name, tenantid);
-        if tagpath.exists() {
-            let pointstr = fs::read_to_string(tagpath)?;
-
-            return parse_point_in_time(conf, &pointstr, tenantid);
-        }
-    }
-
-    // Check if it's a branch
-    // Check if it's branch @ LSN
-    let branchpath = conf.branch_path(name, tenantid);
-    if branchpath.exists() {
-        let pointstr = fs::read_to_string(branchpath)?;
-
-        let mut result = parse_point_in_time(conf, &pointstr, tenantid)?;
-
-        result.lsn = lsn.unwrap_or(Lsn(0));
-        return Ok(result);
-    }
-
-    // Check if it's a timelineid
-    // Check if it's timelineid @ LSN
-    if let Ok(timelineid) = ZTimelineId::from_str(name) {
-        let tlipath = conf.timeline_path(&timelineid, tenantid);
-        if tlipath.exists() {
-            return Ok(PointInTime {
-                timelineid,
-                lsn: lsn.unwrap_or(Lsn(0)),
-            });
-        }
-    }
-
-    bail!("could not parse point-in-time {}", s);
-}
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 8b65e7e2e6..dc85c83c17 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -8,7 +8,7 @@ use anyhow::{bail, ensure, Context, Result};
 use toml_edit;
 use toml_edit::{Document, Item};
 use zenith_utils::postgres_backend::AuthType;
-use zenith_utils::zid::{ZTenantId, ZTimelineId};
+use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId};
 
 use std::convert::TryInto;
 use std::env;
@@ -78,6 +78,10 @@ pub mod defaults {
 
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct PageServerConf {
+    // Identifier of this particular pageserver, so that e.g. safekeepers
+    // can safely distinguish different pageservers
+    pub id: ZNodeId,
+
     /// Example (default): 127.0.0.1:64000
     pub listen_pg_addr: String,
     /// Example (default): 127.0.0.1:9898
@@ -118,6 +122,206 @@ pub struct PageServerConf {
     pub remote_storage_config: Option<RemoteStorageConfig>,
 }
 
+// Use a dedicated enum for the builder to better indicate the intention
+// and avoid possible confusion with nested options
+pub enum BuilderValue<T> {
+    Set(T),
+    NotSet,
+}
+
+impl<T> BuilderValue<T> {
+    pub fn ok_or<E>(self, err: E) -> Result<T, E> {
+        match self {
+            Self::Set(v) => Ok(v),
+            Self::NotSet => Err(err),
+        }
+    }
+}
+
+// needed to simplify config construction
+struct PageServerConfigBuilder {
+    listen_pg_addr: BuilderValue<String>,
+
+    listen_http_addr: BuilderValue<String>,
+
+    checkpoint_distance: BuilderValue<u64>,
+    checkpoint_period: BuilderValue<Duration>,
+
+    gc_horizon: BuilderValue<u64>,
+    gc_period: BuilderValue<Duration>,
+
+    wait_lsn_timeout: BuilderValue<Duration>,
+    wal_redo_timeout: BuilderValue<Duration>,
+
+    superuser: BuilderValue<String>,
+
+    page_cache_size: BuilderValue<usize>,
+    max_file_descriptors: BuilderValue<usize>,
+
+    workdir: BuilderValue<PathBuf>,
+
+    pg_distrib_dir: BuilderValue<PathBuf>,
+
+    auth_type: BuilderValue<AuthType>,
+
+    //
+    auth_validation_public_key_path: BuilderValue<Option<PathBuf>>,
+    remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,
+
+    id: BuilderValue<ZNodeId>,
+}
+
+impl Default for PageServerConfigBuilder {
+    fn default() -> Self {
+        use self::BuilderValue::*;
+        use defaults::*;
+        Self {
+            listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()),
+            listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()),
+            checkpoint_distance: Set(DEFAULT_CHECKPOINT_DISTANCE),
+            checkpoint_period: Set(humantime::parse_duration(DEFAULT_CHECKPOINT_PERIOD)
+                .expect("cannot parse default checkpoint period")),
+            gc_horizon: Set(DEFAULT_GC_HORIZON),
+            gc_period: Set(humantime::parse_duration(DEFAULT_GC_PERIOD)
+                .expect("cannot parse default gc period")),
+            wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
+                .expect("cannot parse default wait lsn timeout")),
+            wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
+                .expect("cannot parse default wal redo timeout")),
+            superuser: Set(DEFAULT_SUPERUSER.to_string()),
+            page_cache_size: Set(DEFAULT_PAGE_CACHE_SIZE),
+            max_file_descriptors: Set(DEFAULT_MAX_FILE_DESCRIPTORS),
+            workdir: Set(PathBuf::new()),
+            pg_distrib_dir: Set(env::current_dir()
+                .expect("cannot access current directory")
+                .join("tmp_install")),
+            auth_type: Set(AuthType::Trust),
+            auth_validation_public_key_path: Set(None),
+            remote_storage_config: Set(None),
+            id: NotSet,
+        }
+    }
+}
+
+impl PageServerConfigBuilder {
+    pub fn listen_pg_addr(&mut self, listen_pg_addr: String) {
+        self.listen_pg_addr = BuilderValue::Set(listen_pg_addr)
+    }
+
+    pub fn listen_http_addr(&mut self, listen_http_addr: String) {
+        self.listen_http_addr = BuilderValue::Set(listen_http_addr)
+    }
+
+    pub fn checkpoint_distance(&mut self, checkpoint_distance: u64) {
+        self.checkpoint_distance = BuilderValue::Set(checkpoint_distance)
+    }
+
+    pub fn checkpoint_period(&mut self, checkpoint_period: Duration) {
+        self.checkpoint_period = BuilderValue::Set(checkpoint_period)
+    }
+
+    pub fn gc_horizon(&mut self, gc_horizon: u64) {
+        self.gc_horizon = BuilderValue::Set(gc_horizon)
+    }
+
+    pub fn gc_period(&mut self, gc_period: Duration) {
+        self.gc_period = BuilderValue::Set(gc_period)
+    }
+
+    pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) {
+        self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout)
+    }
+
+    pub fn wal_redo_timeout(&mut self, wal_redo_timeout: Duration) {
+        self.wal_redo_timeout = BuilderValue::Set(wal_redo_timeout)
+    }
+
+    pub fn superuser(&mut self, superuser: String) {
+        self.superuser = BuilderValue::Set(superuser)
+    }
+
+    pub fn page_cache_size(&mut self, page_cache_size: usize) {
+        self.page_cache_size = BuilderValue::Set(page_cache_size)
+    }
+
+    pub fn max_file_descriptors(&mut self, max_file_descriptors: usize) {
+        self.max_file_descriptors = BuilderValue::Set(max_file_descriptors)
+    }
+
+    pub fn workdir(&mut self, workdir: PathBuf) {
+        self.workdir = BuilderValue::Set(workdir)
+    }
+
+    pub fn pg_distrib_dir(&mut self, pg_distrib_dir: PathBuf) {
+        self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir)
+    }
+
+    pub fn auth_type(&mut self, auth_type: AuthType) {
+        self.auth_type = BuilderValue::Set(auth_type)
+    }
+
+    pub fn auth_validation_public_key_path(
+        &mut self,
+        auth_validation_public_key_path: Option<PathBuf>,
+    ) {
+        self.auth_validation_public_key_path = BuilderValue::Set(auth_validation_public_key_path)
+    }
+
+    pub fn remote_storage_config(&mut self, remote_storage_config: Option<RemoteStorageConfig>) {
+        self.remote_storage_config = BuilderValue::Set(remote_storage_config)
+    }
+
+    pub fn id(&mut self, node_id: ZNodeId) {
+        self.id = BuilderValue::Set(node_id)
+    }
+
+    pub fn build(self) -> Result<PageServerConf> {
+        Ok(PageServerConf {
+            listen_pg_addr: self
+                .listen_pg_addr
+                .ok_or(anyhow::anyhow!("missing listen_pg_addr"))?,
+            listen_http_addr: self
+                .listen_http_addr
+                .ok_or(anyhow::anyhow!("missing listen_http_addr"))?,
+            checkpoint_distance: self
+                .checkpoint_distance
+                .ok_or(anyhow::anyhow!("missing checkpoint_distance"))?,
+            checkpoint_period: self
+                .checkpoint_period
+                .ok_or(anyhow::anyhow!("missing checkpoint_period"))?,
+            gc_horizon: self
+                .gc_horizon
+                .ok_or(anyhow::anyhow!("missing gc_horizon"))?,
+            gc_period: self.gc_period.ok_or(anyhow::anyhow!("missing gc_period"))?,
+            wait_lsn_timeout: self
+                .wait_lsn_timeout
+                .ok_or(anyhow::anyhow!("missing wait_lsn_timeout"))?,
+            wal_redo_timeout: self
+                .wal_redo_timeout
+                .ok_or(anyhow::anyhow!("missing wal_redo_timeout"))?,
+            superuser: self.superuser.ok_or(anyhow::anyhow!("missing superuser"))?,
+            page_cache_size: self
+                .page_cache_size
+                .ok_or(anyhow::anyhow!("missing page_cache_size"))?,
+            max_file_descriptors: self
+                .max_file_descriptors
+                .ok_or(anyhow::anyhow!("missing max_file_descriptors"))?,
+            workdir: self.workdir.ok_or(anyhow::anyhow!("missing workdir"))?,
+            pg_distrib_dir: self
+                .pg_distrib_dir
+                .ok_or(anyhow::anyhow!("missing pg_distrib_dir"))?,
+            auth_type: self.auth_type.ok_or(anyhow::anyhow!("missing auth_type"))?,
+            auth_validation_public_key_path: self
+                .auth_validation_public_key_path
+                .ok_or(anyhow::anyhow!("missing auth_validation_public_key_path"))?,
+            remote_storage_config: self
+                .remote_storage_config
+                .ok_or(anyhow::anyhow!("missing remote_storage_config"))?,
+            id: self.id.ok_or(anyhow::anyhow!("missing id"))?,
+        })
+    }
+}
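 
 // A minimal usage sketch for the builder above (the values are illustrative,
 // not defaults from this patch): every field except `id` starts out `Set` with
 // a default, so `build()` can only fail here on a missing `id`.
 //
 //     let mut builder = PageServerConfigBuilder::default();
 //     builder.workdir(PathBuf::from("/data/pageserver"));
 //     builder.id(ZNodeId(1));
 //     let conf = builder.build()?; // errors with "missing id" if id() is never called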
 /// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
@@ -188,22 +392,6 @@ impl PageServerConf {
         self.tenants_path().join(tenantid.to_string())
     }
 
-    pub fn tags_path(&self, tenantid: &ZTenantId) -> PathBuf {
-        self.tenant_path(tenantid).join("refs").join("tags")
-    }
-
-    pub fn tag_path(&self, tag_name: &str, tenantid: &ZTenantId) -> PathBuf {
-        self.tags_path(tenantid).join(tag_name)
-    }
-
-    pub fn branches_path(&self, tenantid: &ZTenantId) -> PathBuf {
-        self.tenant_path(tenantid).join("refs").join("branches")
-    }
-
-    pub fn branch_path(&self, branch_name: &str, tenantid: &ZTenantId) -> PathBuf {
-        self.branches_path(tenantid).join(branch_name)
-    }
-
     pub fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf {
         self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME)
     }
@@ -212,10 +400,6 @@ impl PageServerConf {
         self.timelines_path(tenantid).join(timelineid.to_string())
     }
 
-    pub fn ancestor_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
-        self.timeline_path(timelineid, tenantid).join("ancestor")
-    }
-
     //
     // Postgres distribution paths
     //
@@ -233,61 +417,41 @@ impl PageServerConf {
     ///
     /// This leaves any options not present in the file in the built-in defaults.
     pub fn parse_and_validate(toml: &Document, workdir: &Path) -> Result<Self> {
-        use defaults::*;
-
-        let mut conf = PageServerConf {
-            workdir: workdir.to_path_buf(),
-
-            listen_pg_addr: DEFAULT_PG_LISTEN_ADDR.to_string(),
-            listen_http_addr: DEFAULT_HTTP_LISTEN_ADDR.to_string(),
-            checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
-            checkpoint_period: humantime::parse_duration(DEFAULT_CHECKPOINT_PERIOD)?,
-            gc_horizon: DEFAULT_GC_HORIZON,
-            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)?,
-            wait_lsn_timeout: humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)?,
-            wal_redo_timeout: humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)?,
-            page_cache_size: DEFAULT_PAGE_CACHE_SIZE,
-            max_file_descriptors: DEFAULT_MAX_FILE_DESCRIPTORS,
-
-            pg_distrib_dir: PathBuf::new(),
-            auth_validation_public_key_path: None,
-            auth_type: AuthType::Trust,
-
-            remote_storage_config: None,
-
-            superuser: DEFAULT_SUPERUSER.to_string(),
-        };
+        let mut builder = PageServerConfigBuilder::default();
+        builder.workdir(workdir.to_owned());
 
         for (key, item) in toml.iter() {
             match key {
-                "listen_pg_addr" => conf.listen_pg_addr = parse_toml_string(key, item)?,
-                "listen_http_addr" => conf.listen_http_addr = parse_toml_string(key, item)?,
-                "checkpoint_distance" => conf.checkpoint_distance = parse_toml_u64(key, item)?,
-                "checkpoint_period" => conf.checkpoint_period = parse_toml_duration(key, item)?,
-                "gc_horizon" => conf.gc_horizon = parse_toml_u64(key, item)?,
-                "gc_period" => conf.gc_period = parse_toml_duration(key, item)?,
-                "wait_lsn_timeout" => conf.wait_lsn_timeout = parse_toml_duration(key, item)?,
-                "wal_redo_timeout" => conf.wal_redo_timeout = parse_toml_duration(key, item)?,
-                "initial_superuser_name" => conf.superuser = parse_toml_string(key, item)?,
-                "page_cache_size" => conf.page_cache_size = parse_toml_u64(key, item)? as usize,
+                "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?),
+                "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?),
+                "checkpoint_distance" => builder.checkpoint_distance(parse_toml_u64(key, item)?),
+                "checkpoint_period" => builder.checkpoint_period(parse_toml_duration(key, item)?),
+                "gc_horizon" => builder.gc_horizon(parse_toml_u64(key, item)?),
+                "gc_period" => builder.gc_period(parse_toml_duration(key, item)?),
+                "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?),
+                "wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?),
+                "initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?),
+                "page_cache_size" => builder.page_cache_size(parse_toml_u64(key, item)? as usize),
                 "max_file_descriptors" => {
-                    conf.max_file_descriptors = parse_toml_u64(key, item)? as usize
+                    builder.max_file_descriptors(parse_toml_u64(key, item)? as usize)
                 }
                 "pg_distrib_dir" => {
-                    conf.pg_distrib_dir = PathBuf::from(parse_toml_string(key, item)?)
+                    builder.pg_distrib_dir(PathBuf::from(parse_toml_string(key, item)?))
                 }
-                "auth_validation_public_key_path" => {
-                    conf.auth_validation_public_key_path =
-                        Some(PathBuf::from(parse_toml_string(key, item)?))
-                }
-                "auth_type" => conf.auth_type = parse_toml_auth_type(key, item)?,
+                "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some(
+                    PathBuf::from(parse_toml_string(key, item)?),
+                )),
+                "auth_type" => builder.auth_type(parse_toml_auth_type(key, item)?),
                 "remote_storage" => {
-                    conf.remote_storage_config = Some(Self::parse_remote_storage_config(item)?)
+                    builder.remote_storage_config(Some(Self::parse_remote_storage_config(item)?))
                 }
+                "id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)),
                 _ => bail!("unrecognized pageserver option '{}'", key),
             }
         }
 
+        let mut conf = builder.build().context("invalid config")?;
+
         if conf.auth_type == AuthType::ZenithJWT {
             let auth_validation_public_key_path = conf
                 .auth_validation_public_key_path
@@ -301,9 +465,6 @@ impl PageServerConf {
             );
         }
 
-        if conf.pg_distrib_dir == PathBuf::new() {
-            conf.pg_distrib_dir = env::current_dir()?.join("tmp_install")
-        };
         if !conf.pg_distrib_dir.join("bin/postgres").exists() {
             bail!(
                 "Can't find postgres binary at {}",
@@ -398,6 +559,7 @@ impl PageServerConf {
     #[cfg(test)]
     pub fn dummy_conf(repo_dir: PathBuf) -> Self {
         PageServerConf {
+            id: ZNodeId(0),
            checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
            checkpoint_period: Duration::from_secs(10),
            gc_horizon: defaults::DEFAULT_GC_HORIZON,
@@ -482,15 +644,16 @@ max_file_descriptors = 333
 
 # initial superuser role name to use when creating a new tenant
 initial_superuser_name = 'zzzz'
+id = 10
 
-    "#;
+"#;
 
     #[test]
     fn parse_defaults() -> anyhow::Result<()> {
         let tempdir = tempdir()?;
         let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
 
         // we have to create dummy pathes to overcome the validation errors
-        let config_string = format!("pg_distrib_dir='{}'", pg_distrib_dir.display());
+        let config_string = format!("pg_distrib_dir='{}'\nid=10", pg_distrib_dir.display());
 
         let toml = config_string.parse()?;
 
         let parsed_config =
@@ -501,6 +664,7 @@ initial_superuser_name = 'zzzz'
         assert_eq!(
             parsed_config,
             PageServerConf {
+                id: ZNodeId(10),
                 listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
                 listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
                 checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
@@ -544,6 +708,7 @@ initial_superuser_name = 'zzzz'
         assert_eq!(
             parsed_config,
             PageServerConf {
+                id: ZNodeId(10),
                 listen_pg_addr: "127.0.0.1:64000".to_string(),
                 listen_http_addr: "127.0.0.1:9898".to_string(),
                 checkpoint_distance: 111,
diff --git a/pageserver/src/http/models.rs b/pageserver/src/http/models.rs
index 6ce377c535..9844e7ea82 100644
--- a/pageserver/src/http/models.rs
+++ b/pageserver/src/http/models.rs
@@ -1,17 +1,124 @@
+use crate::timelines::TimelineInfo;
+use anyhow::{anyhow, bail, Context};
 use serde::{Deserialize, Serialize};
-
-use crate::ZTenantId;
+use zenith_utils::{
+    lsn::Lsn,
+    zid::{HexZTenantId, HexZTimelineId, ZNodeId, ZTenantId, ZTimelineId},
+};
 
 #[derive(Serialize, Deserialize)]
-pub struct BranchCreateRequest {
-    #[serde(with = "hex")]
-    pub tenant_id: ZTenantId,
-    pub name: String,
-    pub start_point: String,
+pub struct TimelineCreateRequest {
+    pub new_timeline_id: Option<HexZTimelineId>,
+    pub ancestor_timeline_id: Option<HexZTimelineId>,
+    pub ancestor_start_lsn: Option<Lsn>,
 }
 
 #[derive(Serialize, Deserialize)]
 pub struct TenantCreateRequest {
-    #[serde(with = "hex")]
-    pub tenant_id: ZTenantId,
+    pub new_tenant_id: Option<HexZTenantId>,
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct TimelineInfoResponse {
+    pub kind: String,
+    #[serde(with = "hex")]
+    timeline_id: ZTimelineId,
+    #[serde(with = "hex")]
+    tenant_id: ZTenantId,
+    disk_consistent_lsn: String,
+    last_record_lsn: Option<String>,
+    prev_record_lsn: Option<String>,
+    ancestor_timeline_id: Option<HexZTimelineId>,
+    ancestor_lsn: Option<String>,
+    current_logical_size: Option<usize>,
+    current_logical_size_non_incremental: Option<usize>,
+}
+
+impl From<TimelineInfo> for TimelineInfoResponse {
+    fn from(other: TimelineInfo) -> Self {
+        match other {
+            TimelineInfo::Local {
+                timeline_id,
+                tenant_id,
+                last_record_lsn,
+                prev_record_lsn,
+                ancestor_timeline_id,
+                ancestor_lsn,
+                disk_consistent_lsn,
+                current_logical_size,
+                current_logical_size_non_incremental,
+            } => TimelineInfoResponse {
+                kind: "Local".to_owned(),
+                timeline_id,
+                tenant_id,
+                disk_consistent_lsn: disk_consistent_lsn.to_string(),
+                last_record_lsn: Some(last_record_lsn.to_string()),
+                prev_record_lsn: Some(prev_record_lsn.to_string()),
+                ancestor_timeline_id: ancestor_timeline_id.map(HexZTimelineId::from),
+                ancestor_lsn: ancestor_lsn.map(|lsn| lsn.to_string()),
+                current_logical_size: Some(current_logical_size),
+                current_logical_size_non_incremental,
+            },
+            TimelineInfo::Remote {
+                timeline_id,
+                tenant_id,
+                disk_consistent_lsn,
+            } => TimelineInfoResponse {
+                kind: "Remote".to_owned(),
+                timeline_id,
+                tenant_id,
+                disk_consistent_lsn: disk_consistent_lsn.to_string(),
+                last_record_lsn: None,
+                prev_record_lsn: None,
+                ancestor_timeline_id: None,
+                ancestor_lsn: None,
+                current_logical_size: None,
+                current_logical_size_non_incremental: None,
+            },
+        }
+    }
+}
+
+impl TryFrom<TimelineInfoResponse> for TimelineInfo {
+    type Error = anyhow::Error;
+
+    fn try_from(other: TimelineInfoResponse) -> anyhow::Result<Self> {
+        let parse_lsn_hex_string = |lsn_string: String| {
+            lsn_string
+                .parse::<Lsn>()
+                .with_context(|| format!("Failed to parse Lsn as hex string from '{}'", lsn_string))
+        };
+
+        let disk_consistent_lsn = parse_lsn_hex_string(other.disk_consistent_lsn)?;
+        Ok(match other.kind.as_str() {
+            "Local" => TimelineInfo::Local {
+                timeline_id: other.timeline_id,
+                tenant_id: other.tenant_id,
+                last_record_lsn: other
+                    .last_record_lsn
+                    .ok_or(anyhow!("Local timeline should have last_record_lsn"))
+                    .and_then(parse_lsn_hex_string)?,
+                prev_record_lsn: other
+                    .prev_record_lsn
+                    .ok_or(anyhow!("Local timeline should have prev_record_lsn"))
+                    .and_then(parse_lsn_hex_string)?,
+                ancestor_timeline_id: other.ancestor_timeline_id.map(ZTimelineId::from),
+                ancestor_lsn: other.ancestor_lsn.map(parse_lsn_hex_string).transpose()?,
+                disk_consistent_lsn,
+                current_logical_size: other
+                    .current_logical_size
+                    .ok_or(anyhow!("Local timeline should have current_logical_size"))?,
+                current_logical_size_non_incremental: other.current_logical_size_non_incremental,
+            },
+            "Remote" => TimelineInfo::Remote {
+                timeline_id: other.timeline_id,
+                tenant_id: other.tenant_id,
+                disk_consistent_lsn,
+            },
+            unknown => bail!("Unknown timeline kind: {}", unknown),
+        })
+    }
+}
+
+#[derive(Serialize)]
+pub struct StatusResponse {
+    pub id: ZNodeId,
+}
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index dcb81849e0..d322b051a6 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -17,7 +17,12 @@ paths:
             application/json:
               schema:
                 type: object
-  /v1/timeline/{tenant_id}:
+                required:
+                  - id
+                properties:
+                  id:
+                    type: integer
+  /v1/tenant/{tenant_id}/timeline:
     parameters:
       - name: tenant_id
         in: path
         required: true
         schema:
           type: string
           format: hex
+      - name: include-non-incremental-logical-size
+        in: query
+        schema:
+          type: string
+        description: Controls calculation of current_logical_size_non_incremental
     get:
-      description: List tenant timelines
+      description: Get timelines for tenant
      responses:
         "200":
-          description: array of brief timeline descriptions
+          description: TimelineInfo
          content:
            application/json:
              schema:
                 type: array
                 items:
-                  # currently, just a timeline id string, but when remote index gets to be accessed
-                  # remote/local timeline field would be added at least
-                  type: string
+                  $ref: "#/components/schemas/TimelineInfo"
         "400":
           description: Error when no tenant id found in path
           content:
             application/json:
               schema:
                 $ref: "#/components/schemas/Error"
         "401":
           description: Unauthorized Error
           content:
             application/json:
               schema:
                 $ref: "#/components/schemas/UnauthorizedError"
         "403":
           description: Forbidden Error
           content:
             application/json:
               schema:
                 $ref: "#/components/schemas/ForbiddenError"
         "500":
           description: Generic operation error
           content:
             application/json:
               schema:
                 $ref: "#/components/schemas/Error"
-  /v1/timeline/{tenant_id}/{timeline_id}:
+  /v1/tenant/{tenant_id}/timeline/{timeline_id}:
     parameters:
       - name: tenant_id
         in: path
         required: true
         schema:
           type: string
           format: hex
       - name: timeline_id
         in: path
         required: true
         schema:
           type: string
           format: hex
+      - name: include-non-incremental-logical-size
+        in: query
+        schema:
+          type: string
+        description: Controls calculation of current_logical_size_non_incremental
     get:
-      description: Get timeline info for tenant's remote timeline
+      description: Get info about the timeline
       responses:
         "200":
           description: TimelineInfo
           content:
             application/json:
               schema:
                 $ref: "#/components/schemas/TimelineInfo"
         "400":
-          description: Error when no tenant id found in path or no branch name
+          description: Error when no tenant id found in path or no timeline id
           content:
             application/json:
               schema:
                 $ref: "#/components/schemas/Error"
         "401":
           description: Unauthorized Error
           content:
             application/json:
               schema:
                 $ref: "#/components/schemas/UnauthorizedError"
         "403":
           description: Forbidden Error
           content:
             application/json:
               schema:
                 $ref: "#/components/schemas/ForbiddenError"
         "500":
           description: Generic operation error
           content:
             application/json:
               schema:
                 $ref: "#/components/schemas/Error"
-  /v1/branch/{tenant_id}:
+  /v1/tenant/{tenant_id}/timeline/:
     parameters:
       - name: tenant_id
         in: path
         required: true
         schema:
           type: string
           format: hex
-      - name: include-non-incremental-logical-size
-        in: query
-        schema:
-          type: string
-        description: Controls calculation of current_logical_size_non_incremental
-    get:
-      description: Get branches for tenant
-      responses:
-        "200":
-          description: BranchInfo
-          content:
-            application/json:
-              schema:
-                type: array
-                items:
-                  $ref: "#/components/schemas/BranchInfo"
-        "400":
-          description: Error when no tenant id found in path
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
$ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - /v1/branch/{tenant_id}/{branch_name}: - parameters: - - name: tenant_id - in: path - required: true - schema: - type: string - format: hex - - name: branch_name - in: path - required: true - schema: - type: string - - name: include-non-incremental-logical-size - in: query - schema: - type: string - description: Controls calculation of current_logical_size_non_incremental - get: - description: Get branches for tenant - responses: - "200": - description: BranchInfo - content: - application/json: - schema: - $ref: "#/components/schemas/BranchInfo" - "400": - description: Error when no tenant id found in path or no branch name - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - "401": - description: Unauthorized Error - content: - application/json: - schema: - $ref: "#/components/schemas/UnauthorizedError" - "403": - description: Forbidden Error - content: - application/json: - schema: - $ref: "#/components/schemas/ForbiddenError" - "500": - description: Generic operation error - content: - application/json: - schema: - $ref: "#/components/schemas/Error" - /v1/branch/: post: - description: Create branch + description: | + Create a timeline. Returns new timeline id on success.\ + If no new timeline id is specified in parameters, it would be generated. It's an error to recreate the same timeline. requestBody: content: application/json: schema: type: object - required: - - "tenant_id" - - "name" - - "start_point" properties: - tenant_id: + new_timeline_id: type: string format: hex - name: + ancestor_timeline_id: type: string - start_point: + format: hex + ancestor_start_lsn: type: string responses: "201": - description: BranchInfo + description: TimelineInfo content: application/json: schema: - $ref: "#/components/schemas/BranchInfo" + $ref: "#/components/schemas/TimelineInfo" "400": - description: Malformed branch create request + description: Malformed timeline create request content: application/json: schema: @@ -253,6 +173,12 @@ paths: application/json: schema: $ref: "#/components/schemas/ForbiddenError" + "409": + description: Timeline already exists, creation skipped + content: + application/json: + schema: + $ref: "#/components/schemas/AlreadyExistsError" "500": description: Generic operation error content: @@ -290,27 +216,26 @@ paths: schema: $ref: "#/components/schemas/Error" post: - description: Create tenant + description: | + Create a tenant. Returns new tenant id on success.\ + If no new tenant id is specified in parameters, it would be generated. It's an error to recreate the same tenant. 
       requestBody:
         content:
           application/json:
             schema:
               type: object
-              required:
-                - "tenant_id"
               properties:
-                tenant_id:
+                new_tenant_id:
                   type: string
                   format: hex
       responses:
         "201":
-          description: CREATED
+          description: New tenant created successfully
           content:
             application/json:
               schema:
-                type: array
-                items:
-                  type: string
+                type: string
+                format: hex
         "400":
           description: Malformed tenant create request
           content:
             application/json:
               schema:
                 $ref: "#/components/schemas/Error"
         "401":
           description: Unauthorized Error
           content:
             application/json:
               schema:
                 $ref: "#/components/schemas/UnauthorizedError"
         "403":
           description: Forbidden Error
           content:
             application/json:
               schema:
                 $ref: "#/components/schemas/ForbiddenError"
+        "409":
+          description: Tenant already exists, creation skipped
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/AlreadyExistsError"
         "500":
           description: Generic operation error
           content:
@@ -353,38 +284,11 @@ components:
           type: string
         state:
           type: string
-    BranchInfo:
-      type: object
-      required:
-        - name
-        - timeline_id
-        - latest_valid_lsn
-        - current_logical_size
-      properties:
-        name:
-          type: string
-        timeline_id:
-          type: string
-          format: hex
-        ancestor_id:
-          type: string
-          format: hex
-        ancestor_lsn:
-          type: string
-        current_logical_size:
-          type: integer
-        current_logical_size_non_incremental:
-          type: integer
-        latest_valid_lsn:
-          type: integer
     TimelineInfo:
       type: object
       required:
         - timeline_id
         - tenant_id
-        - last_record_lsn
-        - prev_record_lsn
-        - start_lsn
         - disk_consistent_lsn
       properties:
         timeline_id:
           type: string
           format: hex
         tenant_id:
           type: string
           format: hex
-        ancestor_timeline_id:
-          type: string
-          format: hex
         last_record_lsn:
           type: string
         prev_record_lsn:
           type: string
-        start_lsn:
+        ancestor_timeline_id:
+          type: string
+          format: hex
+        ancestor_lsn:
           type: string
         disk_consistent_lsn:
           type: string
-        timeline_state:
-          type: string
+        current_logical_size:
+          type: integer
+        current_logical_size_non_incremental:
+          type: integer
 
     Error:
       type: object
@@ -421,6 +327,13 @@ components:
       properties:
         msg:
           type: string
+    AlreadyExistsError:
+      type: object
+      required:
+        - msg
+      properties:
+        msg:
+          type: string
     ForbiddenError:
       type: object
       required:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index b13a45750e..8365601042 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1,10 +1,8 @@
 use std::sync::Arc;
 
-use anyhow::{Context, Result};
-use hyper::header;
+use anyhow::Result;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
-use serde::Serialize;
 use tracing::*;
 use zenith_utils::auth::JwtAuth;
 use zenith_utils::http::endpoint::attach_openapi_ui;
@@ -15,19 +13,17 @@ use zenith_utils::http::{
     endpoint,
     error::HttpErrorBody,
     json::{json_request, json_response},
-    request::get_request_param,
     request::parse_request_param,
 };
 use zenith_utils::http::{RequestExt, RouterBuilder};
-use zenith_utils::lsn::Lsn;
-use zenith_utils::zid::{opt_display_serde, ZTimelineId};
+use zenith_utils::zid::{HexZTenantId, ZTimelineId};
 
-use super::models::BranchCreateRequest;
-use super::models::TenantCreateRequest;
-use crate::branches::BranchInfo;
+use super::models::{
+    StatusResponse, TenantCreateRequest, TimelineCreateRequest, TimelineInfoResponse,
+};
 use crate::repository::RepositoryTimeline;
-use crate::repository::TimelineSyncState;
-use crate::{branches, config::PageServerConf, tenant_mgr, ZTenantId};
+use crate::timelines::TimelineInfo;
+use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId};
 
 #[derive(Debug)]
 struct State {
@@ -64,31 +60,53 @@ fn get_config(request: &Request<Body>) -> &'static PageServerConf {
 }
 
 // healthcheck handler
-async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
-    Ok(Response::builder()
-        .status(StatusCode::OK)
-        .header(header::CONTENT_TYPE, "application/json")
-        .body(Body::from("{}"))
-        .map_err(ApiError::from_err)?)
+async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let config = get_config(&request);
+    Ok(json_response(
+        StatusCode::OK,
+        StatusResponse { id: config.id },
+    )?)
 }
 
-async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let request_data: BranchCreateRequest = json_request(&mut request).await?;
+async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
+    let request_data: TimelineCreateRequest = json_request(&mut request).await?;
 
-    check_permission(&request, Some(request_data.tenant_id))?;
+    check_permission(&request, Some(tenant_id))?;
 
-    let response_data = tokio::task::spawn_blocking(move || {
-        let _enter = info_span!("/branch_create", name = %request_data.name, tenant = %request_data.tenant_id, startpoint=%request_data.start_point).entered();
-        branches::create_branch(
+    let new_timeline_info = tokio::task::spawn_blocking(move || {
+        let _enter = info_span!("/timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn).entered();
+        timelines::create_timeline(
             get_config(&request),
-            &request_data.name,
-            &request_data.start_point,
-            &request_data.tenant_id,
+            tenant_id,
+            request_data.new_timeline_id.map(ZTimelineId::from),
+            request_data.ancestor_timeline_id.map(ZTimelineId::from),
+            request_data.ancestor_start_lsn,
         )
     })
     .await
     .map_err(ApiError::from_err)??;
-    Ok(json_response(StatusCode::CREATED, response_data)?)
+
+    Ok(match new_timeline_info {
+        Some(info) => json_response(StatusCode::CREATED, TimelineInfoResponse::from(info))?,
+        None => json_response(StatusCode::CONFLICT, ())?,
+    })
+}
+
+async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
+    let response_data: Vec<TimelineInfoResponse> = tokio::task::spawn_blocking(move || {
+        let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
+        crate::timelines::get_timelines(tenant_id, include_non_incremental_logical_size)
+    })
+    .await
+    .map_err(ApiError::from_err)??
+    .into_iter()
+    .map(TimelineInfoResponse::from)
+    .collect();
+    Ok(json_response(StatusCode::OK, response_data)?)
 }
 
 // Gate non incremental logical size calculation behind a flag
@@ -106,113 +124,6 @@ fn get_include_non_incremental_logical_size(request: &Request<Body>) -> bool {
         .unwrap_or(false)
 }
 
-async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
-
-    let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
-
-    check_permission(&request, Some(tenantid))?;
-
-    let response_data = tokio::task::spawn_blocking(move || {
-        let _enter = info_span!("branch_list", tenant = %tenantid).entered();
-        crate::branches::get_branches(
-            get_config(&request),
-            &tenantid,
-            include_non_incremental_logical_size,
-        )
-    })
-    .await
-    .map_err(ApiError::from_err)??;
-    Ok(json_response(StatusCode::OK, response_data)?)
-} - -async fn branch_detail_handler(request: Request) -> Result, ApiError> { - let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?; - let branch_name: String = get_request_param(&request, "branch_name")?.to_string(); - let conf = get_state(&request).conf; - let path = conf.branch_path(&branch_name, &tenantid); - - let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request); - - let response_data = tokio::task::spawn_blocking(move || { - let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered(); - let repo = tenant_mgr::get_repository_for_tenant(tenantid)?; - BranchInfo::from_path(path, &repo, include_non_incremental_logical_size) - }) - .await - .map_err(ApiError::from_err)??; - - Ok(json_response(StatusCode::OK, response_data)?) -} - -async fn timeline_list_handler(request: Request) -> Result, ApiError> { - let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?; - check_permission(&request, Some(tenant_id))?; - - let conf = get_state(&request).conf; - let timelines_dir = conf.timelines_path(&tenant_id); - - let mut timelines_dir_contents = - tokio::fs::read_dir(&timelines_dir).await.with_context(|| { - format!( - "Failed to list timelines dir '{}' contents", - timelines_dir.display() - ) - })?; - - let mut local_timelines = Vec::new(); - while let Some(entry) = timelines_dir_contents.next_entry().await.with_context(|| { - format!( - "Failed to list timelines dir '{}' contents", - timelines_dir.display() - ) - })? { - let entry_path = entry.path(); - let entry_type = entry.file_type().await.with_context(|| { - format!( - "Failed to get file type of timeline dirs' entry '{}'", - entry_path.display() - ) - })?; - - if entry_type.is_dir() { - match entry.file_name().to_string_lossy().parse::() { - Ok(timeline_id) => local_timelines.push(timeline_id.to_string()), - Err(e) => error!( - "Failed to get parse timeline id from timeline dirs' entry '{}': {}", - entry_path.display(), - e - ), - } - } - } - - Ok(json_response(StatusCode::OK, local_timelines)?) 
-}
-
-#[derive(Debug, Serialize)]
-#[serde(tag = "type")]
-enum TimelineInfo {
-    Local {
-        #[serde(with = "hex")]
-        timeline_id: ZTimelineId,
-        #[serde(with = "hex")]
-        tenant_id: ZTenantId,
-        #[serde(with = "opt_display_serde")]
-        ancestor_timeline_id: Option<ZTimelineId>,
-        last_record_lsn: Lsn,
-        prev_record_lsn: Lsn,
-        disk_consistent_lsn: Lsn,
-        timeline_state: Option<TimelineSyncState>,
-    },
-    Remote {
-        #[serde(with = "hex")]
-        timeline_id: ZTimelineId,
-        #[serde(with = "hex")]
-        tenant_id: ZTenantId,
-    },
-}
-
 async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
     let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
     check_permission(&request, Some(tenant_id))?;
@@ -224,24 +135,17 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
-        Ok::<_, anyhow::Error>(match repo.get_timeline(timeline_id)?.local_timeline() {
-            None => TimelineInfo::Remote {
-                timeline_id,
-                tenant_id,
-            },
-            Some(timeline) => TimelineInfo::Local {
-                timeline_id,
-                tenant_id,
-                ancestor_timeline_id: timeline.get_ancestor_timeline_id(),
-                disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
-                last_record_lsn: timeline.get_last_record_lsn(),
-                prev_record_lsn: timeline.get_prev_record_lsn(),
-                timeline_state: repo.get_timeline_state(timeline_id),
-            },
-        })
+        let include_non_incremental_logical_size =
+            get_include_non_incremental_logical_size(&request);
+        Ok::<_, anyhow::Error>(TimelineInfo::from_repo_timeline(
+            tenant_id,
+            repo.get_timeline(timeline_id)?,
+            include_non_incremental_logical_size,
+        ))
     })
     .await
-    .map_err(ApiError::from_err)??;
+    .map_err(ApiError::from_err)?
+    .map(TimelineInfoResponse::from)?;
 
     Ok(json_response(StatusCode::OK, response_data)?)
 }
@@ -258,7 +162,7 @@ async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
-            RepositoryTimeline::Local(_) => {
+            RepositoryTimeline::Local { .. } => {
                 anyhow::bail!("Timeline with id {} is already local", timeline_id)
             }
             RepositoryTimeline::Remote {
@@ -318,13 +222,20 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    Ok(match response_data {
+        Some(id) => json_response(StatusCode::CREATED, HexZTenantId::from(id))?,
+        None => json_response(StatusCode::CONFLICT, ())?,
+    })
 }
 
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -354,23 +265,21 @@ pub fn make_router(
     router
         .data(Arc::new(State::new(conf, auth)))
         .get("/v1/status", status_handler)
-        .get("/v1/timeline/:tenant_id", timeline_list_handler)
+        .get("/v1/tenant", tenant_list_handler)
+        .post("/v1/tenant", tenant_create_handler)
+        .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
+        .post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
         .get(
-            "/v1/timeline/:tenant_id/:timeline_id",
+            "/v1/tenant/:tenant_id/timeline/:timeline_id",
             timeline_detail_handler,
         )
         .post(
-            "/v1/timeline/:tenant_id/:timeline_id/attach",
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/attach",
            timeline_attach_handler,
         )
         .post(
-            "/v1/timeline/:tenant_id/:timeline_id/detach",
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/detach",
            timeline_detach_handler,
         )
-        .get("/v1/branch/:tenant_id", branch_list_handler)
-        .get("/v1/branch/:tenant_id/:branch_name", branch_detail_handler)
-        .post("/v1/branch", branch_create_handler)
-        .get("/v1/tenant", tenant_list_handler)
-        .post("/v1/tenant", tenant_create_handler)
         .any(handler_404)
 }
diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs
index 5dae1902c1..9e0df5dab2 100644
--- a/pageserver/src/layered_repository.rs
+++ b/pageserver/src/layered_repository.rs
@@ -47,10 +47,8 @@ use crate::walredo::WalRedoManager;
 use crate::CheckpointConfig;
 use crate::{ZTenantId, ZTimelineId};
 
-use zenith_metrics::{
-    register_histogram, register_int_gauge_vec, Histogram, IntGauge, IntGaugeVec,
-};
-use zenith_metrics::{register_histogram_vec, HistogramVec};
+use zenith_metrics::{register_histogram_vec, Histogram, HistogramVec};
+use zenith_metrics::{register_int_gauge_vec, IntGauge, IntGaugeVec};
 use zenith_utils::crashsafe_dir;
 use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn};
 use zenith_utils::seqwait::SeqWait;
@@ -87,16 +85,17 @@ lazy_static! {
     static ref STORAGE_TIME: HistogramVec = register_histogram_vec!(
         "pageserver_storage_time",
         "Time spent on storage operations",
-        &["operation"]
+        &["operation", "tenant_id", "timeline_id"]
     )
     .expect("failed to define a metric");
 }
 
 // Metrics collected on operations on the storage repository.
 lazy_static! {
-    static ref RECONSTRUCT_TIME: Histogram = register_histogram!(
+    static ref RECONSTRUCT_TIME: HistogramVec = register_histogram_vec!(
         "pageserver_getpage_reconstruct_time",
-        "FIXME Time spent on storage operations"
+        "Time spent on storage operations",
+        &["tenant_id", "timeline_id"]
     )
     .expect("failed to define a metric");
 }
@@ -137,19 +136,20 @@ pub struct LayeredRepository {
 /// Public interface
 impl Repository for LayeredRepository {
     fn get_timeline(&self, timelineid: ZTimelineId) -> Result<RepositoryTimeline> {
-        let mut timelines = self.timelines.lock().unwrap();
-        Ok(
-            match self.get_or_init_timeline(timelineid, &mut timelines)? {
-                LayeredTimelineEntry::Local(local) => RepositoryTimeline::Local(local),
-                LayeredTimelineEntry::Remote {
-                    id,
-                    disk_consistent_lsn,
-                } => RepositoryTimeline::Remote {
-                    id,
-                    disk_consistent_lsn,
-                },
-            },
-        )
+        Ok(RepositoryTimeline::from(self.get_or_init_timeline(
+            timelineid,
+            &mut self.timelines.lock().unwrap(),
+        )?))
+    }
+
+    fn list_timelines(&self) -> Result<Vec<RepositoryTimeline>> {
+        Ok(self
+            .timelines
+            .lock()
+            .unwrap()
+            .values()
+            .map(|timeline_entry| RepositoryTimeline::from(timeline_entry.clone()))
+            .collect())
     }
 
     fn create_empty_timeline(
@@ -247,8 +247,12 @@ impl Repository for LayeredRepository {
         horizon: u64,
         checkpoint_before_gc: bool,
     ) -> Result<GcResult> {
+        let timeline_str = target_timelineid
+            .map(|x| x.to_string())
+            .unwrap_or_else(|| "-".to_string());
+
         STORAGE_TIME
-            .with_label_values(&["gc"])
+            .with_label_values(&["gc", &self.tenantid.to_string(), &timeline_str])
             .observe_closure_duration(|| {
                 self.gc_iteration_internal(target_timelineid, horizon, checkpoint_before_gc)
             })
@@ -428,6 +432,24 @@ impl LayeredTimelineEntry {
     }
 }
 
+impl From<LayeredTimelineEntry> for RepositoryTimeline {
+    fn from(layered_timeline: LayeredTimelineEntry) -> Self {
+        match layered_timeline {
+            LayeredTimelineEntry::Local(timeline) => RepositoryTimeline::Local {
+                id: timeline.timelineid,
+                timeline,
+            },
+            LayeredTimelineEntry::Remote {
+                id,
+                disk_consistent_lsn,
+            } => RepositoryTimeline::Remote {
+                id,
+                disk_consistent_lsn,
+            },
+        }
+    }
+}
+
 /// Private functions
 impl LayeredRepository {
     // Implementation of the public `get_timeline` function. This differs from the public
@@ -762,6 +784,12 @@ pub struct LayeredTimeline {
     // ordering for its operations, but involves private modules, and macro trickery
     current_logical_size_gauge: IntGauge,
 
+    // Metrics histograms
+    reconstruct_time_histo: Histogram,
+    checkpoint_time_histo: Histogram,
+    flush_checkpoint_time_histo: Histogram,
+    forced_checkpoint_time_histo: Histogram,
+
     /// If `true`, will backup its files that appear after each checkpointing to the remote storage.
     upload_relishes: AtomicBool,
 
@@ -840,8 +868,7 @@ impl Timeline for LayeredTimeline {
         let (seg, seg_blknum) = SegmentTag::from_blknum(rel, rel_blknum);
 
         if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? {
-            RECONSTRUCT_TIME
-                .observe_closure_duration(|| self.materialize_page(seg, seg_blknum, lsn, &*layer))
+            self.materialize_page(seg, seg_blknum, lsn, &*layer)
         } else {
             // FIXME: This can happen if PostgreSQL extends a relation but never writes
             // the page. See https://github.com/zenithdb/zenith/issues/841
@@ -893,12 +920,11 @@ impl Timeline for LayeredTimeline {
 
         let seg = SegmentTag { rel, segno: 0 };
 
-        let result;
-        if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? {
-            result = layer.get_seg_exists(lsn)?;
+        let result = if let Some((layer, lsn)) = self.get_layer_for_read(seg, lsn)? {
+            layer.get_seg_exists(lsn)?
         } else {
-            result = false;
-        }
+            false
+        };
 
         trace!("get_rel_exists: {} at {} -> {}", rel, lsn, result);
         Ok(result)
@@ -992,14 +1018,14 @@ impl Timeline for LayeredTimeline {
     /// metrics collection.
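     ///
     /// As a concrete (hypothetical) example of how the per-timeline labels added
     /// in this patch are meant to be consumed, a PromQL p99 over the checkpoint
     /// variant of the histogram registered above:
     ///
     ///     histogram_quantile(0.99,
     ///         rate(pageserver_storage_time_bucket{operation="checkpoint"}[5m]))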
     fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()> {
         match cconf {
-            CheckpointConfig::Flush => STORAGE_TIME
-                .with_label_values(&["flush checkpoint"])
+            CheckpointConfig::Flush => self
+                .flush_checkpoint_time_histo
                 .observe_closure_duration(|| self.checkpoint_internal(0, false)),
-            CheckpointConfig::Forced => STORAGE_TIME
-                .with_label_values(&["forced checkpoint"])
+            CheckpointConfig::Forced => self
+                .forced_checkpoint_time_histo
                 .observe_closure_duration(|| self.checkpoint_internal(0, true)),
-            CheckpointConfig::Distance(distance) => STORAGE_TIME
-                .with_label_values(&["checkpoint"])
+            CheckpointConfig::Distance(distance) => self
+                .checkpoint_time_histo
                 .observe_closure_duration(|| self.checkpoint_internal(distance, true)),
         }
     }
@@ -1098,6 +1124,31 @@ impl LayeredTimeline {
         let current_logical_size_gauge = LOGICAL_TIMELINE_SIZE
             .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()])
             .unwrap();
+        let reconstruct_time_histo = RECONSTRUCT_TIME
+            .get_metric_with_label_values(&[&tenantid.to_string(), &timelineid.to_string()])
+            .unwrap();
+        let checkpoint_time_histo = STORAGE_TIME
+            .get_metric_with_label_values(&[
+                "checkpoint",
+                &tenantid.to_string(),
+                &timelineid.to_string(),
+            ])
+            .unwrap();
+        let flush_checkpoint_time_histo = STORAGE_TIME
+            .get_metric_with_label_values(&[
+                "flush checkpoint",
+                &tenantid.to_string(),
+                &timelineid.to_string(),
+            ])
+            .unwrap();
+        let forced_checkpoint_time_histo = STORAGE_TIME
+            .get_metric_with_label_values(&[
+                "forced checkpoint",
+                &tenantid.to_string(),
+                &timelineid.to_string(),
+            ])
+            .unwrap();
+
         LayeredTimeline {
             conf,
             timelineid,
@@ -1117,6 +1168,10 @@ impl LayeredTimeline {
             ancestor_lsn: metadata.ancestor_lsn(),
             current_logical_size: AtomicUsize::new(current_logical_size),
             current_logical_size_gauge,
+            reconstruct_time_histo,
+            checkpoint_time_histo,
+            flush_checkpoint_time_histo,
+            forced_checkpoint_time_histo,
             upload_relishes: AtomicBool::new(upload_relishes),
 
             write_lock: Mutex::new(()),
@@ -1966,17 +2021,19 @@ impl LayeredTimeline {
         let mut layer_ref = layer;
         let mut curr_lsn = lsn;
         loop {
-            let result = layer_ref
-                .get_page_reconstruct_data(seg_blknum, curr_lsn, &mut data)
-                .with_context(|| {
-                    format!(
-                        "Failed to get reconstruct data {} {:?} {} {}",
-                        layer_ref.get_seg_tag(),
-                        layer_ref.filename(),
-                        seg_blknum,
-                        curr_lsn,
-                    )
-                })?;
+            let result = self.reconstruct_time_histo.observe_closure_duration(|| {
+                layer_ref
+                    .get_page_reconstruct_data(seg_blknum, curr_lsn, &mut data)
+                    .with_context(|| {
+                        format!(
+                            "Failed to get reconstruct data {} {:?} {} {}",
+                            layer_ref.get_seg_tag(),
+                            layer_ref.filename(),
+                            seg_blknum,
+                            curr_lsn,
+                        )
+                    })
+            })?;
             match result {
                 PageReconstructResult::Complete => break,
                 PageReconstructResult::Continue(cont_lsn) => {
diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs
index 17b061b20e..6e24bf6022 100644
--- a/pageserver/src/layered_repository/inmemory_layer.rs
+++ b/pageserver/src/layered_repository/inmemory_layer.rs
@@ -170,12 +170,11 @@ impl Layer for InMemoryLayer {
     fn filename(&self) -> PathBuf {
         let inner = self.inner.read().unwrap();
 
-        let end_lsn;
-        if let Some(drop_lsn) = inner.end_lsn {
-            end_lsn = drop_lsn;
+        let end_lsn = if let Some(drop_lsn) = inner.end_lsn {
+            drop_lsn
         } else {
-            end_lsn = Lsn(u64::MAX);
-        }
+            Lsn(u64::MAX)
+        };
 
         let delta_filename = DeltaFileName {
             seg: self.seg,
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 3a68f56187..3d66192c80 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,5 +1,4 @@
 pub mod basebackup;
-pub mod branches;
 pub mod config;
 pub mod http;
 pub mod import_datadir;
@@ -12,6 +11,7 @@ pub mod repository;
 pub mod tenant_mgr;
 pub mod tenant_threads;
 pub mod thread_mgr;
+pub mod timelines;
 pub mod virtual_file;
 pub mod walingest;
 pub mod walreceiver;
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 7dc3c8c752..42a099cca5 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -298,7 +298,7 @@ lazy_static! {
     static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!(
         "pageserver_smgr_query_time",
         "Time spent on smgr query handling",
-        &["smgr_query_type"],
+        &["smgr_query_type", "tenant_id", "timeline_id"],
         TIME_BUCKETS.into()
     )
     .expect("failed to define a metric");
@@ -340,20 +340,22 @@ impl PageServerHandler {
         };
 
         let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
+        let tenant_id = tenantid.to_string();
+        let timeline_id = timelineid.to_string();
 
         let response = match zenith_fe_msg {
             PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
-                .with_label_values(&["get_rel_exists"])
+                .with_label_values(&["get_rel_exists", &tenant_id, &timeline_id])
                 .observe_closure_duration(|| {
                     self.handle_get_rel_exists_request(timeline.as_ref(), &req)
                 }),
             PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
-                .with_label_values(&["get_rel_size"])
+                .with_label_values(&["get_rel_size", &tenant_id, &timeline_id])
                 .observe_closure_duration(|| {
                     self.handle_get_nblocks_request(timeline.as_ref(), &req)
                 }),
             PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
-                .with_label_values(&["get_page_at_lsn"])
+                .with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id])
                 .observe_closure_duration(|| {
                     self.handle_get_page_at_lsn_request(timeline.as_ref(), &req)
                 }),
diff --git a/pageserver/src/remote_storage/README.md b/pageserver/src/remote_storage/README.md
index 1c718acf06..3c77275da8 100644
--- a/pageserver/src/remote_storage/README.md
+++ b/pageserver/src/remote_storage/README.md
@@ -62,11 +62,3 @@ Based on previous evaluation, even `rusoto-s3` could be a better choice over thi
 So far, we don't adjust the remote storage based on GC thread loop results, only checkpointer loop affects the remote storage.
 Index module could be used as a base to implement a deferred GC mechanism, a "defragmentation" that repacks archives into new ones after GC is done removing the files from the archives.
-
-* bracnhes implementaion could be improved
-
-Currently, there's a code to sync the branches along with the timeline files: on upload, every local branch files that are missing remotely are uploaded,
-on the timeline download, missing remote branch files are downlaoded.
-
-A branch is a per-tenant entity, yet a current implementaion requires synchronizing a timeline first to get the branch files locally.
-Currently, there's no other way to know about the remote branch files, neither the file contents is verified and updated.
diff --git a/pageserver/src/remote_storage/storage_sync.rs b/pageserver/src/remote_storage/storage_sync.rs
index 6b588c8e5f..d14f849e15 100644
--- a/pageserver/src/remote_storage/storage_sync.rs
+++ b/pageserver/src/remote_storage/storage_sync.rs
@@ -14,13 +14,6 @@
 //! Only GC removes local timeline files, the GC support is not added to sync currently,
 //! yet downloading extra files is not critically bad at this stage, GC can remove those again.
 //!
-//! Along the timeline files, branch files are uploaded and downloaded every time a corresponding sync task is processed.
-//! For simplicity, branch files are also treated as immutable: only missing files are uploaded or downloaded, no removals, amendments or file contents checks are done.
-//! Also, the branches are copied as separate files, with no extra compressions done.
-//! Despite branches information currently belonging to tenants, a tenants' timeline sync is required to upload or download the branch files, also, there's no way to know
-//! the branch sync state outside of the sync loop.
-//! This implementation is currently considered as temporary and is a subjec to change later.
-//!
 //! During the loop startup, an initial [`RemoteTimelineIndex`] state is constructed via listing the remote storage contents.
 //! It's enough to poll the remote state once on startup only, due to agreement that the pageserver has
 //! an exclusive write access to the remote storage: new files appear in the storage only after the same
@@ -66,7 +59,6 @@
 //! NOTE: No real contents or checksum check happens right now and is a subject to improve later.
 //!
 //! After the whole timeline is downloaded, [`crate::tenant_mgr::set_timeline_states`] function is used to update pageserver memory stage for the timeline processed.
-//! No extra branch registration is done.
 //!
 //! When pageserver signals shutdown, current sync task gets finished and the loop exists.
 
@@ -77,7 +69,7 @@ pub mod index;
 mod upload;
 
 use std::{
-    collections::{BTreeSet, HashMap, HashSet, VecDeque},
+    collections::{BTreeSet, HashMap, VecDeque},
     num::{NonZeroU32, NonZeroUsize},
     path::{Path, PathBuf},
     sync::Arc,
@@ -87,7 +79,6 @@ use anyhow::{bail, Context};
 use futures::stream::{FuturesUnordered, StreamExt};
 use lazy_static::lazy_static;
 use tokio::{
-    fs,
     runtime::Runtime,
     sync::{
         mpsc::{self, UnboundedReceiver},
@@ -101,8 +92,7 @@ use self::{
     compression::ArchiveHeader,
     download::{download_timeline, DownloadedTimeline},
     index::{
-        ArchiveDescription, ArchiveId, RelativePath, RemoteTimeline, RemoteTimelineIndex,
-        TimelineIndexEntry,
+        ArchiveDescription, ArchiveId, RemoteTimeline, RemoteTimelineIndex, TimelineIndexEntry,
     },
     upload::upload_timeline_checkpoint,
 };
@@ -843,28 +833,6 @@ async fn download_archive_header<
     Ok(header)
 }
 
-async fn tenant_branch_files(
-    conf: &'static PageServerConf,
-    tenant_id: ZTenantId,
-) -> anyhow::Result<HashSet<RelativePath>> {
-    let branches_dir = conf.branches_path(&tenant_id);
-    if !branches_dir.exists() {
-        return Ok(HashSet::new());
-    }
-
-    let mut branch_entries = fs::read_dir(&branches_dir)
-        .await
-        .context("Failed to list tenant branches dir contents")?;
-
-    let mut branch_files = HashSet::new();
-    while let Some(branch_entry) = branch_entries.next_entry().await? {
-        if branch_entry.file_type().await?.is_file() {
-            branch_files.insert(RelativePath::new(&branches_dir, branch_entry.path())?);
-        }
-    }
-    Ok(branch_files)
-}
-
 #[cfg(test)]
 mod test_utils {
     use std::{
@@ -971,30 +939,9 @@ mod test_utils {
             "Index contains unexpected sync ids"
         );
 
-        let mut actual_branches = BTreeMap::new();
-        let mut expected_branches = BTreeMap::new();
         let mut actual_timeline_entries = BTreeMap::new();
         let mut expected_timeline_entries = BTreeMap::new();
         for sync_id in actual_sync_ids {
-            actual_branches.insert(
-                sync_id.tenant_id,
-                index_read
-                    .branch_files(sync_id.tenant_id)
-                    .into_iter()
-                    .flat_map(|branch_paths| branch_paths.iter())
-                    .cloned()
-                    .collect::<BTreeSet<_>>(),
-            );
-            expected_branches.insert(
-                sync_id.tenant_id,
-                expected_index_with_descriptions
-                    .branch_files(sync_id.tenant_id)
-                    .into_iter()
-                    .flat_map(|branch_paths| branch_paths.iter())
-                    .cloned()
-                    .collect::<BTreeSet<_>>(),
-            );
-
             actual_timeline_entries.insert(
                 sync_id,
                 index_read.timeline_entry(&sync_id).unwrap().clone(),
@@ -1009,11 +956,6 @@ mod test_utils {
         }
         drop(index_read);
 
-        assert_eq!(
-            actual_branches, expected_branches,
-            "Index contains unexpected branches"
-        );
-
         for (sync_id, actual_timeline_entry) in actual_timeline_entries {
             let expected_timeline_description = expected_timeline_entries
                 .remove(&sync_id)
diff --git a/pageserver/src/remote_storage/storage_sync/download.rs b/pageserver/src/remote_storage/storage_sync/download.rs
index f268fc442a..00115ba8d5 100644
--- a/pageserver/src/remote_storage/storage_sync/download.rs
+++ b/pageserver/src/remote_storage/storage_sync/download.rs
@@ -1,10 +1,8 @@
 //! Timeline synchrnonization logic to put files from archives on remote storage into pageserver's local directory.
-//! Currently, tenant branch files are also downloaded, but this does not appear final.
 
 use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc};
 
 use anyhow::{ensure, Context};
-use futures::{stream::FuturesUnordered, StreamExt};
 use tokio::{fs, sync::RwLock};
 use tracing::{debug, error, trace, warn};
 use zenith_utils::{lsn::Lsn, zid::ZTenantId};
@@ -14,8 +12,8 @@ use crate::{
     layered_repository::metadata::{metadata_path, TimelineMetadata},
     remote_storage::{
         storage_sync::{
-            compression, index::TimelineIndexEntry, sync_queue, tenant_branch_files,
-            update_index_description, SyncKind, SyncTask,
+            compression, index::TimelineIndexEntry, sync_queue, update_index_description, SyncKind,
+            SyncTask,
         },
         RemoteStorage, ZTenantTimelineId,
     },
@@ -42,8 +40,6 @@ pub(super) enum DownloadedTimeline {
 /// Timeline files that already exist locally are skipped during the download, but the local metadata file is
 /// updated in the end of every checkpoint archive extraction.
 ///
-/// Before any archives are considered, the branch files are checked locally and remotely, all remote-only files are downloaded.
-///
 /// On an error, bumps the retries count and reschedules the download, with updated archive skip list
 /// (for any new successful archive downloads and extractions).
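 ///
 /// As a sketch of what "reschedules" means concretely: the failure paths in the
 /// body below re-queue the very same task, e.g.
 ///
 ///     sync_queue::push(SyncTask::new(
 ///         sync_id,
 ///         retries,
 ///         SyncKind::Download(download),
 ///     ));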
pub(super) async fn download_timeline< @@ -113,22 +109,6 @@ pub(super) async fn download_timeline< } }; - if let Err(e) = download_missing_branches(conf, remote_assets.as_ref(), sync_id.tenant_id).await - { - error!( - "Failed to download missing branches for sync id {}: {:?}", - sync_id, e - ); - sync_queue::push(SyncTask::new( - sync_id, - retries, - SyncKind::Download(download), - )); - return DownloadedTimeline::FailedAndRescheduled { - disk_consistent_lsn, - }; - } - debug!("Downloading timeline archives"); let archives_to_download = remote_timeline .checkpoints() @@ -250,82 +230,6 @@ async fn read_local_metadata( .context("Failed to read local metadata files bytes")?) } -async fn download_missing_branches< - P: std::fmt::Debug + Send + Sync + 'static, - S: RemoteStorage + Send + Sync + 'static, ->( - conf: &'static PageServerConf, - (storage, index): &(S, RwLock), - tenant_id: ZTenantId, -) -> anyhow::Result<()> { - let local_branches = tenant_branch_files(conf, tenant_id) - .await - .context("Failed to list local branch files for the tenant")?; - let local_branches_dir = conf.branches_path(&tenant_id); - if !local_branches_dir.exists() { - fs::create_dir_all(&local_branches_dir) - .await - .with_context(|| { - format!( - "Failed to create local branches directory at path '{}'", - local_branches_dir.display() - ) - })?; - } - - if let Some(remote_branches) = index.read().await.branch_files(tenant_id) { - let mut remote_only_branches_downloads = remote_branches - .difference(&local_branches) - .map(|remote_only_branch| async move { - let branches_dir = conf.branches_path(&tenant_id); - let remote_branch_path = remote_only_branch.as_path(&branches_dir); - let storage_path = - storage.storage_path(&remote_branch_path).with_context(|| { - format!( - "Failed to derive a storage path for branch with local path '{}'", - remote_branch_path.display() - ) - })?; - let mut target_file = fs::OpenOptions::new() - .write(true) - .create_new(true) - .open(&remote_branch_path) - .await - .with_context(|| { - format!( - "Failed to create local branch file at '{}'", - remote_branch_path.display() - ) - })?; - storage - .download(&storage_path, &mut target_file) - .await - .with_context(|| { - format!( - "Failed to download branch file from the remote path {:?}", - storage_path - ) - })?; - Ok::<_, anyhow::Error>(()) - }) - .collect::>(); - - let mut branch_downloads_failed = false; - while let Some(download_result) = remote_only_branches_downloads.next().await { - if let Err(e) = download_result { - branch_downloads_failed = true; - error!("Failed to download a branch file: {:?}", e); - } - } - ensure!( - !branch_downloads_failed, - "Failed to download all branch files" - ); - } - - Ok(()) -} - #[cfg(test)] mod tests { use std::collections::BTreeSet; diff --git a/pageserver/src/remote_storage/storage_sync/index.rs b/pageserver/src/remote_storage/storage_sync/index.rs index 3d2680948d..81c99754c9 100644 --- a/pageserver/src/remote_storage/storage_sync/index.rs +++ b/pageserver/src/remote_storage/storage_sync/index.rs @@ -5,7 +5,7 @@ //! This way in the future, the index could be restored fast from its serialized stored form. use std::{ - collections::{BTreeMap, BTreeSet, HashMap, HashSet}, + collections::{BTreeMap, BTreeSet, HashMap}, path::{Path, PathBuf}, }; @@ -49,10 +49,9 @@ impl RelativePath { } /// An index to track tenant files that exist on the remote storage. -/// Currently, timeline archives and branch files are tracked. +/// Currently, timeline archive files are tracked only. 
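+///
+/// A small usage sketch (construction of the index is elided here; `all_sync_ids`
+/// and `timeline_entry` are the accessors this module defines and its tests use):
+///
+///     for sync_id in index.all_sync_ids() {
+///         if let Some(entry) = index.timeline_entry(&sync_id) {
+///             // inspect the checkpoints known for this remote timeline
+///         }
+///     }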
#[derive(Debug, Clone)]
 pub struct RemoteTimelineIndex {
- branch_files: HashMap>,
 timeline_files: HashMap,
 }
 
@@ -65,7 +64,6 @@ impl RemoteTimelineIndex {
 paths: impl Iterator,
 ) -> Self {
 let mut index = Self {
- branch_files: HashMap::new(),
 timeline_files: HashMap::new(),
 };
 for path in paths {
@@ -98,17 +96,6 @@ impl RemoteTimelineIndex {
 pub fn all_sync_ids(&self) -> impl Iterator + '_ {
 self.timeline_files.keys().copied()
 }
-
- pub fn add_branch_file(&mut self, tenant_id: ZTenantId, path: RelativePath) {
- self.branch_files
- .entry(tenant_id)
- .or_insert_with(HashSet::new)
- .insert(path);
- }
-
- pub fn branch_files(&self, tenant_id: ZTenantId) -> Option<&HashSet> {
- self.branch_files.get(&tenant_id)
- }
 }
 
 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -306,20 +293,9 @@ fn try_parse_index_entry(
 .parse::()
 .with_context(|| format!("Failed to parse tenant id from path '{}'", path.display()))?;
 
- let branches_path = conf.branches_path(&tenant_id);
 let timelines_path = conf.timelines_path(&tenant_id);
 
- match (
- RelativePath::new(&branches_path, &path),
- path.strip_prefix(&timelines_path),
- ) {
- (Ok(_), Ok(_)) => bail!(
- "Path '{}' cannot start with both branches '{}' and the timelines '{}' prefixes",
- path.display(),
- branches_path.display(),
- timelines_path.display()
- ),
- (Ok(branches_entry), Err(_)) => index.add_branch_file(tenant_id, branches_entry),
- (Err(_), Ok(timelines_subpath)) => {
+ match path.strip_prefix(&timelines_path) {
+ Ok(timelines_subpath) => {
 let mut segments = timelines_subpath.iter();
 let timeline_id = segments
 .next()
@@ -375,11 +351,10 @@ fn try_parse_index_entry(
 }
 }
 }
- (Err(branches_error), Err(timelines_strip_error)) => {
+ Err(timelines_strip_error) => {
 bail!(
- "Path '{}' is not an index entry: it's neither parsable as a branch entry '{:#}' nor as an archive entry '{}'",
+ "Path '{}' is not an archive entry: {}",
 path.display(),
- branches_error,
 timelines_strip_error,
 )
 }
diff --git a/pageserver/src/remote_storage/storage_sync/upload.rs b/pageserver/src/remote_storage/storage_sync/upload.rs
index 0f57d714dd..d064039ecc 100644
--- a/pageserver/src/remote_storage/storage_sync/upload.rs
+++ b/pageserver/src/remote_storage/storage_sync/upload.rs
@@ -1,13 +1,10 @@
 //! Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints.
-//! Currently, tenant branch files are also uploaded, but this does not appear final.
 
 use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc};
 
-use anyhow::{ensure, Context};
-use futures::{stream::FuturesUnordered, StreamExt};
-use tokio::{fs, sync::RwLock};
+use anyhow::ensure;
+use tokio::sync::RwLock;
 use tracing::{debug, error, warn};
-use zenith_utils::zid::ZTenantId;
 
 use crate::{
 config::PageServerConf,
@@ -15,7 +12,7 @@ use crate::{
 storage_sync::{
 compression,
 index::{RemoteTimeline, TimelineIndexEntry},
- sync_queue, tenant_branch_files, update_index_description, SyncKind, SyncTask,
+ sync_queue, update_index_description, SyncKind, SyncTask,
 },
 RemoteStorage, ZTenantTimelineId,
 },
@@ -26,8 +23,6 @@ use super::{compression::ArchiveHeader, index::RemoteTimelineIndex, NewCheckpoin
 /// Attempts to compress and upload given checkpoint files.
 /// No extra checks for overlapping files are made: download takes care of that, ensuring no non-metadata local timeline files are overwritten.
 ///
-/// Before the checkpoint files are uploaded, branch files are uploaded, if any local ones are missing remotely.
-///
 /// On an error, bumps the retries count and reschedules the entire task.
 /// On success, populates index data with new downloads.
 pub(super) async fn upload_timeline_checkpoint<
@@ -41,19 +36,6 @@ pub(super) async fn upload_timeline_checkpoint<
 retries: u32,
 ) -> Option {
 debug!("Uploading checkpoint for sync id {}", sync_id);
- if let Err(e) = upload_missing_branches(config, remote_assets.as_ref(), sync_id.tenant_id).await
- {
- error!(
- "Failed to upload missing branches for sync id {}: {:?}",
- sync_id, e
- );
- sync_queue::push(SyncTask::new(
- sync_id,
- retries,
- SyncKind::Upload(new_checkpoint),
- ));
- return Some(false);
- }
 
 let new_upload_lsn = new_checkpoint.metadata.disk_consistent_lsn();
 let index = &remote_assets.1;
@@ -201,76 +183,6 @@ async fn try_upload_checkpoint<
 .map(|(header, header_size, _)| (header, header_size))
 }
 
-async fn upload_missing_branches<
- P: std::fmt::Debug + Send + Sync + 'static,
- S: RemoteStorage + Send + Sync + 'static,
->(
- config: &'static PageServerConf,
- (storage, index): &(S, RwLock),
- tenant_id: ZTenantId,
-) -> anyhow::Result<()> {
- let local_branches = tenant_branch_files(config, tenant_id)
- .await
- .context("Failed to list local branch files for the tenant")?;
- let index_read = index.read().await;
- let remote_branches = index_read
- .branch_files(tenant_id)
- .cloned()
- .unwrap_or_default();
- drop(index_read);
-
- let mut branch_uploads = local_branches
- .difference(&remote_branches)
- .map(|local_only_branch| async move {
- let local_branch_path = local_only_branch.as_path(&config.branches_path(&tenant_id));
- let storage_path = storage.storage_path(&local_branch_path).with_context(|| {
- format!(
- "Failed to derive a storage path for branch with local path '{}'",
- local_branch_path.display()
- )
- })?;
- let local_branch_file = fs::OpenOptions::new()
- .read(true)
- .open(&local_branch_path)
- .await
- .with_context(|| {
- format!(
- "Failed to open local branch file {} for reading",
- local_branch_path.display()
- )
- })?;
- storage
- .upload(local_branch_file, &storage_path)
- .await
- .with_context(|| {
- format!(
- "Failed to upload branch file to the remote path {:?}",
- storage_path
- )
- })?;
- Ok::<_, anyhow::Error>(local_only_branch)
- })
- .collect::>();
-
- let mut branch_uploads_failed = false;
- while let Some(upload_result) = branch_uploads.next().await {
- match upload_result {
- Ok(local_only_branch) => index
- .write()
- .await
- .add_branch_file(tenant_id, local_only_branch.clone()),
- Err(e) => {
- error!("Failed to upload branch file: {:?}", e);
- branch_uploads_failed = true;
- }
- }
- }
-
- ensure!(!branch_uploads_failed, "Failed to upload all branch files");
-
- Ok(())
-}
-
 #[cfg(test)]
 mod tests {
 use tempfile::tempdir;
diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs
index 6142953a58..be937b8d26 100644
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -36,6 +36,10 @@ pub trait Repository: Send + Sync {
 /// Get Timeline handle for given zenith timeline ID.
 fn get_timeline(&self, timelineid: ZTimelineId) -> Result;
 
+ /// Lists timelines the repository contains.
+ /// It is up to the repository's implementation to omit timelines that are not considered ready for use.
+ fn list_timelines(&self) -> Result>;
+
 /// Create a new, empty timeline. The caller is responsible for loading data into it
 /// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
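 // A sketch, not part of this patch, of how a caller is expected to use the new
 // list_timelines() above: the listing mixes local and remote timelines, so
 // callers filter by locality themselves, e.g. keeping only in-memory ones:
 //
 //     let local: Vec<_> = repo
 //         .list_timelines()?
 //         .into_iter()
 //         .filter_map(|t| t.local_timeline())
 //         .collect();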
fn create_empty_timeline( @@ -72,7 +76,10 @@ pub trait Repository: Send + Sync { pub enum RepositoryTimeline { /// Timeline, with its files present locally in pageserver's working directory. /// Loaded into pageserver's memory and ready to be used. - Local(Arc), + Local { + id: ZTimelineId, + timeline: Arc, + }, /// Timeline, found on the pageserver's remote storage, but not yet downloaded locally. Remote { id: ZTimelineId, @@ -83,17 +90,24 @@ pub enum RepositoryTimeline { impl RepositoryTimeline { pub fn local_timeline(&self) -> Option> { - if let Self::Local(local_timeline) = self { - Some(Arc::clone(local_timeline)) + if let Self::Local { timeline, .. } = self { + Some(Arc::clone(timeline)) } else { None } } + + pub fn id(&self) -> ZTimelineId { + match self { + Self::Local { id, .. } => *id, + Self::Remote { id, .. } => *id, + } + } } /// A state of the timeline synchronization with the remote storage. /// Contains `disk_consistent_lsn` of the corresponding remote timeline (latest checkpoint's disk_consistent_lsn). -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum TimelineSyncState { /// No further downloads from the remote storage are needed. /// The timeline state is up-to-date or ahead of the remote storage one, @@ -390,7 +404,6 @@ pub mod repo_harness { let tenant_id = ZTenantId::generate(); fs::create_dir_all(conf.tenant_path(&tenant_id))?; - fs::create_dir_all(conf.branches_path(&tenant_id))?; Ok(Self { conf, tenant_id }) } diff --git a/pageserver/src/tenant_mgr.rs b/pageserver/src/tenant_mgr.rs index d60b5fefd3..568088fc1d 100644 --- a/pageserver/src/tenant_mgr.rs +++ b/pageserver/src/tenant_mgr.rs @@ -1,19 +1,19 @@ //! This module acts as a switchboard to access different repositories managed by this //! page server. -use crate::branches; use crate::config::PageServerConf; use crate::layered_repository::LayeredRepository; use crate::repository::{Repository, Timeline, TimelineSyncState}; use crate::thread_mgr; use crate::thread_mgr::ThreadKind; +use crate::timelines; use crate::walredo::PostgresRedoManager; use crate::CheckpointConfig; -use anyhow::{bail, Context, Result}; +use anyhow::{Context, Result}; use lazy_static::lazy_static; use log::*; use serde::{Deserialize, Serialize}; -use std::collections::{hash_map, HashMap}; +use std::collections::HashMap; use std::fmt; use std::sync::{Arc, Mutex, MutexGuard}; use zenith_utils::zid::{ZTenantId, ZTimelineId}; @@ -177,24 +177,27 @@ pub fn shutdown_all_tenants() { } } -pub fn create_repository_for_tenant( +pub fn create_tenant_repository( conf: &'static PageServerConf, - tenantid: ZTenantId, -) -> Result<()> { - let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid)); - let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?; - - match access_tenants().entry(tenantid) { - hash_map::Entry::Occupied(_) => bail!("tenant {} already exists", tenantid), - hash_map::Entry::Vacant(v) => { - v.insert(Tenant { - state: TenantState::Idle, - repo, - }); + new_tenant_id: Option, +) -> Result> { + let new_tenant_id = new_tenant_id.unwrap_or_else(ZTenantId::generate); + let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, new_tenant_id)); + match timelines::create_repo(conf, new_tenant_id, wal_redo_manager)? 
{ + Some(repo) => { + access_tenants() + .entry(new_tenant_id) + .or_insert_with(|| Tenant { + state: TenantState::Idle, + repo, + }); + Ok(Some(new_tenant_id)) + } + None => { + debug!("repository already exists for tenant {}", new_tenant_id); + Ok(None) } } - - Ok(()) } pub fn get_tenant_state(tenantid: ZTenantId) -> Option { diff --git a/pageserver/src/timelines.rs b/pageserver/src/timelines.rs new file mode 100644 index 0000000000..4de131ef70 --- /dev/null +++ b/pageserver/src/timelines.rs @@ -0,0 +1,408 @@ +//! +//! Timeline management code +// + +use anyhow::{anyhow, bail, Context, Result}; +use postgres_ffi::ControlFileData; +use std::{ + fs, + path::Path, + process::{Command, Stdio}, + sync::Arc, +}; +use tracing::*; + +use zenith_utils::lsn::Lsn; +use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use zenith_utils::{crashsafe_dir, logging}; + +use crate::{config::PageServerConf, repository::Repository}; +use crate::{import_datadir, LOG_FILE_NAME}; +use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager}; +use crate::{repository::RepositoryTimeline, tenant_mgr}; +use crate::{repository::Timeline, CheckpointConfig}; + +#[derive(Clone)] +pub enum TimelineInfo { + Local { + timeline_id: ZTimelineId, + tenant_id: ZTenantId, + last_record_lsn: Lsn, + prev_record_lsn: Lsn, + ancestor_timeline_id: Option, + ancestor_lsn: Option, + disk_consistent_lsn: Lsn, + current_logical_size: usize, + current_logical_size_non_incremental: Option, + }, + Remote { + timeline_id: ZTimelineId, + tenant_id: ZTenantId, + disk_consistent_lsn: Lsn, + }, +} + +impl TimelineInfo { + pub fn from_repo_timeline( + tenant_id: ZTenantId, + repo_timeline: RepositoryTimeline, + include_non_incremental_logical_size: bool, + ) -> Self { + match repo_timeline { + RepositoryTimeline::Local { id, timeline } => { + let ancestor_timeline_id = timeline.get_ancestor_timeline_id(); + let ancestor_lsn = if ancestor_timeline_id.is_some() { + Some(timeline.get_ancestor_lsn()) + } else { + None + }; + + Self::Local { + timeline_id: id, + tenant_id, + last_record_lsn: timeline.get_last_record_lsn(), + prev_record_lsn: timeline.get_prev_record_lsn(), + ancestor_timeline_id, + ancestor_lsn, + disk_consistent_lsn: timeline.get_disk_consistent_lsn(), + current_logical_size: timeline.get_current_logical_size(), + current_logical_size_non_incremental: get_current_logical_size_non_incremental( + include_non_incremental_logical_size, + timeline.as_ref(), + ), + } + } + RepositoryTimeline::Remote { + id, + disk_consistent_lsn, + } => Self::Remote { + timeline_id: id, + tenant_id, + disk_consistent_lsn, + }, + } + } + + pub fn from_dyn_timeline( + tenant_id: ZTenantId, + timeline_id: ZTimelineId, + timeline: &dyn Timeline, + include_non_incremental_logical_size: bool, + ) -> Self { + let ancestor_timeline_id = timeline.get_ancestor_timeline_id(); + let ancestor_lsn = if ancestor_timeline_id.is_some() { + Some(timeline.get_ancestor_lsn()) + } else { + None + }; + + Self::Local { + timeline_id, + tenant_id, + last_record_lsn: timeline.get_last_record_lsn(), + prev_record_lsn: timeline.get_prev_record_lsn(), + ancestor_timeline_id, + ancestor_lsn, + disk_consistent_lsn: timeline.get_disk_consistent_lsn(), + current_logical_size: timeline.get_current_logical_size(), + current_logical_size_non_incremental: get_current_logical_size_non_incremental( + include_non_incremental_logical_size, + timeline, + ), + } + } + + pub fn timeline_id(&self) -> ZTimelineId { + match *self { + TimelineInfo::Local { timeline_id, .. 
} => timeline_id,
+ TimelineInfo::Remote { timeline_id, .. } => timeline_id,
+ }
+ }
+
+ pub fn tenant_id(&self) -> ZTenantId {
+ match *self {
+ TimelineInfo::Local { tenant_id, .. } => tenant_id,
+ TimelineInfo::Remote { tenant_id, .. } => tenant_id,
+ }
+ }
+}
+
+fn get_current_logical_size_non_incremental(
+ include_non_incremental_logical_size: bool,
+ timeline: &dyn Timeline,
+) -> Option {
+ if !include_non_incremental_logical_size {
+ return None;
+ }
+ match timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn()) {
+ Ok(size) => Some(size),
+ Err(e) => {
+ error!("Failed to get non-incremental logical size: {:?}", e);
+ None
+ }
+ }
+}
+
+#[derive(Debug, Clone, Copy)]
+pub struct PointInTime {
+ pub timeline_id: ZTimelineId,
+ pub lsn: Lsn,
+}
+
+pub fn init_pageserver(
+ conf: &'static PageServerConf,
+ create_tenant: Option,
+ initial_timeline_id: Option,
+) -> anyhow::Result<()> {
+ // Initialize logger
+ // pass true as the daemonize parameter: otherwise we pollute the zenith CLI output with several pages of info messages
+ let _log_file = logging::init(LOG_FILE_NAME, true)?;
+
+ // We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
+ // process during repository initialization.
+ //
+ // FIXME: That caused trouble, because the WAL redo manager spawned a thread that launched
+ // initdb in the background, and it kept running even after the "zenith init" had exited.
+ // In tests, we started the page server immediately after that, so that initdb was still
+ // running in the background, and we failed to run initdb again in the same directory. This
+ // has been solved for the rapid init+start case now, but the general race condition remains
+ // if you restart the server quickly. The WAL redo manager doesn't use a separate thread
+ // anymore, but I think that could still happen.
+ let dummy_redo_mgr = Arc::new(crate::walredo::DummyRedoManager {});
+
+ crashsafe_dir::create_dir_all(conf.tenants_path())?;
+
+ if let Some(tenant_id) = create_tenant {
+ println!("initializing tenantid {}", tenant_id);
+ let repo = create_repo(conf, tenant_id, dummy_redo_mgr)
+ .context("failed to create repo")?
+ .ok_or_else(|| anyhow!("For newly created pageserver, found already existing repository for tenant {}", tenant_id))?;
+ let new_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate);
+ bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())
+ .context("failed to create initial timeline")?;
+ println!("initial timeline {} created", new_timeline_id)
+ } else if initial_timeline_id.is_some() {
+ println!("Ignoring the initial timeline parameter: no tenant id to create was given");
+ }
+
+ println!("pageserver init succeeded");
+ Ok(())
+}
+
+pub fn create_repo(
+ conf: &'static PageServerConf,
+ tenant_id: ZTenantId,
+ wal_redo_manager: Arc,
+) -> Result>> {
+ let repo_dir = conf.tenant_path(&tenant_id);
+ if repo_dir.exists() {
+ debug!("repo for {} already exists", tenant_id);
+ return Ok(None);
+ }
+
+ // top-level dir may exist if we are creating it through CLI
+ crashsafe_dir::create_dir_all(&repo_dir)
+ .with_context(|| format!("could not create directory {}", repo_dir.display()))?;
+ crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?;
+ info!("created directory structure in {}", repo_dir.display());
+
+ Ok(Some(Arc::new(LayeredRepository::new(
+ conf,
+ wal_redo_manager,
+ tenant_id,
+ conf.remote_storage_config.is_some(),
+ ))))
+}
+
+// Returns checkpoint LSN from controlfile
+fn get_lsn_from_controlfile(path: &Path) -> Result {
+ // Read control file to extract the LSN
+ let controlfile_path = path.join("global").join("pg_control");
+ let controlfile = ControlFileData::decode(&fs::read(controlfile_path)?)?;
+ let lsn = controlfile.checkPoint;
+
+ Ok(Lsn(lsn))
+}
+
+// Create the cluster temporarily in 'initdbpath' directory inside the repository
+// to get bootstrap data for timeline initialization.
+//
+fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
+ info!("running initdb in {}... ", initdbpath.display());
+
+ let initdb_path = conf.pg_bin_dir().join("initdb");
+ let initdb_output = Command::new(initdb_path)
+ .args(&["-D", initdbpath.to_str().unwrap()])
+ .args(&["-U", &conf.superuser])
+ .args(&["-E", "utf8"])
+ .arg("--no-instructions")
+ // This is only used for a temporary installation that is deleted shortly after,
+ // so no need to fsync it
+ .arg("--no-sync")
+ .env_clear()
+ .env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
+ .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
+ .stdout(Stdio::null())
+ .output()
+ .context("failed to execute initdb")?;
+ if !initdb_output.status.success() {
+ bail!(
+ "initdb failed: '{}'",
+ String::from_utf8_lossy(&initdb_output.stderr)
+ );
+ }
+
+ Ok(())
+}
+
+//
+// - run initdb to init a temporary instance and get bootstrap data
+// - after initialization is complete, remove the temp dir.
+//
+fn bootstrap_timeline(
+ conf: &'static PageServerConf,
+ tenantid: ZTenantId,
+ tli: ZTimelineId,
+ repo: &dyn Repository,
+) -> Result> {
+ let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
+
+ let initdb_path = conf.tenant_path(&tenantid).join("tmp");
+
+ // Init a temporary repo to get bootstrap data
+ run_initdb(conf, &initdb_path)?;
+ let pgdata_path = initdb_path;
+
+ let lsn = get_lsn_from_controlfile(&pgdata_path)?.align();
+
+ // Import the contents of the data directory at the initial checkpoint
+ // LSN, and any WAL after that.
+ // Initdb lsn will be equal to last_record_lsn which will be set after import.
+ // Because we know it upfront, pass it to create_empty_timeline to avoid an Option or a dummy zero value.
+ let timeline = repo.create_empty_timeline(tli, lsn)?;
+ import_datadir::import_timeline_from_postgres_datadir(
+ &pgdata_path,
+ timeline.writer().as_ref(),
+ lsn,
+ )?;
+ timeline.checkpoint(CheckpointConfig::Forced)?;
+
+ println!(
+ "created initial timeline {} timeline.lsn {}",
+ tli,
+ timeline.get_last_record_lsn()
+ );
+
+ // Remove temp dir. We don't need it anymore
+ fs::remove_dir_all(pgdata_path)?;
+
+ Ok(timeline)
+}
+
+pub(crate) fn get_timelines(
+ tenant_id: ZTenantId,
+ include_non_incremental_logical_size: bool,
+) -> Result> {
+ let repo = tenant_mgr::get_repository_for_tenant(tenant_id)
+ .with_context(|| format!("Failed to get repo for tenant {}", tenant_id))?;
+
+ Ok(repo
+ .list_timelines()
+ .with_context(|| format!("Failed to list timelines for tenant {}", tenant_id))?
+ .into_iter()
+ .filter_map(|timeline| match timeline {
+ RepositoryTimeline::Local { timeline, id } => Some((id, timeline)),
+ RepositoryTimeline::Remote { .. } => None,
+ })
+ .map(|(timeline_id, timeline)| {
+ TimelineInfo::from_dyn_timeline(
+ tenant_id,
+ timeline_id,
+ timeline.as_ref(),
+ include_non_incremental_logical_size,
+ )
+ })
+ .collect())
+}
+
+pub(crate) fn create_timeline(
+ conf: &'static PageServerConf,
+ tenant_id: ZTenantId,
+ new_timeline_id: Option,
+ ancestor_timeline_id: Option,
+ ancestor_start_lsn: Option,
+) -> Result> {
+ let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate);
+ let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
+
+ if conf.timeline_path(&new_timeline_id, &tenant_id).exists() {
+ match repo.get_timeline(new_timeline_id)? {
+ RepositoryTimeline::Local { id, .. } => {
+ debug!("timeline {} already exists", id);
+ return Ok(None);
+ }
+ RepositoryTimeline::Remote { id, .. } => bail!(
+ "timeline {} already exists in pageserver's remote storage",
+ id
+ ),
+ }
+ }
+
+ let mut start_lsn = ancestor_start_lsn.unwrap_or(Lsn(0));
+
+ let new_timeline_info = match ancestor_timeline_id {
+ Some(ancestor_timeline_id) => {
+ let ancestor_timeline = repo
+ .get_timeline(ancestor_timeline_id)
+ .with_context(|| format!("Cannot get ancestor timeline {}", ancestor_timeline_id))?
+ .local_timeline()
+ .with_context(|| {
+ format!(
+ "Cannot branch off the timeline {} that's not present locally",
+ ancestor_timeline_id
+ )
+ })?;
+
+ if start_lsn == Lsn(0) {
+ // Find end of WAL on the old timeline
+ let end_of_wal = ancestor_timeline.get_last_record_lsn();
+ info!("branching at end of WAL: {}", end_of_wal);
+ start_lsn = end_of_wal;
+ } else {
+ // Wait for the WAL to arrive and be processed on the parent branch up
+ // to the requested branch point. The repository code itself doesn't
+ // require it, but if we start to receive WAL on the new timeline,
+ // decoding the new WAL might need to look up previous pages, relation
+ // sizes etc. and that would get confused if the previous page versions
+ // are not in the repository yet.
+ ancestor_timeline.wait_lsn(start_lsn)?;
+ }
+ start_lsn = start_lsn.align();
+
+ let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
+ if ancestor_ancestor_lsn > start_lsn {
+ // can we safely just branch from the ancestor instead?
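 // A sketch of the invariant being enforced, with hypothetical LSNs (not part
 // of this patch): if the ancestor was itself branched off at 0/2000000, its
 // own history only starts there, so a request for start_lsn = 0/1000000 would
 // point at WAL the ancestor never had:
 //
 //     grandparent --0/2000000--> ancestor --0/1000000?--> new timeline
 //
 // which is why any start_lsn below ancestor_ancestor_lsn is rejected below.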
+ anyhow::bail!(
+ "invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
+ start_lsn,
+ ancestor_timeline_id,
+ ancestor_ancestor_lsn,
+ );
+ }
+ repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?;
+ // load the timeline into memory
+ let loaded_timeline = repo.get_timeline(new_timeline_id)?;
+ TimelineInfo::from_repo_timeline(tenant_id, loaded_timeline, false)
+ }
+ None => {
+ let new_timeline = bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?;
+ TimelineInfo::from_dyn_timeline(
+ tenant_id,
+ new_timeline_id,
+ new_timeline.as_ref(),
+ false,
+ )
+ }
+ };
+ Ok(Some(new_timeline_info))
+}
diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs
index 378a015d4a..ca9107cdbf 100644
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -268,12 +268,11 @@ impl XlXactParsedRecord {
 let info = xl_info & pg_constants::XLOG_XACT_OPMASK;
 // The record starts with time of commit/abort
 let xact_time = buf.get_i64_le();
- let xinfo;
- if xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
- xinfo = buf.get_u32_le();
+ let xinfo = if xl_info & pg_constants::XLOG_XACT_HAS_INFO != 0 {
+ buf.get_u32_le()
 } else {
- xinfo = 0;
- }
+ 0
+ };
 let db_id;
 let ts_id;
 if xinfo & pg_constants::XACT_XINFO_HAS_DBINFO != 0 {
@@ -502,7 +501,6 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
 0..=pg_constants::XLR_MAX_BLOCK_ID => {
 /* XLogRecordBlockHeader */
 let mut blk = DecodedBkpBlock::new();
- let fork_flags: u8;
 if block_id <= max_block_id {
 // TODO
@@ -515,7 +513,7 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
 }
 max_block_id = block_id;
- fork_flags = buf.get_u8();
+ let fork_flags: u8 = buf.get_u8();
 blk.forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK;
 blk.flags = fork_flags;
 blk.has_image = (fork_flags & pg_constants::BKPBLOCK_HAS_IMAGE) != 0;
diff --git a/postgres_ffi/src/xlog_utils.rs b/postgres_ffi/src/xlog_utils.rs
index caf1940a9c..d2b2b5c122 100644
--- a/postgres_ffi/src/xlog_utils.rs
+++ b/postgres_ffi/src/xlog_utils.rs
@@ -132,6 +132,8 @@ pub fn get_current_timestamp() -> TimestampTz {
 }
 }
 
+/// Return the offset of the last valid record in segment `segno`, starting
+/// the search at `start_offset`. Returns `start_offset` if no records are found.
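 // A hedged usage sketch (the exact signature is truncated in this hunk; the
 // argument order below is assumed from the function body):
 //
 //     let end = find_end_of_wal_segment(data_dir, segno, tli, wal_seg_size, start_offset)?;
 //     assert!(end >= start_offset); // with this change the result never falls below start_offset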
fn find_end_of_wal_segment(
 data_dir: &Path,
 segno: XLogSegNo,
@@ -147,7 +149,7 @@ fn find_end_of_wal_segment(
 let mut rec_offs: usize = 0;
 let mut buf = [0u8; XLOG_BLCKSZ];
 let file_name = XLogFileName(tli, segno, wal_seg_size);
- let mut last_valid_rec_pos: usize = 0;
+ let mut last_valid_rec_pos: usize = start_offset; // assume a new record begins at the given start_offset
 let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap();
 file.seek(SeekFrom::Start(offs as u64))?;
 let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS];
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index d8d5cbe5bf..dda018a1d8 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -7,6 +7,7 @@ edition = "2021"
 anyhow = "1.0"
 bytes = { version = "1.0.1", features = ['serde'] }
 clap = "3.0"
+fail = "0.5.0"
 futures = "0.3.13"
 hashbrown = "0.11.2"
 hex = "0.4.3"
@@ -21,6 +22,7 @@ rustls = "0.19.1"
 scopeguard = "1.1.0"
 serde = "1"
 serde_json = "1"
+thiserror = "1.0"
 tokio = { version = "1.11", features = ["macros"] }
 tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
 tokio-rustls = "0.22.0"
diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs
index a5bdaeaeca..5e6357fe80 100644
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -1,11 +1,79 @@
 use crate::compute::DatabaseInfo;
 use crate::config::ProxyConfig;
 use crate::cplane_api::{self, CPlaneApi};
+use crate::error::UserFacingError;
 use crate::stream::PqStream;
-use anyhow::{anyhow, bail, Context};
+use crate::waiters;
 use std::collections::HashMap;
+use thiserror::Error;
 use tokio::io::{AsyncRead, AsyncWrite};
-use zenith_utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage, FeMessage as Fe};
+use zenith_utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage};
+
+/// Common authentication error.
+#[derive(Debug, Error)]
+pub enum AuthErrorImpl {
+ /// Authentication error reported by the console.
+ #[error(transparent)]
+ Console(#[from] cplane_api::AuthError),
+
+ /// For passwords that couldn't be processed by [`parse_password`].
+ #[error("Malformed password message")]
+ MalformedPassword,
+
+ /// Errors produced by [`PqStream`].
+ #[error(transparent)]
+ Io(#[from] std::io::Error),
+}
+
+impl AuthErrorImpl {
+ pub fn auth_failed(msg: impl Into) -> Self {
+ AuthErrorImpl::Console(cplane_api::AuthError::auth_failed(msg))
+ }
+}
+
+impl From for AuthErrorImpl {
+ fn from(e: waiters::RegisterError) -> Self {
+ AuthErrorImpl::Console(cplane_api::AuthError::from(e))
+ }
+}
+
+impl From for AuthErrorImpl {
+ fn from(e: waiters::WaitError) -> Self {
+ AuthErrorImpl::Console(cplane_api::AuthError::from(e))
+ }
+}
+
+#[derive(Debug, Error)]
+#[error(transparent)]
+pub struct AuthError(Box);
+
+impl From for AuthError
+where
+ AuthErrorImpl: From,
+{
+ fn from(e: T) -> Self {
+ AuthError(Box::new(e.into()))
+ }
+}
+
+impl UserFacingError for AuthError {
+ fn to_string_client(&self) -> String {
+ use AuthErrorImpl::*;
+ match self.0.as_ref() {
+ Console(e) => e.to_string_client(),
+ MalformedPassword => self.to_string(),
+ _ => "Internal error".to_string(),
+ }
+ }
+}
+
+#[derive(Debug, Error)]
+pub enum ClientCredsParseError {
+ #[error("Parameter `{0}` is missing in startup packet")]
+ MissingKey(&'static str),
+}
+
+impl UserFacingError for ClientCredsParseError {}
 
 /// Various client credentials which we use for authentication.
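 // A sketch of the new parse path (assumption: "database" is the next required
 // key after "user", matching the cplane query parameters elsewhere in this patch):
 //
 //     let mut params = HashMap::new();
 //     params.insert("user".to_owned(), "alice".to_owned());
 //     let err = ClientCredentials::try_from(params).unwrap_err();
 //     assert!(matches!(err, ClientCredsParseError::MissingKey("database")));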
#[derive(Debug, PartialEq, Eq)] @@ -15,13 +83,13 @@ pub struct ClientCredentials { } impl TryFrom> for ClientCredentials { - type Error = anyhow::Error; + type Error = ClientCredsParseError; fn try_from(mut value: HashMap) -> Result { let mut get_param = |key| { value .remove(key) - .with_context(|| format!("{} is missing in startup packet", key)) + .ok_or(ClientCredsParseError::MissingKey(key)) }; let user = get_param("user")?; @@ -37,10 +105,14 @@ impl ClientCredentials { self, config: &ProxyConfig, client: &mut PqStream, - ) -> anyhow::Result { + ) -> Result { + fail::fail_point!("proxy-authenticate", |_| { + Err(AuthError::auth_failed("failpoint triggered")) + }); + use crate::config::ClientAuthMethod::*; use crate::config::RouterConfig::*; - let db_info = match &config.router_config { + match &config.router_config { Static { host, port } => handle_static(host.clone(), *port, client, self).await, Dynamic(Mixed) => { if self.user.ends_with("@zenith") { @@ -51,9 +123,7 @@ impl ClientCredentials { } Dynamic(Password) => handle_existing_user(config, client, self).await, Dynamic(Link) => handle_new_user(config, client).await, - }; - - db_info.context("failed to authenticate client") + } } } @@ -66,18 +136,14 @@ async fn handle_static( port: u16, client: &mut PqStream, creds: ClientCredentials, -) -> anyhow::Result { +) -> Result { client .write_message(&Be::AuthenticationCleartextPassword) .await?; // Read client's password bytes - let msg = match client.read_message().await? { - Fe::PasswordMessage(msg) => msg, - bad => bail!("unexpected message type: {:?}", bad), - }; - - let cleartext_password = std::str::from_utf8(&msg)?.split('\0').next().unwrap(); + let msg = client.read_password_message().await?; + let cleartext_password = parse_password(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; let db_info = DatabaseInfo { host, @@ -98,7 +164,7 @@ async fn handle_existing_user( config: &ProxyConfig, client: &mut PqStream, creds: ClientCredentials, -) -> anyhow::Result { +) -> Result { let psql_session_id = new_psql_session_id(); let md5_salt = rand::random(); @@ -107,18 +173,12 @@ async fn handle_existing_user( .await?; // Read client's password hash - let msg = match client.read_message().await? 
{ - Fe::PasswordMessage(msg) => msg, - bad => bail!("unexpected message type: {:?}", bad), - }; + let msg = client.read_password_message().await?; + let md5_response = parse_password(&msg).ok_or(AuthErrorImpl::MalformedPassword)?; - let (_trailing_null, md5_response) = msg - .split_last() - .ok_or_else(|| anyhow!("unexpected password message"))?; - - let cplane = CPlaneApi::new(&config.auth_endpoint); + let cplane = CPlaneApi::new(config.auth_endpoint.clone()); let db_info = cplane - .authenticate_proxy_request(creds, md5_response, &md5_salt, &psql_session_id) + .authenticate_proxy_client(creds, md5_response, &md5_salt, &psql_session_id) .await?; client @@ -131,7 +191,7 @@ async fn handle_existing_user( async fn handle_new_user( config: &ProxyConfig, client: &mut PqStream, -) -> anyhow::Result { +) -> Result { let psql_session_id = new_psql_session_id(); let greeting = hello_message(&config.redirect_uri, &psql_session_id); @@ -143,8 +203,8 @@ async fn handle_new_user( .write_message(&Be::NoticeResponse(greeting)) .await?; - // Wait for web console response - waiter.await?.map_err(|e| anyhow!(e)) + // Wait for web console response (see `mgmt`) + waiter.await?.map_err(AuthErrorImpl::auth_failed) }) .await?; @@ -153,6 +213,10 @@ async fn handle_new_user( Ok(db_info) } +fn parse_password(bytes: &[u8]) -> Option<&str> { + std::str::from_utf8(bytes).ok()?.strip_suffix('\0') +} + fn hello_message(redirect_uri: &str, session_id: &str) -> String { format!( concat![ diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index c1a7e81be9..07d3bcc71a 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -6,7 +6,7 @@ use tokio::net::TcpStream; use tokio_postgres::{CancelToken, NoTls}; use zenith_utils::pq_proto::CancelKeyData; -/// Enables serving CancelRequests. +/// Enables serving `CancelRequest`s. #[derive(Default)] pub struct CancelMap(Mutex>>); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 7c294bd488..64ce5d0a5a 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,6 +1,27 @@ -use anyhow::Context; +use crate::cancellation::CancelClosure; +use crate::error::UserFacingError; use serde::{Deserialize, Serialize}; -use std::net::{SocketAddr, ToSocketAddrs}; +use std::io; +use std::net::SocketAddr; +use thiserror::Error; +use tokio::net::TcpStream; +use tokio_postgres::NoTls; + +#[derive(Debug, Error)] +pub enum ConnectionError { + /// This error doesn't seem to reveal any secrets; for instance, + /// [`tokio_postgres::error::Kind`] doesn't contain ip addresses and such. + #[error("Failed to connect to the compute node: {0}")] + Postgres(#[from] tokio_postgres::Error), + + #[error("Failed to connect to the compute node")] + FailedToConnectToCompute, + + #[error("Failed to fetch compute node version")] + FailedToFetchPgVersion, +} + +impl UserFacingError for ConnectionError {} /// Compute node connection params. #[derive(Serialize, Deserialize, Debug, Default)] @@ -12,14 +33,38 @@ pub struct DatabaseInfo { pub password: Option, } +/// PostgreSQL version as [`String`]. +pub type Version = String; + impl DatabaseInfo { - pub fn socket_addr(&self) -> anyhow::Result { + async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> { let host_port = format!("{}:{}", self.host, self.port); - host_port - .to_socket_addrs() - .with_context(|| format!("cannot resolve {} to SocketAddr", host_port))? 
- .next() - .context("cannot resolve at least one SocketAddr") + let socket = TcpStream::connect(host_port).await?; + let socket_addr = socket.peer_addr()?; + + Ok((socket_addr, socket)) + } + + /// Connect to a corresponding compute node. + pub async fn connect(self) -> Result<(TcpStream, Version, CancelClosure), ConnectionError> { + let (socket_addr, mut socket) = self + .connect_raw() + .await + .map_err(|_| ConnectionError::FailedToConnectToCompute)?; + + // TODO: establish a secure connection to the DB + let (client, conn) = tokio_postgres::Config::from(self) + .connect_raw(&mut socket, NoTls) + .await?; + + let version = conn + .parameter("server_version") + .ok_or(ConnectionError::FailedToFetchPgVersion)? + .into(); + + let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); + + Ok((socket, version, cancel_closure)) } } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 9ab64db795..077ff02898 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, ensure, Context}; +use anyhow::{anyhow, bail, ensure, Context}; use rustls::{internal::pemfile, NoClientAuth, ProtocolVersion, ServerConfig}; use std::net::SocketAddr; use std::str::FromStr; @@ -29,7 +29,7 @@ impl FromStr for ClientAuthMethod { "password" => Ok(Password), "link" => Ok(Link), "mixed" => Ok(Mixed), - _ => Err(anyhow::anyhow!("Invlid option for router")), + _ => bail!("Invalid option for router: `{}`", s), } } } @@ -53,7 +53,7 @@ pub struct ProxyConfig { pub redirect_uri: String, /// control plane address where we would check auth. - pub auth_endpoint: String, + pub auth_endpoint: reqwest::Url, pub tls_config: Option, } diff --git a/proxy/src/cplane_api.rs b/proxy/src/cplane_api.rs index 187809717f..21fce79df3 100644 --- a/proxy/src/cplane_api.rs +++ b/proxy/src/cplane_api.rs @@ -1,52 +1,113 @@ use crate::auth::ClientCredentials; use crate::compute::DatabaseInfo; -use crate::waiters::{Waiter, Waiters}; -use anyhow::{anyhow, bail}; +use crate::error::UserFacingError; +use crate::mgmt; +use crate::waiters::{self, Waiter, Waiters}; use lazy_static::lazy_static; use serde::{Deserialize, Serialize}; +use thiserror::Error; lazy_static! { - static ref CPLANE_WAITERS: Waiters> = Default::default(); + static ref CPLANE_WAITERS: Waiters = Default::default(); } /// Give caller an opportunity to wait for cplane's reply. -pub async fn with_waiter(psql_session_id: impl Into, f: F) -> anyhow::Result +pub async fn with_waiter( + psql_session_id: impl Into, + action: impl FnOnce(Waiter<'static, mgmt::ComputeReady>) -> R, +) -> Result where - F: FnOnce(Waiter<'static, Result>) -> R, - R: std::future::Future>, + R: std::future::Future>, + E: From, { let waiter = CPLANE_WAITERS.register(psql_session_id.into())?; - f(waiter).await + action(waiter).await } -pub fn notify(psql_session_id: &str, msg: Result) -> anyhow::Result<()> { +pub fn notify( + psql_session_id: &str, + msg: Result, +) -> Result<(), waiters::NotifyError> { CPLANE_WAITERS.notify(psql_session_id, msg) } /// Zenith console API wrapper. -pub struct CPlaneApi<'a> { - auth_endpoint: &'a str, +pub struct CPlaneApi { + auth_endpoint: reqwest::Url, } -impl<'a> CPlaneApi<'a> { - pub fn new(auth_endpoint: &'a str) -> Self { +impl CPlaneApi { + pub fn new(auth_endpoint: reqwest::Url) -> Self { Self { auth_endpoint } } } -impl CPlaneApi<'_> { - pub async fn authenticate_proxy_request( +#[derive(Debug, Error)] +pub enum AuthErrorImpl { + /// Authentication error reported by the console. 
+ #[error("Authentication failed: {0}")] + AuthFailed(String), + + /// HTTP status (other than 200) returned by the console. + #[error("Console responded with an HTTP status: {0}")] + HttpStatus(reqwest::StatusCode), + + #[error("Console responded with a malformed JSON: {0}")] + MalformedResponse(#[from] serde_json::Error), + + #[error(transparent)] + Transport(#[from] reqwest::Error), + + #[error(transparent)] + WaiterRegister(#[from] waiters::RegisterError), + + #[error(transparent)] + WaiterWait(#[from] waiters::WaitError), +} + +#[derive(Debug, Error)] +#[error(transparent)] +pub struct AuthError(Box); + +impl AuthError { + /// Smart constructor for authentication error reported by `mgmt`. + pub fn auth_failed(msg: impl Into) -> Self { + AuthError(Box::new(AuthErrorImpl::AuthFailed(msg.into()))) + } +} + +impl From for AuthError +where + AuthErrorImpl: From, +{ + fn from(e: T) -> Self { + AuthError(Box::new(e.into())) + } +} + +impl UserFacingError for AuthError { + fn to_string_client(&self) -> String { + use AuthErrorImpl::*; + match self.0.as_ref() { + AuthFailed(_) | HttpStatus(_) => self.to_string(), + _ => "Internal error".to_string(), + } + } +} + +impl CPlaneApi { + pub async fn authenticate_proxy_client( &self, creds: ClientCredentials, - md5_response: &[u8], + md5_response: &str, salt: &[u8; 4], psql_session_id: &str, - ) -> anyhow::Result { - let mut url = reqwest::Url::parse(self.auth_endpoint)?; + ) -> Result { + let mut url = self.auth_endpoint.clone(); url.query_pairs_mut() .append_pair("login", &creds.user) .append_pair("database", &creds.dbname) - .append_pair("md5response", std::str::from_utf8(md5_response)?) + .append_pair("md5response", md5_response) .append_pair("salt", &hex::encode(salt)) .append_pair("psql_session_id", psql_session_id); @@ -55,18 +116,20 @@ impl CPlaneApi<'_> { // TODO: leverage `reqwest::Client` to reuse connections let resp = reqwest::get(url).await?; if !resp.status().is_success() { - bail!("Auth failed: {}", resp.status()) + return Err(AuthErrorImpl::HttpStatus(resp.status()).into()); } let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text().await?.as_str())?; println!("got auth info: #{:?}", auth_info); use ProxyAuthResponse::*; - match auth_info { - Ready { conn_info } => Ok(conn_info), - Error { error } => bail!(error), - NotReady { .. } => waiter.await?.map_err(|e| anyhow!(e)), - } + let db_info = match auth_info { + Ready { conn_info } => conn_info, + Error { error } => return Err(AuthErrorImpl::AuthFailed(error).into()), + NotReady { .. } => waiter.await?.map_err(AuthErrorImpl::AuthFailed)?, + }; + + Ok(db_info) }) .await } diff --git a/proxy/src/error.rs b/proxy/src/error.rs new file mode 100644 index 0000000000..e98e553f83 --- /dev/null +++ b/proxy/src/error.rs @@ -0,0 +1,17 @@ +/// Marks errors that may be safely shown to a client. +/// This trait can be seen as a specialized version of [`ToString`]. +/// +/// NOTE: This trait should not be implemented for [`anyhow::Error`], since it +/// is way too convenient and tends to proliferate all across the codebase, +/// ultimately leading to accidental leaks of sensitive data. +pub trait UserFacingError: ToString { + /// Format the error for client, stripping all sensitive info. + /// + /// Although this might be a no-op for many types, it's highly + /// recommended to override the default impl in case error type + /// contains anything sensitive: various IDs, IP addresses etc. 
+ #[inline(always)] + fn to_string_client(&self) -> String { + self.to_string() + } +} diff --git a/proxy/src/http.rs b/proxy/src/http.rs index 0b693d88dd..33d134678f 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -7,7 +7,7 @@ use zenith_utils::http::json::json_response; use zenith_utils::http::{RouterBuilder, RouterService}; async fn status_handler(_: Request) -> Result, ApiError> { - Ok(json_response(StatusCode::OK, "")?) + json_response(StatusCode::OK, "") } fn make_router() -> RouterBuilder { diff --git a/proxy/src/main.rs b/proxy/src/main.rs index fb3bf725b8..bd99d0a639 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -20,13 +20,14 @@ mod cancellation; mod compute; mod config; mod cplane_api; +mod error; mod http; mod mgmt; mod proxy; mod stream; mod waiters; -/// Flattens Result> into Result. +/// Flattens `Result>` into `Result`. async fn flatten_err( f: impl Future, JoinError>>, ) -> anyhow::Result<()> { @@ -122,7 +123,7 @@ async fn main() -> anyhow::Result<()> { None => RouterConfig::Dynamic(auth_method), Some(addr) => { if let ClientAuthMethod::Password = auth_method { - let (host, port) = addr.split_once(":").unwrap(); + let (host, port) = addr.split_once(':').unwrap(); RouterConfig::Static { host: host.to_string(), port: port.parse().unwrap(), diff --git a/proxy/src/mgmt.rs b/proxy/src/mgmt.rs index 55b49b441f..e53542dfd2 100644 --- a/proxy/src/mgmt.rs +++ b/proxy/src/mgmt.rs @@ -79,6 +79,18 @@ enum PsqlSessionResult { Failure(String), } +/// A message received by `mgmt` when a compute node is ready. +pub type ComputeReady = Result; + +impl PsqlSessionResult { + fn into_compute_ready(self) -> ComputeReady { + match self { + Self::Success(db_info) => Ok(db_info), + Self::Failure(message) => Err(message), + } + } +} + impl postgres_backend::Handler for MgmtHandler { fn process_query( &mut self, @@ -99,13 +111,7 @@ fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::R let resp: PsqlSessionResponse = serde_json::from_str(query_string)?; - use PsqlSessionResult::*; - let msg = match resp.result { - Success(db_info) => Ok(db_info), - Failure(message) => Err(message), - }; - - match cplane_api::notify(&resp.session_id, msg) { + match cplane_api::notify(&resp.session_id, resp.result.into_compute_ready()) { Ok(()) => { pgb.write_message_noflush(&SINGLE_COL_ROWDESC)? .write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))? diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 1dc301b792..3c7f59bc26 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -1,17 +1,18 @@ use crate::auth; -use crate::cancellation::{self, CancelClosure, CancelMap}; -use crate::compute::DatabaseInfo; +use crate::cancellation::{self, CancelMap}; use crate::config::{ProxyConfig, TlsConfig}; use crate::stream::{MetricsStream, PqStream, Stream}; use anyhow::{bail, Context}; +use futures::TryFutureExt; use lazy_static::lazy_static; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::net::TcpStream; -use tokio_postgres::NoTls; use zenith_metrics::{new_common_metric_name, register_int_counter, IntCounter}; use zenith_utils::pq_proto::{BeMessage as Be, *}; +const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; +const ERR_PROTO_VIOLATION: &str = "protocol violation"; + lazy_static! { static ref NUM_CONNECTIONS_ACCEPTED_COUNTER: IntCounter = register_int_counter!( new_common_metric_name("num_connections_accepted"), @@ -30,6 +31,7 @@ lazy_static! 
{ .unwrap(); }
 
+/// A small combinator for pluggable error logging.
 async fn log_error(future: F) -> F::Output
 where
 F: std::future::Future>,
 {
@@ -76,20 +78,21 @@ async fn handle_client(
 }
 
 let tls = config.tls_config.clone();
- if let Some((client, creds)) = handshake(stream, tls, cancel_map).await? {
- cancel_map
- .with_session(|session| async {
- connect_client_to_db(config, session, client, creds).await
- })
- .await?;
- }
+ let (stream, creds) = match handshake(stream, tls, cancel_map).await? {
+ Some(x) => x,
+ None => return Ok(()), // it's a cancellation request
+ };
 
- Ok(())
+ let client = Client::new(stream, creds);
+ cancel_map
+ .with_session(|session| client.connect_to_db(config, session))
+ .await
 }
 
-/// Handle a connection from one client.
-/// For better testing experience, `stream` can be
-/// any object satisfying the traits.
+/// Establish a (most probably secure) connection with the client.
+/// For better testing experience, `stream` can be any object satisfying the traits.
+/// It's easier to work with an owned `stream` here, as we need to upgrade it to TLS;
+/// we also take extra care to propagate only select handshake errors to the client.
 async fn handshake(
 stream: S,
 mut tls: Option,
@@ -119,7 +122,7 @@ async fn handshake(
 stream = PqStream::new(stream.into_inner().upgrade(tls).await?);
 }
 }
- _ => bail!("protocol violation"),
+ _ => bail!(ERR_PROTO_VIOLATION),
 },
 GssEncRequest => match stream.get_ref() {
 Stream::Raw { .. } if !tried_gss => {
@@ -128,18 +131,21 @@ async fn handshake(
 // Currently, we don't support GSSAPI
 stream.write_message(&Be::EncryptionResponse(false)).await?;
 }
- _ => bail!("protocol violation"),
+ _ => bail!(ERR_PROTO_VIOLATION),
 },
 StartupMessage { params, .. } => {
 // Check that the config has been consumed during upgrade
 // OR we didn't provide it at all (for dev purposes).
 if tls.is_some() {
- let msg = "connection is insecure (try using `sslmode=require`)";
- stream.write_message(&Be::ErrorResponse(msg)).await?;
- bail!(msg);
+ stream.throw_error_str(ERR_INSECURE_CONNECTION).await?;
 }
 
- break Ok(Some((stream, params.try_into()?)));
+ // Here and forth: `or_else` demands that we use a future here
+ let creds = async { params.try_into() }
+ .or_else(|e| stream.throw_error(e))
+ .await?;
+
+ break Ok(Some((stream, creds)));
 }
 CancelRequest(cancel_key_data) => {
 cancel_map.cancel_session(cancel_key_data).await?;
@@ -150,58 +156,60 @@ async fn handshake(
 }
 }
 
-async fn connect_client_to_db(
- config: &ProxyConfig,
- session: cancellation::Session<'_>,
- mut client: PqStream,
+/// Thin connection context.
+struct Client {
+ /// The underlying libpq protocol stream.
+ stream: PqStream,
+ /// Client credentials that we care about.
 creds: auth::ClientCredentials,
-) -> anyhow::Result<()> {
- let db_info = creds.authenticate(config, &mut client).await?;
- let (db, version, cancel_closure) = connect_to_db(db_info).await?;
- let cancel_key_data = session.enable_cancellation(cancel_closure);
-
- client
- .write_message_noflush(&BeMessage::ParameterStatus(
- BeParameterStatusMessage::ServerVersion(&version),
- ))?
- .write_message_noflush(&Be::BackendKeyData(cancel_key_data))?
- .write_message(&BeMessage::ReadyForQuery)
- .await?;
-
- // This function will be called for writes to either direction.
- fn inc_proxied(cnt: usize) {
- // Consider inventing something more sophisticated
- // if this ever becomes a bottleneck (cacheline bouncing).
- NUM_BYTES_PROXIED_COUNTER.inc_by(cnt as u64); - } - - let mut db = MetricsStream::new(db, inc_proxied); - let mut client = MetricsStream::new(client.into_inner(), inc_proxied); - let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?; - - Ok(()) } -/// Connect to a corresponding compute node. -async fn connect_to_db( - db_info: DatabaseInfo, -) -> anyhow::Result<(TcpStream, String, CancelClosure)> { - // TODO: establish a secure connection to the DB - let socket_addr = db_info.socket_addr()?; - let mut socket = TcpStream::connect(socket_addr).await?; +impl Client { + /// Construct a new connection context. + fn new(stream: PqStream, creds: auth::ClientCredentials) -> Self { + Self { stream, creds } + } +} - let (client, conn) = tokio_postgres::Config::from(db_info) - .connect_raw(&mut socket, NoTls) - .await?; +impl Client { + /// Let the client authenticate and connect to the designated compute node. + async fn connect_to_db( + self, + config: &ProxyConfig, + session: cancellation::Session<'_>, + ) -> anyhow::Result<()> { + let Self { mut stream, creds } = self; - let version = conn - .parameter("server_version") - .context("failed to fetch postgres server version")? - .into(); + // Authenticate and connect to a compute node. + let auth = creds.authenticate(config, &mut stream).await; + let db_info = async { auth }.or_else(|e| stream.throw_error(e)).await?; - let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token()); + let (db, version, cancel_closure) = + db_info.connect().or_else(|e| stream.throw_error(e)).await?; + let cancel_key_data = session.enable_cancellation(cancel_closure); - Ok((socket, version, cancel_closure)) + stream + .write_message_noflush(&BeMessage::ParameterStatus( + BeParameterStatusMessage::ServerVersion(&version), + ))? + .write_message_noflush(&Be::BackendKeyData(cancel_key_data))? + .write_message(&BeMessage::ReadyForQuery) + .await?; + + /// This function will be called for writes to either direction. + fn inc_proxied(cnt: usize) { + // Consider inventing something more sophisticated + // if this ever becomes a bottleneck (cacheline bouncing). + NUM_BYTES_PROXIED_COUNTER.inc_by(cnt as u64); + } + + // Starting from here we only proxy the client's traffic. + let mut db = MetricsStream::new(db, inc_proxied); + let mut client = MetricsStream::new(stream.into_inner(), inc_proxied); + let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?; + + Ok(()) + } } #[cfg(test)] @@ -210,7 +218,7 @@ mod tests { use tokio::io::DuplexStream; use tokio_postgres::config::SslMode; - use tokio_postgres::tls::MakeTlsConnect; + use tokio_postgres::tls::{MakeTlsConnect, NoTls}; use tokio_postgres_rustls::MakeRustlsConnect; async fn dummy_proxy( @@ -264,7 +272,7 @@ mod tests { let proxy = tokio::spawn(dummy_proxy(client, Some(server_config.into()))); - tokio_postgres::Config::new() + let client_err = tokio_postgres::Config::new() .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Disable) @@ -273,11 +281,15 @@ mod tests { .err() // -> Option .context("client shouldn't be able to connect")?; - proxy + assert!(client_err.to_string().contains(ERR_INSECURE_CONNECTION)); + + let server_err = proxy .await? .err() // -> Option .context("server shouldn't accept client")?; + assert!(client_err.to_string().contains(&server_err.to_string())); + Ok(()) } @@ -329,4 +341,30 @@ mod tests { proxy.await? 
} + + #[tokio::test] + async fn give_user_an_error_for_bad_creds() -> anyhow::Result<()> { + let (client, server) = tokio::io::duplex(1024); + + let proxy = tokio::spawn(dummy_proxy(client, None)); + + let client_err = tokio_postgres::Config::new() + .ssl_mode(SslMode::Disable) + .connect_raw(server, NoTls) + .await + .err() // -> Option + .context("client shouldn't be able to connect")?; + + // TODO: this is ugly, but `format!` won't allow us to extract fmt string + assert!(client_err.to_string().contains("missing in startup packet")); + + let server_err = proxy + .await? + .err() // -> Option + .context("server shouldn't accept client")?; + + assert!(client_err.to_string().contains(&server_err.to_string())); + + Ok(()) + } } diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 8fd5bef388..fb0be84584 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,10 +1,12 @@ -use anyhow::Context; +use crate::error::UserFacingError; +use anyhow::bail; use bytes::BytesMut; use pin_project_lite::pin_project; use rustls::ServerConfig; use std::pin::Pin; use std::sync::Arc; use std::{io, task}; +use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, ReadBuf}; use tokio_rustls::server::TlsStream; use zenith_utils::pq_proto::{BeMessage, FeMessage, FeStartupPacket}; @@ -35,38 +37,63 @@ impl PqStream { self.stream } - /// Get a reference to the underlying stream. + /// Get a shared reference to the underlying stream. pub fn get_ref(&self) -> &S { &self.stream } } +fn err_connection() -> io::Error { + io::Error::new(io::ErrorKind::ConnectionAborted, "connection is lost") +} + +// TODO: change error type of `FeMessage::read_fut` +fn from_anyhow(e: anyhow::Error) -> io::Error { + io::Error::new(io::ErrorKind::Other, e.to_string()) +} + impl PqStream { /// Receive [`FeStartupPacket`], which is a first packet sent by a client. - pub async fn read_startup_packet(&mut self) -> anyhow::Result { - match FeStartupPacket::read_fut(&mut self.stream).await? { - Some(FeMessage::StartupPacket(packet)) => Ok(packet), - None => anyhow::bail!("connection is lost"), - other => anyhow::bail!("bad message type: {:?}", other), + pub async fn read_startup_packet(&mut self) -> io::Result { + // TODO: `FeStartupPacket::read_fut` should return `FeStartupPacket` + let msg = FeStartupPacket::read_fut(&mut self.stream) + .await + .map_err(from_anyhow)? + .ok_or_else(err_connection)?; + + match msg { + FeMessage::StartupPacket(packet) => Ok(packet), + _ => panic!("unreachable state"), } } - pub async fn read_message(&mut self) -> anyhow::Result { + pub async fn read_password_message(&mut self) -> io::Result { + match self.read_message().await? { + FeMessage::PasswordMessage(msg) => Ok(msg), + bad => Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("unexpected message type: {:?}", bad), + )), + } + } + + async fn read_message(&mut self) -> io::Result { FeMessage::read_fut(&mut self.stream) - .await? - .context("connection is lost") + .await + .map_err(from_anyhow)? + .ok_or_else(err_connection) } } impl PqStream { /// Write the message into an internal buffer, but don't flush the underlying stream. - pub fn write_message_noflush<'a>(&mut self, message: &BeMessage<'a>) -> io::Result<&mut Self> { + pub fn write_message_noflush(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> { BeMessage::write(&mut self.buffer, message)?; Ok(self) } /// Write the message into an internal buffer and flush it. 
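 // The intended batching pattern, mirrored from its call site in proxy.rs above:
 // buffer several messages, then flush exactly once at the end:
 //
 //     stream
 //         .write_message_noflush(&BeMessage::BackendKeyData(cancel_key_data))?
 //         .write_message(&BeMessage::ReadyForQuery)
 //         .await?;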
- pub async fn write_message<'a>(&mut self, message: &BeMessage<'a>) -> io::Result<&mut Self> {
+ pub async fn write_message(&mut self, message: &BeMessage<'_>) -> io::Result<&mut Self> {
 self.write_message_noflush(message)?;
 self.flush().await?;
 Ok(self)
@@ -79,6 +106,25 @@ impl PqStream {
 self.stream.flush().await?;
 Ok(self)
 }
+
+ /// Write the error message using [`Self::write_message`], then re-throw it.
+ /// Allowing string literals is safe under the assumption that they cannot contain any runtime info.
+ pub async fn throw_error_str(&mut self, error: &'static str) -> anyhow::Result {
+ // This method exists due to `&str` not implementing `Into`
+ self.write_message(&BeMessage::ErrorResponse(error)).await?;
+ bail!(error)
+ }
+
+ /// Write the error message using [`Self::write_message`], then re-throw it.
+ /// Trait [`UserFacingError`] acts as an allowlist for error types.
+ pub async fn throw_error(&mut self, error: E) -> anyhow::Result
+ where
+ E: UserFacingError + Into,
+ {
+ let msg = error.to_string_client();
+ self.write_message(&BeMessage::ErrorResponse(&msg)).await?;
+ bail!(error)
+ }
 }
 
 pin_project! {
@@ -101,15 +147,25 @@ impl Stream {
 }
 }
 
+#[derive(Debug, Error)]
+pub enum StreamUpgradeError {
+ #[error("Bad state reached: can't upgrade TLS stream")]
+ AlreadyTls,
+
+ #[error("Can't upgrade stream: IO error: {0}")]
+ Io(#[from] io::Error),
+}
+
 impl Stream {
 /// If possible, upgrade raw stream into a secure TLS-based stream.
- pub async fn upgrade(self, cfg: Arc) -> anyhow::Result {
+ pub async fn upgrade(self, cfg: Arc) -> Result {
 match self {
 Stream::Raw { raw } => {
 let tls = Box::new(tokio_rustls::TlsAcceptor::from(cfg).accept(raw).await?);
 Ok(Stream::Tls { tls })
 }
- Stream::Tls { .. } => anyhow::bail!("can't upgrade TLS stream"),
+ Stream::Tls { ..
} => Err(StreamUpgradeError::AlreadyTls), } } } diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index 9fda3ed94f..799d45a165 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -1,11 +1,32 @@ -use anyhow::{anyhow, Context}; use hashbrown::HashMap; use parking_lot::Mutex; use pin_project_lite::pin_project; use std::pin::Pin; use std::task; +use thiserror::Error; use tokio::sync::oneshot; +#[derive(Debug, Error)] +pub enum RegisterError { + #[error("Waiter `{0}` already registered")] + Occupied(String), +} + +#[derive(Debug, Error)] +pub enum NotifyError { + #[error("Notify failed: waiter `{0}` not registered")] + NotFound(String), + + #[error("Notify failed: channel hangup")] + Hangup, +} + +#[derive(Debug, Error)] +pub enum WaitError { + #[error("Wait failed: channel hangup")] + Hangup, +} + pub struct Waiters(pub(self) Mutex>>); impl Default for Waiters { @@ -15,13 +36,13 @@ impl Default for Waiters { } impl Waiters { - pub fn register(&self, key: String) -> anyhow::Result> { + pub fn register(&self, key: String) -> Result, RegisterError> { let (tx, rx) = oneshot::channel(); self.0 .lock() .try_insert(key.clone(), tx) - .map_err(|_| anyhow!("waiter already registered"))?; + .map_err(|e| RegisterError::Occupied(e.entry.key().clone()))?; Ok(Waiter { receiver: rx, @@ -32,7 +53,7 @@ impl Waiters { }) } - pub fn notify(&self, key: &str, value: T) -> anyhow::Result<()> + pub fn notify(&self, key: &str, value: T) -> Result<(), NotifyError> where T: Send + Sync, { @@ -40,9 +61,9 @@ impl Waiters { .0 .lock() .remove(key) - .with_context(|| format!("key {} not found", key))?; + .ok_or_else(|| NotifyError::NotFound(key.to_string()))?; - tx.send(value).map_err(|_| anyhow!("waiter channel hangup")) + tx.send(value).map_err(|_| NotifyError::Hangup) } } @@ -66,13 +87,13 @@ pin_project! { } impl std::future::Future for Waiter<'_, T> { - type Output = anyhow::Result; + type Output = Result; fn poll(self: Pin<&mut Self>, cx: &mut task::Context<'_>) -> task::Poll { self.project() .receiver .poll(cx) - .map_err(|_| anyhow!("channel hangup")) + .map_err(|_| WaitError::Hangup) } } diff --git a/test_runner/README.md b/test_runner/README.md index 514c5f1e3a..a56c2df2c0 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -89,7 +89,7 @@ def test_foobar(zenith_env_builder: ZenithEnvBuilder): # Now create the environment. This initializes the repository, and starts # up the page server and the safekeepers - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() # Run the test ... 
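Stepping back to the waiters rework above: the stringly-typed anyhow errors are now
split into dedicated RegisterError, NotifyError and WaitError types. A minimal sketch
of the intended register/notify/await round-trip, assuming `Waiters` is generic over
the payload (as its impls suggest), the module is importable as `proxy::waiters`, and
a tokio runtime is available; this is an illustration, not part of the patch:

```rust
use proxy::waiters::Waiters;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // In the proxy the map is keyed by psql session id; any String key works here.
    let waiters: Waiters<String> = Default::default();

    // Registering the same key twice now fails with RegisterError::Occupied.
    let waiter = waiters.register("session-1".to_owned())?;

    // In the proxy, the mgmt handler calls notify() once the compute node is ready.
    waiters.notify("session-1", "compute-ready".to_owned())?;

    // The waiter future resolves with the notified value, or WaitError::Hangup
    // if the sender side is dropped without notifying.
    assert_eq!(waiter.await?, "compute-ready");
    Ok(())
}
```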
diff --git a/test_runner/batch_others/test_auth.py b/test_runner/batch_others/test_auth.py index 7f86986e2e..bda6349ef9 100644 --- a/test_runner/batch_others/test_auth.py +++ b/test_runner/batch_others/test_auth.py @@ -1,14 +1,14 @@ from contextlib import closing from typing import Iterator from uuid import UUID, uuid4 -import psycopg2 from fixtures.zenith_fixtures import ZenithEnvBuilder, ZenithPageserverApiException +from requests.exceptions import HTTPError import pytest def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.pageserver_auth_enabled = True - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() ps = env.pageserver @@ -25,25 +25,31 @@ def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder): ps.safe_psql("set FOO", password=tenant_token) ps.safe_psql("set FOO", password=management_token) + new_timeline_id = env.zenith_cli.create_branch('test_pageserver_auth', + tenant_id=env.initial_tenant) + # tenant can create branches - tenant_http_client.branch_create(env.initial_tenant, 'new1', 'main') + tenant_http_client.timeline_create(tenant_id=env.initial_tenant, + ancestor_timeline_id=new_timeline_id) # console can create branches for tenant - management_http_client.branch_create(env.initial_tenant, 'new2', 'main') + management_http_client.timeline_create(tenant_id=env.initial_tenant, + ancestor_timeline_id=new_timeline_id) # fail to create branch using token with different tenant_id with pytest.raises(ZenithPageserverApiException, match='Forbidden: Tenant id mismatch. Permission denied'): - invalid_tenant_http_client.branch_create(env.initial_tenant, "new3", "main") + invalid_tenant_http_client.timeline_create(tenant_id=env.initial_tenant, + ancestor_timeline_id=new_timeline_id) # create tenant using management token - management_http_client.tenant_create(uuid4()) + management_http_client.tenant_create() # fail to create tenant using tenant token with pytest.raises( ZenithPageserverApiException, match='Forbidden: Attempt to access management api with tenant scope. 
Permission denied' ): - tenant_http_client.tenant_create(uuid4()) + tenant_http_client.tenant_create() @pytest.mark.parametrize('with_wal_acceptors', [False, True]) @@ -51,11 +57,10 @@ def test_compute_auth_to_pageserver(zenith_env_builder: ZenithEnvBuilder, with_w zenith_env_builder.pageserver_auth_enabled = True if with_wal_acceptors: zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() - - branch = f"test_compute_auth_to_pageserver{with_wal_acceptors}" - env.zenith_cli.create_branch(branch, "main") + env = zenith_env_builder.init_start() + branch = f'test_compute_auth_to_pageserver{with_wal_acceptors}' + env.zenith_cli.create_branch(branch) pg = env.postgres.create_start(branch) with closing(pg.connect()) as conn: diff --git a/test_runner/batch_others/test_backpressure.py b/test_runner/batch_others/test_backpressure.py index 23af5b90ed..ff34121327 100644 --- a/test_runner/batch_others/test_backpressure.py +++ b/test_runner/batch_others/test_backpressure.py @@ -93,9 +93,9 @@ def check_backpressure(pg: Postgres, stop_event: threading.Event, polling_interv def test_backpressure_received_lsn_lag(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() # Create a branch for us - env.zenith_cli.create_branch("test_backpressure", "main") + env.zenith_cli.create_branch('test_backpressure') pg = env.postgres.create_start('test_backpressure', config_lines=['max_replication_write_lag=30MB']) diff --git a/test_runner/batch_others/test_branch_behind.py b/test_runner/batch_others/test_branch_behind.py index 860db51c8a..4e2be352f4 100644 --- a/test_runner/batch_others/test_branch_behind.py +++ b/test_runner/batch_others/test_branch_behind.py @@ -19,11 +19,10 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): # # See https://github.com/zenithdb/zenith/issues/1068 zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() # Branch at the point where only 100 rows were inserted - env.zenith_cli.create_branch("test_branch_behind", "main") - + env.zenith_cli.create_branch('test_branch_behind') pgmain = env.postgres.create_start('test_branch_behind') log.info("postgres is running on 'test_branch_behind' branch") @@ -60,7 +59,9 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder): log.info(f'LSN after 200100 rows: {lsn_b}') # Branch at the point where only 100 rows were inserted - env.zenith_cli.create_branch("test_branch_behind_hundred", "test_branch_behind@" + lsn_a) + env.zenith_cli.create_branch('test_branch_behind_hundred', + 'test_branch_behind', + ancestor_start_lsn=lsn_a) # Insert many more rows. This generates enough WAL to fill a few segments. 
     main_cur.execute('''
@@ -75,10 +76,12 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder):
     log.info(f'LSN after 400100 rows: {lsn_c}')
 
     # Branch at the point where only 200100 rows were inserted
-    env.zenith_cli.create_branch("test_branch_behind_more", "test_branch_behind@" + lsn_b)
+    env.zenith_cli.create_branch('test_branch_behind_more',
+                                 'test_branch_behind',
+                                 ancestor_start_lsn=lsn_b)
 
-    pg_hundred = env.postgres.create_start("test_branch_behind_hundred")
-    pg_more = env.postgres.create_start("test_branch_behind_more")
+    pg_hundred = env.postgres.create_start('test_branch_behind_hundred')
+    pg_more = env.postgres.create_start('test_branch_behind_more')
 
     # On the 'hundred' branch, we should see only 100 rows
     hundred_pg_conn = pg_hundred.connect()
@@ -99,19 +102,23 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder):
 
     # Check bad LSNs for branching
 
     # branch at segment boundary
-    env.zenith_cli.create_branch("test_branch_segment_boundary", "test_branch_behind@0/3000000")
-    pg = env.postgres.create_start("test_branch_segment_boundary")
+    env.zenith_cli.create_branch('test_branch_segment_boundary',
+                                 'test_branch_behind',
+                                 ancestor_start_lsn="0/3000000")
+    pg = env.postgres.create_start('test_branch_segment_boundary')
     cur = pg.connect().cursor()
     cur.execute('SELECT 1')
     assert cur.fetchone() == (1, )
 
     # branch at pre-initdb lsn
     with pytest.raises(Exception, match="invalid branch start lsn"):
-        env.zenith_cli.create_branch("test_branch_preinitdb", "main@0/42")
+        env.zenith_cli.create_branch('test_branch_preinitdb', ancestor_start_lsn="0/42")
 
     # branch at pre-ancestor lsn
     with pytest.raises(Exception, match="less than timeline ancestor lsn"):
-        env.zenith_cli.create_branch("test_branch_preinitdb", "test_branch_behind@0/42")
+        env.zenith_cli.create_branch('test_branch_preinitdb',
+                                     'test_branch_behind',
+                                     ancestor_start_lsn="0/42")
 
     # check that we cannot create branch based on garbage collected data
     with closing(env.pageserver.connect()) as psconn:
@@ -123,7 +130,9 @@ def test_branch_behind(zenith_env_builder: ZenithEnvBuilder):
 
         with pytest.raises(Exception, match="invalid branch start lsn"):
             # this gced_lsn is pretty random, so if gc is disabled this wouldn't fail
-            env.zenith_cli.create_branch("test_branch_create_fail", f"test_branch_behind@{gced_lsn}")
+            env.zenith_cli.create_branch('test_branch_create_fail',
+                                         'test_branch_behind',
+                                         ancestor_start_lsn=gced_lsn)
 
     # check that after gc everything is still there
     hundred_cur.execute('SELECT count(*) FROM foo')
diff --git a/test_runner/batch_others/test_clog_truncate.py b/test_runner/batch_others/test_clog_truncate.py
index 504f455936..b7eeedb23e 100644
--- a/test_runner/batch_others/test_clog_truncate.py
+++ b/test_runner/batch_others/test_clog_truncate.py
@@ -12,7 +12,7 @@ from fixtures.log_helper import log
 #
 def test_clog_truncate(zenith_simple_env: ZenithEnv):
     env = zenith_simple_env
-    env.zenith_cli.create_branch("test_clog_truncate", "empty")
+    env.zenith_cli.create_branch('test_clog_truncate', 'empty')
 
     # set aggressive autovacuum to make sure that truncation will happen
     config = [
@@ -62,9 +62,9 @@ def test_clog_truncate(zenith_simple_env: ZenithEnv):
 
     # create new branch after clog truncation and start a compute node on it
     log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}')
-    env.zenith_cli.create_branch("test_clog_truncate_new",
-                                 "test_clog_truncate@" + lsn_after_truncation)
-
+    env.zenith_cli.create_branch('test_clog_truncate_new',
+                                 'test_clog_truncate',
+                                 
ancestor_start_lsn=lsn_after_truncation) pg2 = env.postgres.create_start('test_clog_truncate_new') log.info('postgres is running on test_clog_truncate_new branch') diff --git a/test_runner/batch_others/test_createdropdb.py b/test_runner/batch_others/test_createdropdb.py index 38243b298b..88937fa0dc 100644 --- a/test_runner/batch_others/test_createdropdb.py +++ b/test_runner/batch_others/test_createdropdb.py @@ -11,7 +11,7 @@ from fixtures.log_helper import log # def test_createdb(zenith_simple_env: ZenithEnv): env = zenith_simple_env - env.zenith_cli.create_branch("test_createdb", "empty") + env.zenith_cli.create_branch('test_createdb', 'empty') pg = env.postgres.create_start('test_createdb') log.info("postgres is running on 'test_createdb' branch") @@ -27,8 +27,7 @@ def test_createdb(zenith_simple_env: ZenithEnv): lsn = cur.fetchone()[0] # Create a branch - env.zenith_cli.create_branch("test_createdb2", "test_createdb@" + lsn) - + env.zenith_cli.create_branch('test_createdb2', 'test_createdb', ancestor_start_lsn=lsn) pg2 = env.postgres.create_start('test_createdb2') # Test that you can connect to the new database on both branches @@ -41,8 +40,7 @@ def test_createdb(zenith_simple_env: ZenithEnv): # def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir): env = zenith_simple_env - env.zenith_cli.create_branch("test_dropdb", "empty") - + env.zenith_cli.create_branch('test_dropdb', 'empty') pg = env.postgres.create_start('test_dropdb') log.info("postgres is running on 'test_dropdb' branch") @@ -66,10 +64,14 @@ def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir): lsn_after_drop = cur.fetchone()[0] # Create two branches before and after database drop. - env.zenith_cli.create_branch("test_before_dropdb", "test_dropdb@" + lsn_before_drop) + env.zenith_cli.create_branch('test_before_dropdb', + 'test_dropdb', + ancestor_start_lsn=lsn_before_drop) pg_before = env.postgres.create_start('test_before_dropdb') - env.zenith_cli.create_branch("test_after_dropdb", "test_dropdb@" + lsn_after_drop) + env.zenith_cli.create_branch('test_after_dropdb', + 'test_dropdb', + ancestor_start_lsn=lsn_after_drop) pg_after = env.postgres.create_start('test_after_dropdb') # Test that database exists on the branch before drop diff --git a/test_runner/batch_others/test_createuser.py b/test_runner/batch_others/test_createuser.py index 1959b47dcc..efb2af3f07 100644 --- a/test_runner/batch_others/test_createuser.py +++ b/test_runner/batch_others/test_createuser.py @@ -9,8 +9,7 @@ from fixtures.log_helper import log # def test_createuser(zenith_simple_env: ZenithEnv): env = zenith_simple_env - env.zenith_cli.create_branch("test_createuser", "empty") - + env.zenith_cli.create_branch('test_createuser', 'empty') pg = env.postgres.create_start('test_createuser') log.info("postgres is running on 'test_createuser' branch") @@ -25,8 +24,7 @@ def test_createuser(zenith_simple_env: ZenithEnv): lsn = cur.fetchone()[0] # Create a branch - env.zenith_cli.create_branch("test_createuser2", "test_createuser@" + lsn) - + env.zenith_cli.create_branch('test_createuser2', 'test_createuser', ancestor_start_lsn=lsn) pg2 = env.postgres.create_start('test_createuser2') # Test that you can connect to new branch as a new user diff --git a/test_runner/batch_others/test_multixact.py b/test_runner/batch_others/test_multixact.py index 6a2afd2ede..7a508a67fb 100644 --- a/test_runner/batch_others/test_multixact.py +++ b/test_runner/batch_others/test_multixact.py @@ -10,7 +10,7 @@ from fixtures.log_helper import log # def 
test_multixact(zenith_simple_env: ZenithEnv, test_output_dir): env = zenith_simple_env - env.zenith_cli.create_branch("test_multixact", "empty") + env.zenith_cli.create_branch('test_multixact', 'empty') pg = env.postgres.create_start('test_multixact') log.info("postgres is running on 'test_multixact' branch") @@ -60,7 +60,7 @@ def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir): assert int(next_multixact_id) > int(next_multixact_id_old) # Branch at this point - env.zenith_cli.create_branch("test_multixact_new", "test_multixact@" + lsn) + env.zenith_cli.create_branch('test_multixact_new', 'test_multixact', ancestor_start_lsn=lsn) pg_new = env.postgres.create_start('test_multixact_new') log.info("postgres is running on 'test_multixact_new' branch") diff --git a/test_runner/batch_others/test_next_xid.py b/test_runner/batch_others/test_next_xid.py index 625abc39d3..fd0f761409 100644 --- a/test_runner/batch_others/test_next_xid.py +++ b/test_runner/batch_others/test_next_xid.py @@ -11,7 +11,7 @@ from fixtures.log_helper import log def test_next_xid(zenith_env_builder: ZenithEnvBuilder): # One safekeeper is enough for this test. zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() pg = env.postgres.create_start('main') diff --git a/test_runner/batch_others/test_pageserver_api.py b/test_runner/batch_others/test_pageserver_api.py index eccffc4d69..2aa3686904 100644 --- a/test_runner/batch_others/test_pageserver_api.py +++ b/test_runner/batch_others/test_pageserver_api.py @@ -1,8 +1,15 @@ -import json from uuid import uuid4, UUID -from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient -from typing import cast -import pytest, psycopg2 +import pytest +from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient, zenith_binpath + + +# test that we cannot override node id +def test_pageserver_init_node_id(zenith_env_builder: ZenithEnvBuilder): + env = zenith_env_builder.init() + with pytest.raises( + Exception, + match="node id can only be set during pageserver init and cannot be overridden"): + env.pageserver.start(overrides=['--pageserver-config-override=id=10']) def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID): @@ -16,21 +23,25 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: UUID): client.tenant_create(tenant_id) assert tenant_id.hex in {t['id'] for t in client.tenant_list()} - # check its timelines + timelines = client.timeline_list(tenant_id) + assert len(timelines) == 0, "initial tenant should not have any timelines" + + # create timeline + timeline_id = uuid4() + client.timeline_create(tenant_id=tenant_id, new_timeline_id=timeline_id) + timelines = client.timeline_list(tenant_id) assert len(timelines) > 0 - for timeline_id_str in timelines: - timeline_details = client.timeline_detail(tenant_id, UUID(timeline_id_str)) - assert timeline_details['type'] == 'Local' - assert timeline_details['tenant_id'] == tenant_id.hex - assert timeline_details['timeline_id'] == timeline_id_str - - # create branch - branch_name = uuid4().hex - client.branch_create(tenant_id, branch_name, "main") # check it is there - assert branch_name in {b['name'] for b in client.branch_list(tenant_id)} + assert timeline_id.hex in {b['timeline_id'] for b in client.timeline_list(tenant_id)} + for timeline in timelines: + timeline_id_str = str(timeline['timeline_id']) + timeline_details = client.timeline_detail(tenant_id=tenant_id, + 
timeline_id=UUID(timeline_id_str)) + assert timeline_details['kind'] == 'Local' + assert timeline_details['tenant_id'] == tenant_id.hex + assert timeline_details['timeline_id'] == timeline_id_str def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv): @@ -41,7 +52,7 @@ def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv): def test_pageserver_http_api_client_auth_enabled(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.pageserver_auth_enabled = True - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() management_token = env.auth_keys.generate_management_token() diff --git a/test_runner/batch_others/test_pageserver_catchup.py b/test_runner/batch_others/test_pageserver_catchup.py index 97dc0f3260..7093a1bdb3 100644 --- a/test_runner/batch_others/test_pageserver_catchup.py +++ b/test_runner/batch_others/test_pageserver_catchup.py @@ -14,9 +14,9 @@ from fixtures.log_helper import log # and new compute node contains all data. def test_pageserver_catchup_while_compute_down(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_pageserver_catchup_while_compute_down", "main") + env.zenith_cli.create_branch('test_pageserver_catchup_while_compute_down') pg = env.postgres.create_start('test_pageserver_catchup_while_compute_down') pg_conn = pg.connect() diff --git a/test_runner/batch_others/test_pageserver_restart.py b/test_runner/batch_others/test_pageserver_restart.py index 0cfc50f0ff..57f9db8f96 100644 --- a/test_runner/batch_others/test_pageserver_restart.py +++ b/test_runner/batch_others/test_pageserver_restart.py @@ -13,9 +13,9 @@ from fixtures.log_helper import log def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder): # One safekeeper is enough for this test. 
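check_client above drives the whole timeline lifecycle over plain HTTP; the routes follow the /v1/tenant/{tenant_id}/timeline scheme that ZenithPageserverHttpClient targets in the fixture hunks near the end of this patch. A bare-bones version of the create-then-list exchange using requests (the helper name and standalone shape are illustrative; the paths and JSON keys are the ones the fixture uses):

    import uuid

    import requests

    def create_and_list_timeline(port: int, tenant_id: uuid.UUID) -> uuid.UUID:
        base = f"http://localhost:{port}/v1/tenant/{tenant_id.hex}/timeline"
        new_id = uuid.uuid4()
        # POST creates the timeline, mirroring timeline_create() in the fixture.
        res = requests.post(base,
                            json={
                                'new_timeline_id': new_id.hex,
                                'ancestor_timeline_id': None,
                                'ancestor_start_lsn': None,
                            })
        res.raise_for_status()
        # GET lists the tenant's timelines; the new id must be among them.
        listed = requests.get(base).json()
        assert new_id.hex in {t['timeline_id'] for t in listed}
        return new_id
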
zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_pageserver_restart", "main") + env.zenith_cli.create_branch('test_pageserver_restart') pg = env.postgres.create_start('test_pageserver_restart') pg_conn = pg.connect() diff --git a/test_runner/batch_others/test_parallel_copy.py b/test_runner/batch_others/test_parallel_copy.py index 6f87bc4a36..4b7cc58d42 100644 --- a/test_runner/batch_others/test_parallel_copy.py +++ b/test_runner/batch_others/test_parallel_copy.py @@ -1,7 +1,5 @@ from io import BytesIO import asyncio -import asyncpg -import subprocess from fixtures.zenith_fixtures import ZenithEnv, Postgres from fixtures.log_helper import log diff --git a/test_runner/batch_others/test_proxy.py b/test_runner/batch_others/test_proxy.py index 9510e880b2..d2039f9758 100644 --- a/test_runner/batch_others/test_proxy.py +++ b/test_runner/batch_others/test_proxy.py @@ -1,2 +1,15 @@ +import pytest + + def test_proxy_select_1(static_proxy): static_proxy.safe_psql("select 1;") + + +@pytest.mark.xfail # Proxy eats the extra connection options +def test_proxy_options(static_proxy): + schema_name = "tmp_schema_1" + with static_proxy.connect(schema=schema_name) as conn: + with conn.cursor() as cur: + cur.execute("SHOW search_path;") + search_path = cur.fetchall()[0][0] + assert schema_name == search_path diff --git a/test_runner/batch_others/test_readonly_node.py b/test_runner/batch_others/test_readonly_node.py index ba256e71f7..808ee62def 100644 --- a/test_runner/batch_others/test_readonly_node.py +++ b/test_runner/batch_others/test_readonly_node.py @@ -11,8 +11,7 @@ from fixtures.zenith_fixtures import ZenithEnv # def test_readonly_node(zenith_simple_env: ZenithEnv): env = zenith_simple_env - env.zenith_cli.create_branch("test_readonly_node", "empty") - + env.zenith_cli.create_branch('test_readonly_node', 'empty') pgmain = env.postgres.create_start('test_readonly_node') log.info("postgres is running on 'test_readonly_node' branch") @@ -53,12 +52,14 @@ def test_readonly_node(zenith_simple_env: ZenithEnv): log.info('LSN after 400100 rows: ' + lsn_c) # Create first read-only node at the point where only 100 rows were inserted - pg_hundred = env.postgres.create_start("test_readonly_node_hundred", - branch=f'test_readonly_node@{lsn_a}') + pg_hundred = env.postgres.create_start(branch_name='test_readonly_node', + node_name='test_readonly_node_hundred', + lsn=lsn_a) # And another at the point where 200100 rows were inserted - pg_more = env.postgres.create_start("test_readonly_node_more", - branch=f'test_readonly_node@{lsn_b}') + pg_more = env.postgres.create_start(branch_name='test_readonly_node', + node_name='test_readonly_node_more', + lsn=lsn_b) # On the 'hundred' node, we should see only 100 rows hundred_pg_conn = pg_hundred.connect() @@ -77,8 +78,9 @@ def test_readonly_node(zenith_simple_env: ZenithEnv): assert main_cur.fetchone() == (400100, ) # Check creating a node at segment boundary - pg = env.postgres.create_start("test_branch_segment_boundary", - branch="test_readonly_node@0/3000000") + pg = env.postgres.create_start(branch_name='test_readonly_node', + node_name='test_branch_segment_boundary', + lsn='0/3000000') cur = pg.connect().cursor() cur.execute('SELECT 1') assert cur.fetchone() == (1, ) @@ -86,5 +88,6 @@ def test_readonly_node(zenith_simple_env: ZenithEnv): # Create node at pre-initdb lsn with pytest.raises(Exception, match="invalid basebackup lsn"): # compute node startup with invalid LSN 
should fail - env.zenith_cli.pg_start("test_readonly_node_preinitdb", - timeline_spec="test_readonly_node@0/42") + env.postgres.create_start(branch_name='test_readonly_node', + node_name='test_readonly_node_preinitdb', + lsn='0/42') diff --git a/test_runner/batch_others/test_remote_storage.py b/test_runner/batch_others/test_remote_storage.py index fa6feaf412..edcc768819 100644 --- a/test_runner/batch_others/test_remote_storage.py +++ b/test_runner/batch_others/test_remote_storage.py @@ -42,8 +42,8 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, data_secret = 'very secret secret' ##### First start, insert secret data and upload it to the remote storage - env = zenith_env_builder.init() - pg = env.postgres.create_start() + env = zenith_env_builder.init_start() + pg = env.postgres.create_start('main') tenant_id = pg.safe_psql("show zenith.zenith_tenant")[0][0] timeline_id = pg.safe_psql("show zenith.zenith_timeline")[0][0] @@ -85,7 +85,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, timeline_details = client.timeline_detail(UUID(tenant_id), UUID(timeline_id)) assert timeline_details['timeline_id'] == timeline_id assert timeline_details['tenant_id'] == tenant_id - if timeline_details['type'] == 'Local': + if timeline_details['kind'] == 'Local': log.info("timeline downloaded, checking its data") break attempts += 1 @@ -94,7 +94,7 @@ def test_remote_storage_backup_and_restore(zenith_env_builder: ZenithEnvBuilder, log.debug("still waiting") time.sleep(1) - pg = env.postgres.create_start() + pg = env.postgres.create_start('main') with closing(pg.connect()) as conn: with conn.cursor() as cur: cur.execute(f'SELECT secret FROM t1 WHERE id = {data_id};') diff --git a/test_runner/batch_others/test_restart_compute.py b/test_runner/batch_others/test_restart_compute.py index f7810be555..fd06561c00 100644 --- a/test_runner/batch_others/test_restart_compute.py +++ b/test_runner/batch_others/test_restart_compute.py @@ -13,10 +13,9 @@ def test_restart_compute(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptor zenith_env_builder.pageserver_auth_enabled = True if with_wal_acceptors: zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() - - env.zenith_cli.create_branch("test_restart_compute", "main") + env = zenith_env_builder.init_start() + env.zenith_cli.create_branch('test_restart_compute') pg = env.postgres.create_start('test_restart_compute') log.info("postgres is running on 'test_restart_compute' branch") diff --git a/test_runner/batch_others/test_tenant_relocation.py b/test_runner/batch_others/test_tenant_relocation.py index 5c6d78e730..7a9d478f16 100644 --- a/test_runner/batch_others/test_tenant_relocation.py +++ b/test_runner/batch_others/test_tenant_relocation.py @@ -122,21 +122,19 @@ def test_tenant_relocation(zenith_env_builder: ZenithEnvBuilder, zenith_env_builder.num_safekeepers = 1 zenith_env_builder.enable_local_fs_remote_storage() - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() # create folder for remote storage mock remote_storage_mock_path = env.repo_dir / 'local_fs_remote_storage' - tenant = env.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) + tenant = env.zenith_cli.create_tenant(UUID("74ee8b079a0e437eb0afea7d26a07209")) log.info("tenant to relocate %s", tenant) - env.zenith_cli.create_branch("test_tenant_relocation", "main", tenant_id=tenant) + env.zenith_cli.create_branch('test_tenant_relocation', tenant_id=tenant) - tenant_pg = env.postgres.create_start( - 
"test_tenant_relocation", - "main", # branch name, None means same as node name - tenant_id=tenant, - ) + tenant_pg = env.postgres.create_start(branch_name='main', + node_name='test_tenant_relocation', + tenant_id=tenant) # insert some data with closing(tenant_pg.connect()) as conn: diff --git a/test_runner/batch_others/test_tenants.py b/test_runner/batch_others/test_tenants.py index 232c724870..e883018628 100644 --- a/test_runner/batch_others/test_tenants.py +++ b/test_runner/batch_others/test_tenants.py @@ -10,27 +10,23 @@ def test_tenants_normal_work(zenith_env_builder: ZenithEnvBuilder, with_wal_acce if with_wal_acceptors: zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() """Tests tenants with and without wal acceptors""" - tenant_1 = env.create_tenant() - tenant_2 = env.create_tenant() + tenant_1 = env.zenith_cli.create_tenant() + tenant_2 = env.zenith_cli.create_tenant() - env.zenith_cli.create_branch(f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", - "main", - tenant_id=tenant_1) - env.zenith_cli.create_branch(f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", - "main", - tenant_id=tenant_2) + env.zenith_cli.create_timeline( + f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', tenant_id=tenant_1) + env.zenith_cli.create_timeline( + f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', tenant_id=tenant_2) pg_tenant1 = env.postgres.create_start( - f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", - None, # branch name, None means same as node name - tenant_1, + f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', + tenant_id=tenant_1, ) pg_tenant2 = env.postgres.create_start( - f"test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}", - None, # branch name, None means same as node name - tenant_2, + f'test_tenants_normal_work_with_wal_acceptors{with_wal_acceptors}', + tenant_id=tenant_2, ) for pg in [pg_tenant1, pg_tenant2]: diff --git a/test_runner/batch_others/test_timeline_size.py b/test_runner/batch_others/test_timeline_size.py index b48f830528..7d8ab551b0 100644 --- a/test_runner/batch_others/test_timeline_size.py +++ b/test_runner/batch_others/test_timeline_size.py @@ -10,10 +10,10 @@ import time def test_timeline_size(zenith_simple_env: ZenithEnv): env = zenith_simple_env # Branch at the point where only 100 rows were inserted - env.zenith_cli.create_branch("test_timeline_size", "empty") + new_timeline_id = env.zenith_cli.create_branch('test_timeline_size', 'empty') client = env.pageserver.http_client() - res = client.branch_detail(env.initial_tenant, "test_timeline_size") + res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id) assert res["current_logical_size"] == res["current_logical_size_non_incremental"] pgmain = env.postgres.create_start("test_timeline_size") @@ -31,11 +31,11 @@ def test_timeline_size(zenith_simple_env: ZenithEnv): FROM generate_series(1, 10) g """) - res = client.branch_detail(env.initial_tenant, "test_timeline_size") + res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id) assert res["current_logical_size"] == res["current_logical_size_non_incremental"] cur.execute("TRUNCATE foo") - res = client.branch_detail(env.initial_tenant, "test_timeline_size") + res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id) assert res["current_logical_size"] == res["current_logical_size_non_incremental"] 
@@ -67,18 +67,17 @@ def wait_for_pageserver_catchup(pgmain: Postgres, polling_interval=1, timeout=60 def test_timeline_size_quota(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() - env.zenith_cli.create_branch("test_timeline_size_quota", "main") + env = zenith_env_builder.init_start() + new_timeline_id = env.zenith_cli.create_branch('test_timeline_size_quota') client = env.pageserver.http_client() - res = client.branch_detail(env.initial_tenant, "test_timeline_size_quota") + res = client.timeline_detail(tenant_id=env.initial_tenant, timeline_id=new_timeline_id) assert res["current_logical_size"] == res["current_logical_size_non_incremental"] pgmain = env.postgres.create_start( "test_timeline_size_quota", # Set small limit for the test - config_lines=['zenith.max_cluster_size=30MB'], - ) + config_lines=['zenith.max_cluster_size=30MB']) log.info("postgres is running on 'test_timeline_size_quota' branch") with closing(pgmain.connect()) as conn: diff --git a/test_runner/batch_others/test_twophase.py b/test_runner/batch_others/test_twophase.py index d6a1cd01e8..4afdc7e0be 100644 --- a/test_runner/batch_others/test_twophase.py +++ b/test_runner/batch_others/test_twophase.py @@ -10,7 +10,6 @@ from fixtures.log_helper import log def test_twophase(zenith_simple_env: ZenithEnv): env = zenith_simple_env env.zenith_cli.create_branch("test_twophase", "empty") - pg = env.postgres.create_start('test_twophase', config_lines=['max_prepared_transactions=5']) log.info("postgres is running on 'test_twophase' branch") diff --git a/test_runner/batch_others/test_wal_acceptor.py b/test_runner/batch_others/test_wal_acceptor.py index 4d9e18bb58..bdc4c4f63c 100644 --- a/test_runner/batch_others/test_wal_acceptor.py +++ b/test_runner/batch_others/test_wal_acceptor.py @@ -13,7 +13,7 @@ from dataclasses import dataclass, field from multiprocessing import Process, Value from pathlib import Path from fixtures.zenith_fixtures import PgBin, Postgres, Safekeeper, ZenithEnv, ZenithEnvBuilder, PortDistributor, SafekeeperPort, zenith_binpath, PgProtocol -from fixtures.utils import lsn_to_hex, mkdir_if_needed +from fixtures.utils import lsn_to_hex, mkdir_if_needed, lsn_from_hex from fixtures.log_helper import log from typing import List, Optional, Any @@ -22,10 +22,9 @@ from typing import List, Optional, Any # succeed and data is written def test_normal_work(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() - - env.zenith_cli.create_branch("test_wal_acceptors_normal_work", "main") + env = zenith_env_builder.init_start() + env.zenith_cli.create_branch('test_wal_acceptors_normal_work') pg = env.postgres.create_start('test_wal_acceptors_normal_work') with closing(pg.connect()) as conn: @@ -39,9 +38,9 @@ def test_normal_work(zenith_env_builder: ZenithEnvBuilder): @dataclass -class BranchMetrics: - name: str - latest_valid_lsn: int +class TimelineMetrics: + timeline_id: str + last_record_lsn: int # One entry per each Safekeeper, order is the same flush_lsns: List[int] = field(default_factory=list) commit_lsns: List[int] = field(default_factory=list) @@ -51,27 +50,36 @@ class BranchMetrics: # against different timelines. 
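The metrics in the test below are compared as integers, so the hex LSNs the pageserver reports (the '0/16B5A50' style) go through lsn_from_hex first. The fixtures.utils helpers imported at the top of this file presumably follow the standard PostgreSQL encoding, the high and low 32-bit halves separated by a slash; a sketch under that assumption:

    def lsn_from_hex(lsn_hex: str) -> int:
        """Parse an LSN like '0/16B5A50' into a comparable integer."""
        hi, lo = lsn_hex.split('/')
        return (int(hi, 16) << 32) + int(lo, 16)

    def lsn_to_hex(lsn: int) -> str:
        """Render an integer LSN back into the 'hi/lo' form."""
        return f'{lsn >> 32:X}/{lsn & 0xFFFFFFFF:X}'

    assert lsn_from_hex('0/16B5A50') == 0x16B5A50
    assert lsn_to_hex(lsn_from_hex('1/0')) == '1/0'
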
def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() n_timelines = 3 - branches = ["test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines)] + branch_names = [ + "test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines) + ] + # pageserver, safekeeper operate timelines via their ids (can be represented in hex as 'ad50847381e248feaac9876cc71ae418') + # that's not really human readable, so the branch names are introduced in Zenith CLI. + # Zenith CLI stores its branch <-> timeline mapping in its internals, + # but we need this to collect metrics from other servers, related to the timeline. + branch_names_to_timeline_ids = {} # start postgres on each timeline pgs = [] - for branch in branches: - env.zenith_cli.create_branch(branch, "main") - pgs.append(env.postgres.create_start(branch)) + for branch_name in branch_names: + new_timeline_id = env.zenith_cli.create_branch(branch_name) + pgs.append(env.postgres.create_start(branch_name)) + branch_names_to_timeline_ids[branch_name] = new_timeline_id tenant_id = env.initial_tenant - def collect_metrics(message: str) -> List[BranchMetrics]: + def collect_metrics(message: str) -> List[TimelineMetrics]: with env.pageserver.http_client() as pageserver_http: - branch_details = [ - pageserver_http.branch_detail(tenant_id=tenant_id, name=branch) - for branch in branches + timeline_details = [ + pageserver_http.timeline_detail( + tenant_id=tenant_id, timeline_id=branch_names_to_timeline_ids[branch_name]) + for branch_name in branch_names ] - # All changes visible to pageserver (latest_valid_lsn) should be + # All changes visible to pageserver (last_record_lsn) should be # confirmed by safekeepers first. As we cannot atomically get # state of both pageserver and safekeepers, we should start with # pageserver. Looking at outdated data from pageserver is ok. @@ -80,14 +88,14 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): # safekeepers' state, it will look contradictory. sk_metrics = [sk.http_client().get_metrics() for sk in env.safekeepers] - branch_metrics = [] + timeline_metrics = [] with env.pageserver.http_client() as pageserver_http: - for branch_detail in branch_details: - timeline_id: str = branch_detail["timeline_id"] + for timeline_detail in timeline_details: + timeline_id: str = timeline_detail["timeline_id"] - m = BranchMetrics( - name=branch_detail["name"], - latest_valid_lsn=branch_detail["latest_valid_lsn"], + m = TimelineMetrics( + timeline_id=timeline_id, + last_record_lsn=lsn_from_hex(timeline_detail["last_record_lsn"]), ) for sk_m in sk_metrics: m.flush_lsns.append(sk_m.flush_lsn_inexact[(tenant_id.hex, timeline_id)]) @@ -99,13 +107,13 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): # We only call collect_metrics() after a transaction is confirmed by # the compute node, which only happens after a consensus of safekeepers # has confirmed the transaction. We assume majority consensus here. 
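The rewritten assertions below encode that majority rule: strictly more than half of the safekeepers must have flushed (and committed) at least up to whatever LSN the pageserver exposes. As a standalone predicate (the function name is illustrative; the arithmetic is exactly the test's):

    def confirmed_by_majority(last_record_lsn: int, safekeeper_lsns: list) -> bool:
        # Count safekeepers at or beyond the pageserver's LSN; doubling that
        # count and comparing against the total implements 'more than half'.
        # With 3 safekeepers this requires acknowledgement from at least 2.
        acks = sum(last_record_lsn <= lsn for lsn in safekeeper_lsns)
        return 2 * acks > len(safekeeper_lsns)
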
- assert (2 * sum(m.latest_valid_lsn <= lsn + assert (2 * sum(m.last_record_lsn <= lsn for lsn in m.flush_lsns) > zenith_env_builder.num_safekeepers) - assert (2 * sum(m.latest_valid_lsn <= lsn + assert (2 * sum(m.last_record_lsn <= lsn for lsn in m.commit_lsns) > zenith_env_builder.num_safekeepers) - branch_metrics.append(m) - log.info(f"{message}: {branch_metrics}") - return branch_metrics + timeline_metrics.append(m) + log.info(f"{message}: {timeline_metrics}") + return timeline_metrics # TODO: https://github.com/zenithdb/zenith/issues/809 # collect_metrics("before CREATE TABLE") @@ -117,7 +125,7 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): pg.safe_psql("CREATE TABLE t(key int primary key, value text)") init_m = collect_metrics("after CREATE TABLE") - # Populate data for 2/3 branches + # Populate data for 2/3 timelines class MetricsChecker(threading.Thread): def __init__(self) -> None: super().__init__(daemon=True) @@ -155,15 +163,15 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder): collect_metrics("after INSERT INTO") - # Check data for 2/3 branches + # Check data for 2/3 timelines for pg in pgs[:-1]: res = pg.safe_psql("SELECT sum(key) FROM t") assert res[0] == (5000050000, ) final_m = collect_metrics("after SELECT") - # Assume that LSNs (a) behave similarly in all branches; and (b) INSERT INTO alters LSN significantly. + # Assume that LSNs (a) behave similarly in all timelines; and (b) INSERT INTO alters LSN significantly. # Also assume that safekeepers will not be significantly out of sync in this test. - middle_lsn = (init_m[0].latest_valid_lsn + final_m[0].latest_valid_lsn) // 2 + middle_lsn = (init_m[0].last_record_lsn + final_m[0].last_record_lsn) // 2 assert max(init_m[0].flush_lsns) < middle_lsn < min(final_m[0].flush_lsns) assert max(init_m[0].commit_lsns) < middle_lsn < min(final_m[0].commit_lsns) assert max(init_m[1].flush_lsns) < middle_lsn < min(final_m[1].flush_lsns) @@ -181,9 +189,9 @@ def test_restarts(zenith_env_builder: ZenithEnvBuilder): n_acceptors = 3 zenith_env_builder.num_safekeepers = n_acceptors - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_wal_acceptors_restarts", "main") + env.zenith_cli.create_branch('test_wal_acceptors_restarts') pg = env.postgres.create_start('test_wal_acceptors_restarts') # we rely upon autocommit after each statement @@ -218,9 +226,9 @@ def delayed_wal_acceptor_start(wa): # When majority of acceptors is offline, commits are expected to be frozen def test_unavailability(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 2 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_wal_acceptors_unavailability", "main") + env.zenith_cli.create_branch('test_wal_acceptors_unavailability') pg = env.postgres.create_start('test_wal_acceptors_unavailability') # we rely upon autocommit after each statement @@ -289,9 +297,9 @@ def stop_value(): def test_race_conditions(zenith_env_builder: ZenithEnvBuilder, stop_value): zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_wal_acceptors_race_conditions", "main") + env.zenith_cli.create_branch('test_wal_acceptors_race_conditions') pg = env.postgres.create_start('test_wal_acceptors_race_conditions') # we rely upon autocommit after each statement @@ -404,7 +412,7 @@ def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, # 
We don't really need the full environment for this test, just the # safekeepers would be enough. zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() timeline_id = uuid.uuid4() tenant_id = uuid.uuid4() @@ -454,9 +462,9 @@ def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, def test_timeline_status(zenith_env_builder: ZenithEnvBuilder): zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_timeline_status", "main") + env.zenith_cli.create_branch('test_timeline_status') pg = env.postgres.create_start('test_timeline_status') wa = env.safekeepers[0] @@ -521,12 +529,7 @@ class SafekeeperEnv: http=self.port_distributor.get_port(), ) - if self.num_safekeepers == 1: - name = "single" - else: - name = f"sk{i}" - - safekeeper_dir = os.path.join(self.repo_dir, name) + safekeeper_dir = os.path.join(self.repo_dir, f"sk{i}") mkdir_if_needed(safekeeper_dir) args = [ @@ -537,6 +540,8 @@ class SafekeeperEnv: f"127.0.0.1:{port.http}", "-D", safekeeper_dir, + "--id", + str(i), "--daemonize" ] @@ -604,9 +609,8 @@ def test_safekeeper_without_pageserver(test_output_dir: str, def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): - def safekeepers_guc(env: ZenithEnv, sk_names: List[str]) -> str: - return ','.join( - [f'localhost:{sk.port.pg}' for sk in env.safekeepers if sk.name in sk_names]) + def safekeepers_guc(env: ZenithEnv, sk_names: List[int]) -> str: + return ','.join([f'localhost:{sk.port.pg}' for sk in env.safekeepers if sk.id in sk_names]) def execute_payload(pg: Postgres): with closing(pg.connect()) as conn: @@ -628,17 +632,17 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): http_cli = sk.http_client() try: status = http_cli.timeline_status(tenant_id, timeline_id) - log.info(f"Safekeeper {sk.name} status: {status}") + log.info(f"Safekeeper {sk.id} status: {status}") except Exception as e: - log.info(f"Safekeeper {sk.name} status error: {e}") + log.info(f"Safekeeper {sk.id} status error: {e}") zenith_env_builder.num_safekeepers = 4 - env = zenith_env_builder.init() - env.zenith_cli.create_branch("test_replace_safekeeper", "main") + env = zenith_env_builder.init_start() + env.zenith_cli.create_branch('test_replace_safekeeper') log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() - active_safekeepers = ['sk1', 'sk2', 'sk3'] + active_safekeepers = [1, 2, 3] pg = env.postgres.create('test_replace_safekeeper') pg.adjust_for_wal_acceptors(safekeepers_guc(env, active_safekeepers)) pg.start() @@ -678,7 +682,7 @@ def test_replace_safekeeper(zenith_env_builder: ZenithEnvBuilder): log.info("Recreate postgres to replace failed sk1 with new sk4") pg.stop_and_destroy().create('test_replace_safekeeper') - active_safekeepers = ['sk2', 'sk3', 'sk4'] + active_safekeepers = [2, 3, 4] env.safekeepers[3].start() pg.adjust_for_wal_acceptors(safekeepers_guc(env, active_safekeepers)) pg.start() diff --git a/test_runner/batch_others/test_wal_acceptor_async.py b/test_runner/batch_others/test_wal_acceptor_async.py index 1d2a186eb7..31ace7eab3 100644 --- a/test_runner/batch_others/test_wal_acceptor_async.py +++ b/test_runner/batch_others/test_wal_acceptor_async.py @@ -200,9 +200,9 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[Safekeeper], n_w # restart acceptors one by one, while executing and validating bank transactions def test_restarts_under_load(zenith_env_builder: ZenithEnvBuilder): 
zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() - env.zenith_cli.create_branch("test_wal_acceptors_restarts_under_load", "main") + env.zenith_cli.create_branch('test_wal_acceptors_restarts_under_load') pg = env.postgres.create_start('test_wal_acceptors_restarts_under_load') asyncio.run(run_restarts_under_load(pg, env.safekeepers)) diff --git a/test_runner/batch_others/test_zenith_cli.py b/test_runner/batch_others/test_zenith_cli.py index ce051dfd6e..4a62a1430a 100644 --- a/test_runner/batch_others/test_zenith_cli.py +++ b/test_runner/batch_others/test_zenith_cli.py @@ -7,52 +7,46 @@ from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserv from typing import cast -def helper_compare_branch_list(pageserver_http_client: ZenithPageserverHttpClient, - env: ZenithEnv, - initial_tenant: uuid.UUID): +def helper_compare_timeline_list(pageserver_http_client: ZenithPageserverHttpClient, + env: ZenithEnv, + initial_tenant: uuid.UUID): """ - Compare branches list returned by CLI and directly via API. - Filters out branches created by other tests. + Compare timelines list returned by CLI and directly via API. + Filters out timelines created by other tests. """ - branches = pageserver_http_client.branch_list(initial_tenant) - branches_api = sorted(map(lambda b: cast(str, b['name']), branches)) - branches_api = [b for b in branches_api if b.startswith('test_cli_') or b in ('empty', 'main')] - res = env.zenith_cli.list_branches() - branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n"))) - branches_cli = [b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main')] + timelines_api = sorted( + map(lambda t: cast(str, t['timeline_id']), + pageserver_http_client.timeline_list(initial_tenant))) - res = env.zenith_cli.list_branches(tenant_id=initial_tenant) - branches_cli_with_tenant_arg = sorted( - map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n"))) - branches_cli_with_tenant_arg = [ - b for b in branches_cli if b.startswith('test_cli_') or b in ('empty', 'main') - ] + timelines_cli = env.zenith_cli.list_timelines() + assert timelines_cli == env.zenith_cli.list_timelines(initial_tenant) - assert branches_api == branches_cli == branches_cli_with_tenant_arg + cli_timeline_ids = sorted([timeline_id for (_, timeline_id) in timelines_cli]) + assert timelines_api == cli_timeline_ids -def test_cli_branch_list(zenith_simple_env: ZenithEnv): +def test_cli_timeline_list(zenith_simple_env: ZenithEnv): env = zenith_simple_env pageserver_http_client = env.pageserver.http_client() # Initial sanity check - helper_compare_branch_list(pageserver_http_client, env, env.initial_tenant) - env.zenith_cli.create_branch("test_cli_branch_list_main", "empty") - helper_compare_branch_list(pageserver_http_client, env, env.initial_tenant) + helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) + + # Create a branch for us + main_timeline_id = env.zenith_cli.create_branch('test_cli_branch_list_main') + helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Create a nested branch - res = env.zenith_cli.create_branch("test_cli_branch_list_nested", "test_cli_branch_list_main") - assert res.stderr == '' - helper_compare_branch_list(pageserver_http_client, env, env.initial_tenant) + nested_timeline_id = env.zenith_cli.create_branch('test_cli_branch_list_nested', + 'test_cli_branch_list_main') + 
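create_branch can hand back the new timeline id because the fixture scrapes it out of the CLI's human-oriented output, and list_timelines parses whole listings the same way; the regexes live in the zenith_fixtures.py hunks below. A condensed version of that parsing step (regex and sample line as in the fixture):

    import re

    # Matches listing lines such as:
    #  (L) main [b49f7954224a0ad25cc0013ea107b54b]
    #  (L) ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540]
    TIMELINE_LINE = re.compile(r"\s(?P<branch_name>[^\s]+)\s\[(?P<timeline_id>[^\]]+)\]")

    def parse_timeline_list(stdout: str) -> list:
        """Return sorted (branch_name, timeline_id) tuples from `zenith timeline list`."""
        return sorted(TIMELINE_LINE.findall(stdout))

    assert parse_timeline_list("(L) main [b49f7954224a0ad25cc0013ea107b54b]") == \
        [('main', 'b49f7954224a0ad25cc0013ea107b54b')]
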
helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Check that all new branches are visible via CLI - res = env.zenith_cli.list_branches() - assert res.stderr == '' - branches_cli = sorted(map(lambda b: b.split(':')[-1].strip(), res.stdout.strip().split("\n"))) + timelines_cli = [timeline_id for (_, timeline_id) in env.zenith_cli.list_timelines()] - assert 'test_cli_branch_list_main' in branches_cli - assert 'test_cli_branch_list_nested' in branches_cli + assert main_timeline_id.hex in timelines_cli + assert nested_timeline_id.hex in timelines_cli def helper_compare_tenant_list(pageserver_http_client: ZenithPageserverHttpClient, env: ZenithEnv): @@ -60,7 +54,6 @@ def helper_compare_tenant_list(pageserver_http_client: ZenithPageserverHttpClien tenants_api = sorted(map(lambda t: cast(str, t['id']), tenants)) res = env.zenith_cli.list_tenants() - assert res.stderr == '' tenants_cli = sorted(map(lambda t: t.split()[0], res.stdout.splitlines())) assert tenants_api == tenants_cli @@ -73,15 +66,13 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv): helper_compare_tenant_list(pageserver_http_client, env) # Create new tenant - tenant1 = uuid.uuid4() - env.zenith_cli.create_tenant(tenant1) + tenant1 = env.zenith_cli.create_tenant() # check tenant1 appeared helper_compare_tenant_list(pageserver_http_client, env) # Create new tenant - tenant2 = uuid.uuid4() - env.zenith_cli.create_tenant(tenant2) + tenant2 = env.zenith_cli.create_tenant() # check tenant2 appeared helper_compare_tenant_list(pageserver_http_client, env) @@ -97,7 +88,7 @@ def test_cli_tenant_list(zenith_simple_env: ZenithEnv): def test_cli_ipv4_listeners(zenith_env_builder: ZenithEnvBuilder): # Start with single sk zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() # Connect to sk port on v4 loopback res = requests.get(f'http://127.0.0.1:{env.safekeepers[0].port.http}/v1/status') @@ -114,7 +105,7 @@ def test_cli_ipv4_listeners(zenith_env_builder: ZenithEnvBuilder): def test_cli_start_stop(zenith_env_builder: ZenithEnvBuilder): # Start with single sk zenith_env_builder.num_safekeepers = 1 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() # Stop default ps/sk env.zenith_cli.pageserver_stop() diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 570c787184..750b02c894 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -64,9 +64,8 @@ class ZenithCompare(PgCompare): self._pg_bin = pg_bin # We only use one branch and one timeline - self.branch = branch_name - self.env.zenith_cli.create_branch(self.branch, "empty") - self._pg = self.env.postgres.create_start(self.branch) + self.env.zenith_cli.create_branch(branch_name, 'empty') + self._pg = self.env.postgres.create_start(branch_name) self.timeline = self.pg.safe_psql("SHOW zenith.zenith_timeline")[0][0] # Long-lived cursor, useful for flushing diff --git a/test_runner/fixtures/zenith_fixtures.py b/test_runner/fixtures/zenith_fixtures.py index b4b3de1db3..ec570a7dac 100644 --- a/test_runner/fixtures/zenith_fixtures.py +++ b/test_runner/fixtures/zenith_fixtures.py @@ -1,6 +1,6 @@ from __future__ import annotations -from dataclasses import dataclass, field +from dataclasses import field import textwrap from cached_property import cached_property import asyncpg @@ -27,9 +27,8 @@ from dataclasses import dataclass # Type-related stuff from psycopg2.extensions import connection as 
PgConnection -from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast, Union, Tuple +from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, TypeVar, cast, Union, Tuple from typing_extensions import Literal -import pytest import requests import backoff # type: ignore @@ -58,6 +57,7 @@ Fn = TypeVar('Fn', bound=Callable[..., Any]) DEFAULT_OUTPUT_DIR = 'test_output' DEFAULT_POSTGRES_DIR = 'tmp_install' +DEFAULT_BRANCH_NAME = 'main' BASE_PORT = 15000 WORKER_PORT_NUM = 100 @@ -219,7 +219,7 @@ def can_bind(host: str, port: int) -> bool: class PortDistributor: - def __init__(self, base_port: int, port_number: int) -> None: + def __init__(self, base_port: int, port_number: int): self.iterator = iter(range(base_port, base_port + port_number)) def get_port(self) -> int: @@ -242,15 +242,20 @@ class PgProtocol: host: str, port: int, username: Optional[str] = None, - password: Optional[str] = None): + password: Optional[str] = None, + dbname: Optional[str] = None, + schema: Optional[str] = None): self.host = host self.port = port self.username = username self.password = password + self.dbname = dbname + self.schema = schema def connstr(self, *, - dbname: str = 'postgres', + dbname: Optional[str] = None, + schema: Optional[str] = None, username: Optional[str] = None, password: Optional[str] = None) -> str: """ @@ -259,6 +264,8 @@ class PgProtocol: username = username or self.username password = password or self.password + dbname = dbname or self.dbname or "postgres" + schema = schema or self.schema res = f'host={self.host} port={self.port} dbname={dbname}' if username: @@ -267,13 +274,17 @@ class PgProtocol: if password: res = f'{res} password={password}' + if schema: + res = f"{res} options='-c search_path={schema}'" + return res # autocommit=True here by default because that's what we need most of the time def connect(self, *, autocommit=True, - dbname: str = 'postgres', + dbname: Optional[str] = None, + schema: Optional[str] = None, username: Optional[str] = None, password: Optional[str] = None) -> PgConnection: """ @@ -282,11 +293,13 @@ class PgProtocol: This method passes all extra params to connstr. """ - conn = psycopg2.connect(self.connstr( - dbname=dbname, - username=username, - password=password, - )) + conn = psycopg2.connect( + self.connstr( + dbname=dbname, + schema=schema, + username=username, + password=password, + )) # WARNING: this setting affects *all* tests! conn.autocommit = autocommit return conn @@ -411,7 +424,8 @@ class ZenithEnvBuilder: pageserver_config_override: Optional[str] = None, num_safekeepers: int = 0, pageserver_auth_enabled: bool = False, - rust_log_override: Optional[str] = None): + rust_log_override: Optional[str] = None, + default_branch_name=DEFAULT_BRANCH_NAME): self.repo_dir = repo_dir self.rust_log_override = rust_log_override self.port_distributor = port_distributor @@ -419,6 +433,7 @@ class ZenithEnvBuilder: self.pageserver_config_override = pageserver_config_override self.num_safekeepers = num_safekeepers self.pageserver_auth_enabled = pageserver_auth_enabled + self.default_branch_name = default_branch_name self.env: Optional[ZenithEnv] = None self.s3_mock_server: Optional[MockS3Server] = None @@ -434,6 +449,14 @@ class ZenithEnvBuilder: self.env = ZenithEnv(self) return self.env + def start(self): + self.env.start() + + def init_start(self) -> ZenithEnv: + env = self.init() + self.start() + return env + """ Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path. 
Errors if the pageserver already has some remote storage configuration, unless `force_enable` is set to `True`.
@@ -515,7 +538,7 @@ class ZenithEnv:
 
     initial_tenant - tenant ID of the initial tenant created in the repository
 
-    zenith_cli() - zenith_cli() can be used to run the 'zenith' CLI tool
+    zenith_cli - can be used to run the 'zenith' CLI tool
 
     create_tenant() - initializes a new tenant in the page server, returns the tenant id
 
@@ -526,9 +549,7 @@
         self.port_distributor = config.port_distributor
         self.s3_mock_server = config.s3_mock_server
         self.zenith_cli = ZenithCli(env=self)
-
         self.postgres = PostgresFactory(self)
-
         self.safekeepers: List[Safekeeper] = []
 
         # generate initial tenant ID here instead of letting 'zenith init' generate it,
@@ -537,7 +558,7 @@
 
         # Create a config file corresponding to the options
         toml = textwrap.dedent(f"""
-            default_tenantid = '{self.initial_tenant.hex}'
+            default_tenant_id = '{self.initial_tenant.hex}'
         """)
 
         # Create config for pageserver
@@ -549,6 +570,7 @@
 
         toml += textwrap.dedent(f"""
             [pageserver]
+            id=1
             listen_pg_addr = 'localhost:{pageserver_port.pg}'
             listen_http_addr = 'localhost:{pageserver_port.http}'
             auth_type = '{pageserver_auth_type}'
@@ -566,25 +588,21 @@
                 pg=self.port_distributor.get_port(),
                 http=self.port_distributor.get_port(),
             )
-
-            if config.num_safekeepers == 1:
-                name = "single"
-            else:
-                name = f"sk{i}"
-            toml += f"""
-[[safekeepers]]
-name = '{name}'
-pg_port = {port.pg}
-http_port = {port.http}
-sync = false # Disable fsyncs to make the tests go faster
-            """
-            safekeeper = Safekeeper(env=self, name=name, port=port)
+            id = i  # assign ids sequentially
+            toml += textwrap.dedent(f"""
+                [[safekeepers]]
+                id = {id}
+                pg_port = {port.pg}
+                http_port = {port.http}
+                sync = false # Disable fsyncs to make the tests go faster
+            """)
+            safekeeper = Safekeeper(env=self, id=id, port=port)
             self.safekeepers.append(safekeeper)
 
         log.info(f"Config: {toml}")
-
         self.zenith_cli.init(toml)
 
+    def start(self):
         # Start up the page server and all the safekeepers
         self.pageserver.start()
 
@@ -595,12 +613,6 @@ class ZenithEnv:
         """ Get list of safekeeper endpoints suitable for wal_acceptors GUC """
         return ','.join([f'localhost:{wa.port.pg}' for wa in self.safekeepers])
 
-    def create_tenant(self, tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID:
-        if tenant_id is None:
-            tenant_id = uuid.uuid4()
-        self.zenith_cli.create_tenant(tenant_id)
-        return tenant_id
-
     @cached_property
     def auth_keys(self) -> AuthKeys:
         pub = (Path(self.repo_dir) / 'auth_public_key.pem').read_bytes()
@@ -624,13 +636,11 @@ def _shared_simple_env(request: Any, port_distributor) -> Iterator[ZenithEnv]:
         shutil.rmtree(repo_dir, ignore_errors=True)
 
     with ZenithEnvBuilder(Path(repo_dir), port_distributor) as builder:
-
-        env = builder.init()
+        env = builder.init_start()
 
         # For convenience in tests, create a branch from the freshly-initialized cluster.
-        env.zenith_cli.create_branch("empty", "main")
+        env.zenith_cli.create_branch('empty', ancestor_branch_name=DEFAULT_BRANCH_NAME)
 
-        # Return the builder to the caller
         yield env
 
@@ -659,7 +669,7 @@ def zenith_env_builder(test_output_dir, port_distributor) -> Iterator[ZenithEnvB
 
     To use, define 'zenith_env_builder' fixture in your test to get access to
     the builder object. Set properties on it to describe the environment.
    Finally, initialize and start up the environment by calling
-    zenith_env_builder.init().
+    zenith_env_builder.init_start().
 
    
After the initialization, you can launch compute nodes by calling the functions in the 'env.postgres' factory object, stop/start the @@ -679,7 +689,7 @@ class ZenithPageserverApiException(Exception): class ZenithPageserverHttpClient(requests.Session): - def __init__(self, port: int, auth_token: Optional[str] = None) -> None: + def __init__(self, port: int, auth_token: Optional[str] = None): super().__init__() self.port = port self.auth_token = auth_token @@ -702,38 +712,36 @@ class ZenithPageserverHttpClient(requests.Session): def timeline_attach(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID): res = self.post( - f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}/attach", ) + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}/attach", + ) self.verbose_error(res) def timeline_detach(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID): res = self.post( - f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}/detach", ) - self.verbose_error(res) - - def branch_list(self, tenant_id: uuid.UUID) -> List[Dict[Any, Any]]: - res = self.get(f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}") - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, list) - return res_json - - def branch_create(self, tenant_id: uuid.UUID, name: str, start_point: str) -> Dict[Any, Any]: - res = self.post(f"http://localhost:{self.port}/v1/branch", - json={ - 'tenant_id': tenant_id.hex, - 'name': name, - 'start_point': start_point, - }) - self.verbose_error(res) - res_json = res.json() - assert isinstance(res_json, dict) - return res_json - - def branch_detail(self, tenant_id: uuid.UUID, name: str) -> Dict[Any, Any]: - res = self.get( - f"http://localhost:{self.port}/v1/branch/{tenant_id.hex}/{name}?include-non-incremental-logical-size=1", + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}/detach", ) self.verbose_error(res) + + def timeline_create( + self, + tenant_id: uuid.UUID, + new_timeline_id: Optional[uuid.UUID] = None, + ancestor_timeline_id: Optional[uuid.UUID] = None, + ancestor_start_lsn: Optional[str] = None, + ) -> Dict[Any, Any]: + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline", + json={ + 'new_timeline_id': + new_timeline_id.hex if new_timeline_id else None, + 'ancestor_start_lsn': + ancestor_start_lsn, + 'ancestor_timeline_id': + ancestor_timeline_id.hex if ancestor_timeline_id else None, + }) + self.verbose_error(res) + if res.status_code == 409: + raise Exception(f'could not create timeline: already exists for id {new_timeline_id}') + res_json = res.json() assert isinstance(res_json, dict) return res_json @@ -745,18 +753,22 @@ class ZenithPageserverHttpClient(requests.Session): assert isinstance(res_json, list) return res_json - def tenant_create(self, tenant_id: uuid.UUID): + def tenant_create(self, new_tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: res = self.post( f"http://localhost:{self.port}/v1/tenant", json={ - 'tenant_id': tenant_id.hex, + 'new_tenant_id': new_tenant_id.hex if new_tenant_id else None, }, ) self.verbose_error(res) - return res.json() + if res.status_code == 409: + raise Exception(f'could not create tenant: already exists for id {new_tenant_id}') + new_tenant_id = res.json() + assert isinstance(new_tenant_id, str) + return uuid.UUID(new_tenant_id) - def timeline_list(self, tenant_id: uuid.UUID) -> List[str]: - res = self.get(f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}") + def 
timeline_list(self, tenant_id: uuid.UUID) -> List[Dict[Any, Any]]: + res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline") self.verbose_error(res) res_json = res.json() assert isinstance(res_json, list) @@ -764,7 +776,8 @@ class ZenithPageserverHttpClient(requests.Session): def timeline_detail(self, tenant_id: uuid.UUID, timeline_id: uuid.UUID) -> Dict[Any, Any]: res = self.get( - f"http://localhost:{self.port}/v1/timeline/{tenant_id.hex}/{timeline_id.hex}") + f"http://localhost:{self.port}/v1/tenant/{tenant_id.hex}/timeline/{timeline_id.hex}?include-non-incremental-logical-size=1" + ) self.verbose_error(res) res_json = res.json() assert isinstance(res_json, dict) @@ -798,57 +811,127 @@ class S3Storage: RemoteStorage = Union[LocalFsStorage, S3Storage] +CREATE_TIMELINE_ID_EXTRACTOR = re.compile(r"^Created timeline '(?P<timeline_id>[^']+)'", + re.MULTILINE) +TIMELINE_DATA_EXTRACTOR = re.compile(r"\s(?P<branch_name>[^\s]+)\s\[(?P<timeline_id>[^\]]+)\]", + re.MULTILINE) + class ZenithCli: """ A typed wrapper around the `zenith` CLI tool. Supports main commands via typed methods and a way to run arbitrary command directly via CLI. """ - def __init__(self, env: ZenithEnv) -> None: + def __init__(self, env: ZenithEnv): self.env = env pass def create_tenant(self, tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: + """ + Creates a new tenant, returns its id. + """ if tenant_id is None: tenant_id = uuid.uuid4() - self.raw_cli(['tenant', 'create', tenant_id.hex]) + res = self.raw_cli(['tenant', 'create', '--tenant-id', tenant_id.hex]) + res.check_returncode() return tenant_id def list_tenants(self) -> 'subprocess.CompletedProcess[str]': - return self.raw_cli(['tenant', 'list']) + res = self.raw_cli(['tenant', 'list']) + res.check_returncode() + return res + + def create_timeline(self, + new_branch_name: str, + tenant_id: Optional[uuid.UUID] = None) -> uuid.UUID: + cmd = [ + 'timeline', + 'create', + '--branch-name', + new_branch_name, + '--tenant-id', + (tenant_id or self.env.initial_tenant).hex, + ] + + res = self.raw_cli(cmd) + res.check_returncode() + + matches = CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout) + + created_timeline_id = None + if matches is not None: + created_timeline_id = matches.group('timeline_id') + + return uuid.UUID(created_timeline_id) def create_branch(self, - branch_name: str, - starting_point: str, - tenant_id: Optional[uuid.UUID] = None) -> 'subprocess.CompletedProcess[str]': - args = ['branch'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) - args.extend([branch_name, starting_point]) + new_branch_name: str = DEFAULT_BRANCH_NAME, + ancestor_branch_name: Optional[str] = None, + tenant_id: Optional[uuid.UUID] = None, + ancestor_start_lsn: Optional[str] = None) -> uuid.UUID: + cmd = [ + 'timeline', + 'branch', + '--branch-name', + new_branch_name, + '--tenant-id', + (tenant_id or self.env.initial_tenant).hex, + ] + if ancestor_branch_name is not None: + cmd.extend(['--ancestor-branch-name', ancestor_branch_name]) + if ancestor_start_lsn is not None: + cmd.extend(['--ancestor-start-lsn', ancestor_start_lsn]) - return self.raw_cli(args) + res = self.raw_cli(cmd) + res.check_returncode() - def list_branches(self, - tenant_id: Optional[uuid.UUID] = None) -> 'subprocess.CompletedProcess[str]': - args = ['branch'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) - return self.raw_cli(args) + matches = 
CREATE_TIMELINE_ID_EXTRACTOR.search(res.stdout) - def init(self, config_toml: str) -> 'subprocess.CompletedProcess[str]': + created_timeline_id = None + if matches is not None: + created_timeline_id = matches.group('timeline_id') + + if created_timeline_id is None: + raise Exception('could not find timeline id after `zenith timeline create` invocation') + else: + return uuid.UUID(created_timeline_id) + + def list_timelines(self, tenant_id: Optional[uuid.UUID] = None) -> List[Tuple[str, str]]: + """ + Returns a list of (branch_name, timeline_id) tuples out of parsed `zenith timeline list` CLI output. + """ + + # (L) main [b49f7954224a0ad25cc0013ea107b54b] + # (L) ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540] + res = self.raw_cli( + ['timeline', 'list', '--tenant-id', (tenant_id or self.env.initial_tenant).hex]) + timelines_cli = sorted( + map(lambda branch_and_id: (branch_and_id[0], branch_and_id[1]), + TIMELINE_DATA_EXTRACTOR.findall(res.stdout))) + return timelines_cli + + def init(self, + config_toml: str, + initial_timeline_id: Optional[uuid.UUID] = None) -> 'subprocess.CompletedProcess[str]': with tempfile.NamedTemporaryFile(mode='w+') as tmp: tmp.write(config_toml) tmp.flush() cmd = ['init', f'--config={tmp.name}'] + if initial_timeline_id: + cmd.extend(['--timeline-id', initial_timeline_id.hex]) append_pageserver_param_overrides(cmd, self.env.pageserver.remote_storage, self.env.pageserver.config_override) - return self.raw_cli(cmd) + res = self.raw_cli(cmd) + res.check_returncode() + return res - def pageserver_start(self) -> 'subprocess.CompletedProcess[str]': - start_args = ['pageserver', 'start'] + def pageserver_start(self, overrides=()) -> 'subprocess.CompletedProcess[str]': + start_args = ['pageserver', 'start', *overrides] append_pageserver_param_overrides(start_args, self.env.pageserver.remote_storage, self.env.pageserver.config_override) @@ -862,53 +945,69 @@ class ZenithCli: log.info(f"Stopping pageserver with {cmd}") return self.raw_cli(cmd) - def safekeeper_start(self, name: str) -> 'subprocess.CompletedProcess[str]': - return self.raw_cli(['safekeeper', 'start', name]) + def safekeeper_start(self, id: int) -> 'subprocess.CompletedProcess[str]': + return self.raw_cli(['safekeeper', 'start', str(id)]) def safekeeper_stop(self, - name: Optional[str] = None, + id: Optional[int] = None, immediate=False) -> 'subprocess.CompletedProcess[str]': args = ['safekeeper', 'stop'] + if id is not None: + args.extend(str(id)) if immediate: args.extend(['-m', 'immediate']) - if name is not None: - args.append(name) return self.raw_cli(args) def pg_create( self, - node_name: str, + branch_name: str, + node_name: Optional[str] = None, tenant_id: Optional[uuid.UUID] = None, - timeline_spec: Optional[str] = None, + lsn: Optional[str] = None, port: Optional[int] = None, ) -> 'subprocess.CompletedProcess[str]': - args = ['pg', 'create'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) + args = [ + 'pg', + 'create', + '--tenant-id', + (tenant_id or self.env.initial_tenant).hex, + '--branch-name', + branch_name, + ] + if lsn is not None: + args.extend(['--lsn', lsn]) if port is not None: - args.append(f'--port={port}') - args.append(node_name) - if timeline_spec is not None: - args.append(timeline_spec) - return self.raw_cli(args) + args.extend(['--port', str(port)]) + if node_name is not None: + args.append(node_name) + + res = self.raw_cli(args) + res.check_returncode() + return res def pg_start( self, node_name: str, tenant_id: 
Optional[uuid.UUID] = None, - timeline_spec: Optional[str] = None, + lsn: Optional[str] = None, port: Optional[int] = None, ) -> 'subprocess.CompletedProcess[str]': - args = ['pg', 'start'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) + args = [ + 'pg', + 'start', + '--tenant-id', + (tenant_id or self.env.initial_tenant).hex, + ] + if lsn is not None: + args.append(f'--lsn={lsn}') if port is not None: args.append(f'--port={port}') - args.append(node_name) - if timeline_spec is not None: - args.append(timeline_spec) + if node_name is not None: + args.append(node_name) - return self.raw_cli(args) + res = self.raw_cli(args) + res.check_returncode() + return res def pg_stop( self, @@ -916,12 +1015,16 @@ class ZenithCli: tenant_id: Optional[uuid.UUID] = None, destroy=False, ) -> 'subprocess.CompletedProcess[str]': - args = ['pg', 'stop'] - if tenant_id is not None: - args.extend(['--tenantid', tenant_id.hex]) + args = [ + 'pg', + 'stop', + '--tenant-id', + (tenant_id or self.env.initial_tenant).hex, + ] if destroy: args.append('--destroy') - args.append(node_name) + if node_name is not None: + args.append(node_name) return self.raw_cli(args) @@ -996,8 +1099,7 @@ class ZenithPageserver(PgProtocol): env: ZenithEnv, port: PageserverPort, remote_storage: Optional[RemoteStorage] = None, - config_override: Optional[str] = None, - enable_auth=False): + config_override: Optional[str] = None): super().__init__(host='localhost', port=port.pg, username='zenith_admin') self.env = env self.running = False @@ -1005,14 +1107,15 @@ class ZenithPageserver(PgProtocol): self.remote_storage = remote_storage self.config_override = config_override - def start(self) -> 'ZenithPageserver': + def start(self, overrides=()) -> 'ZenithPageserver': """ Start the page server. + `overrides` allows to add some config to this pageserver start. Returns self. """ assert self.running == False - self.env.zenith_cli.pageserver_start() + self.env.zenith_cli.pageserver_start(overrides=overrides) self.running = True return self @@ -1024,7 +1127,6 @@ class ZenithPageserver(PgProtocol): if self.running: self.env.zenith_cli.pageserver_stop(immediate) self.running = False - return self def __enter__(self): @@ -1085,7 +1187,7 @@ class PgBin: self.env = os.environ.copy() self.env['LD_LIBRARY_PATH'] = os.path.join(str(pg_distrib_dir), 'lib') - def _fixpath(self, command: List[str]) -> None: + def _fixpath(self, command: List[str]): if '/' not in command[0]: command[0] = os.path.join(self.pg_bin_path, command[0]) @@ -1096,7 +1198,7 @@ class PgBin: env.update(env_add) return env - def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None) -> None: + def run(self, command: List[str], env: Optional[Env] = None, cwd: Optional[str] = None): """ Run one of the postgres binaries. 
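The typed HTTP client above reduces to plain `requests` calls against the reshaped `/v1/tenant/.../timeline` endpoints; a hedged sketch (the port is illustrative, tests get a real one from the port distributor):

```python
import uuid
import requests

PAGESERVER_HTTP = 'http://localhost:9898'  # illustrative port

# Create a tenant; the server echoes the tenant id back as a hex string.
tenant_id = uuid.uuid4()
res = requests.post(f'{PAGESERVER_HTTP}/v1/tenant', json={'new_tenant_id': tenant_id.hex})
res.raise_for_status()

# Create a blank timeline in it, mirroring timeline_create() above.
timeline_id = uuid.uuid4()
res = requests.post(f'{PAGESERVER_HTTP}/v1/tenant/{tenant_id.hex}/timeline',
                    json={
                        'new_timeline_id': timeline_id.hex,
                        'ancestor_timeline_id': None,
                        'ancestor_start_lsn': None,
                    })
res.raise_for_status()

# List what we just created.
timelines = requests.get(f'{PAGESERVER_HTTP}/v1/tenant/{tenant_id.hex}/timeline').json()
assert isinstance(timelines, list)
```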
@@ -1146,18 +1248,18 @@ class VanillaPostgres(PgProtocol): self.running = False self.pg_bin.run_capture(['initdb', '-D', pgdatadir]) - def configure(self, options: List[str]) -> None: + def configure(self, options: List[str]): """Append lines into postgresql.conf file.""" assert not self.running with open(os.path.join(self.pgdatadir, 'postgresql.conf'), 'a') as conf_file: conf_file.writelines(options) - def start(self) -> None: + def start(self): assert not self.running self.running = True self.pg_bin.run_capture(['pg_ctl', '-D', self.pgdatadir, 'start']) - def stop(self) -> None: + def stop(self): assert self.running self.running = False self.pg_bin.run_capture(['pg_ctl', '-D', self.pgdatadir, 'stop']) @@ -1240,8 +1342,9 @@ class Postgres(PgProtocol): def create( self, - node_name: str, - branch: Optional[str] = None, + branch_name: str, + node_name: Optional[str] = None, + lsn: Optional[str] = None, config_lines: Optional[List[str]] = None, ) -> 'Postgres': """ @@ -1252,19 +1355,21 @@ class Postgres(PgProtocol): if not config_lines: config_lines = [] - if branch is None: - branch = node_name - - self.env.zenith_cli.pg_create(node_name, + self.node_name = node_name or f'{branch_name}_pg_node' + self.env.zenith_cli.pg_create(branch_name, + node_name=self.node_name, tenant_id=self.tenant_id, - port=self.port, - timeline_spec=branch) - self.node_name = node_name + lsn=lsn, + port=self.port) path = pathlib.Path('pgdatadirs') / 'tenants' / self.tenant_id.hex / self.node_name self.pgdata_dir = os.path.join(self.env.repo_dir, path) if config_lines is None: config_lines = [] + + # set small 'max_replication_write_lag' to enable backpressure + # and make tests more stable. + config_lines = ['max_replication_write_lag=15MB'] + config_lines self.config(config_lines) return self @@ -1351,7 +1456,7 @@ class Postgres(PgProtocol): if self.running: assert self.node_name is not None - self.env.zenith_cli.pg_stop(self.node_name, tenant_id=self.tenant_id) + self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id) self.running = False return self @@ -1363,15 +1468,16 @@ class Postgres(PgProtocol): """ assert self.node_name is not None - self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id, destroy=True) + self.env.zenith_cli.pg_stop(self.node_name, self.tenant_id, True) self.node_name = None return self def create_start( self, - node_name: str, - branch: Optional[str] = None, + branch_name: str, + node_name: Optional[str] = None, + lsn: Optional[str] = None, config_lines: Optional[List[str]] = None, ) -> 'Postgres': """ @@ -1381,9 +1487,10 @@ class Postgres(PgProtocol): """ self.create( + branch_name=branch_name, node_name=node_name, - branch=branch, config_lines=config_lines, + lsn=lsn, ).start() return self @@ -1403,9 +1510,10 @@ class PostgresFactory: self.instances: List[Postgres] = [] def create_start(self, - node_name: str = "main", - branch: Optional[str] = None, + branch_name: str, + node_name: Optional[str] = None, tenant_id: Optional[uuid.UUID] = None, + lsn: Optional[str] = None, config_lines: Optional[List[str]] = None) -> Postgres: pg = Postgres( @@ -1417,15 +1525,17 @@ class PostgresFactory: self.instances.append(pg) return pg.create_start( + branch_name=branch_name, node_name=node_name, - branch=branch, config_lines=config_lines, + lsn=lsn, ) def create(self, - node_name: str = "main", - branch: Optional[str] = None, + branch_name: str, + node_name: Optional[str] = None, tenant_id: Optional[uuid.UUID] = None, + lsn: Optional[str] = None, config_lines: Optional[List[str]] = None) -> 
Postgres: pg = Postgres( @@ -1438,8 +1548,9 @@ class PostgresFactory: self.instances.append(pg) return pg.create( + branch_name=branch_name, node_name=node_name, - branch=branch, + lsn=lsn, config_lines=config_lines, ) @@ -1466,12 +1577,14 @@ class Safekeeper: """ An object representing a running safekeeper daemon. """ env: ZenithEnv port: SafekeeperPort - name: str # identifier for logging + id: int auth_token: Optional[str] = None + running: bool = False def start(self) -> 'Safekeeper': - self.env.zenith_cli.safekeeper_start(self.name) - + assert self.running == False + self.env.zenith_cli.safekeeper_start(self.id) + self.running = True # wait for wal acceptor start by checking its status started_at = time.time() while True: @@ -1489,8 +1602,9 @@ class Safekeeper: return self def stop(self, immediate=False) -> 'Safekeeper': - log.info('Stopping safekeeper {}'.format(self.name)) - self.env.zenith_cli.safekeeper_stop(self.name, immediate) + log.info('Stopping safekeeper {}'.format(self.id)) + self.env.zenith_cli.safekeeper_stop(self.id, immediate) + self.running = False return self def append_logical_message(self, @@ -1539,7 +1653,7 @@ class SafekeeperMetrics: class SafekeeperHttpClient(requests.Session): - def __init__(self, port: int) -> None: + def __init__(self, port: int): super().__init__() self.port = port @@ -1657,7 +1771,7 @@ def list_files_to_compare(pgdata_dir: str): # pg is the existing and running compute node, that we want to compare with a basebackup def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Postgres): - # Get the timeline ID of our branch. We need it for the 'basebackup' command + # Get the timeline ID. We need it for the 'basebackup' command with closing(pg.connect()) as conn: with conn.cursor() as cur: cur.execute("SHOW zenith.zenith_timeline") diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index 6fd77f3020..fbef131ffd 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -23,28 +23,23 @@ def test_bulk_tenant_create( """Measure tenant creation time (with and without wal acceptors)""" if use_wal_acceptors == 'with_wa': zenith_env_builder.num_safekeepers = 3 - env = zenith_env_builder.init() + env = zenith_env_builder.init_start() time_slices = [] for i in range(tenants_count): start = timeit.default_timer() - tenant = env.create_tenant() - env.zenith_cli.create_branch( - f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}", - "main", - tenant_id=tenant) + tenant = env.zenith_cli.create_tenant() + env.zenith_cli.create_timeline( + f'test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}', tenant_id=tenant) # FIXME: We used to start new safekeepers here. Did that make sense? Should we do it now? 
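`Safekeeper.start()` above blocks until the daemon answers over HTTP; pulled out into a standalone helper, that wait loop is roughly this (the `/v1/status` path is an assumption based on the safekeeper HTTP API later in this series):

```python
import time
import requests

def wait_safekeeper_ready(http_port: int, timeout_s: float = 10.0) -> None:
    """Poll the safekeeper status endpoint until it responds or we time out."""
    deadline = time.time() + timeout_s
    while True:
        try:
            requests.get(f'http://localhost:{http_port}/v1/status').raise_for_status()
            return
        except requests.RequestException:
            if time.time() > deadline:
                raise
            time.sleep(0.5)
```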
#if use_wal_acceptors == 'with_wa': # wa_factory.start_n_new(3) pg_tenant = env.postgres.create_start( - f"test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}", - None, # branch name, None means same as node name - tenant, - ) + f'test_bulk_tenant_create_{tenants_count}_{i}_{use_wal_acceptors}', tenant_id=tenant) end = timeit.default_timer() time_slices.append(end - start) diff --git a/vendor/postgres b/vendor/postgres index 31dc24ab29..093aa160e5 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 31dc24ab29e6bdd5cfb85920a9c728f759c01b29 +Subproject commit 093aa160e5df19814ff19b995d36dd5ee03c7f8b diff --git a/walkeeper/src/bin/safekeeper.rs b/walkeeper/src/bin/safekeeper.rs index ea5d0cba14..6c45115e5f 100644 --- a/walkeeper/src/bin/safekeeper.rs +++ b/walkeeper/src/bin/safekeeper.rs @@ -1,17 +1,19 @@ // // Main entry point for the safekeeper executable // -use anyhow::{Context, Result}; +use anyhow::{bail, Context, Result}; use clap::{App, Arg}; use const_format::formatcp; use daemonize::Daemonize; use fs2::FileExt; -use std::fs::File; +use std::fs::{self, File}; +use std::io::{ErrorKind, Write}; use std::path::{Path, PathBuf}; use std::thread; use tracing::*; -use walkeeper::control_file::{self, CreateControlFile}; +use walkeeper::control_file::{self}; use zenith_utils::http::endpoint; +use zenith_utils::zid::ZNodeId; use zenith_utils::{logging, tcp_listener, GIT_VERSION}; use tokio::sync::mpsc; @@ -25,6 +27,7 @@ use zenith_utils::shutdown::exit_now; use zenith_utils::signals; const LOCK_FILE_NAME: &str = "safekeeper.lock"; +const ID_FILE_NAME: &str = "safekeeper.id"; fn main() -> Result<()> { zenith_metrics::set_common_metrics_prefix("safekeeper"); @@ -38,6 +41,12 @@ fn main() -> Result<()> { .takes_value(true) .help("Path to the safekeeper data directory"), ) + .arg( + Arg::new("init") + .long("init") + .takes_value(false) + .help("Initialize safekeeper with ID"), + ) .arg( Arg::new("listen-pg") .short('l') @@ -93,13 +102,13 @@ fn main() -> Result<()> { .takes_value(true) .help("Dump control file at path specifed by this argument and exit"), ) + .arg( + Arg::new("id").long("id").takes_value(true).help("safekeeper node id: integer") + ) .get_matches(); if let Some(addr) = arg_matches.value_of("dump-control-file") { - let state = control_file::FileStorage::load_control_file( - Path::new(addr), - CreateControlFile::False, - )?; + let state = control_file::FileStorage::load_control_file(Path::new(addr))?; let json = serde_json::to_string(&state)?; print!("{}", json); return Ok(()); @@ -136,10 +145,19 @@ fn main() -> Result<()> { conf.recall_period = humantime::parse_duration(recall)?; } - start_safekeeper(conf) + let mut given_id = None; + if let Some(given_id_str) = arg_matches.value_of("id") { + given_id = Some(ZNodeId( + given_id_str + .parse() + .context("failed to parse safekeeper id")?, + )); + } + + start_safekeeper(conf, given_id, arg_matches.is_present("init")) } -fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { +fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bool) -> Result<()> { let log_file = logging::init("safekeeper.log", conf.daemonize)?; info!("version: {}", GIT_VERSION); @@ -154,6 +172,12 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { ) })?; + // Set or read our ID. 
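The `set_id` call below either reads a previously persisted node ID or records the one passed on first start; the same logic as a Python sketch (mirroring the `ID_FILE_NAME` constant above):

```python
import os
from typing import Optional

ID_FILE_NAME = 'safekeeper.id'

def set_id(workdir: str, given_id: Optional[int]) -> int:
    """Read the persisted safekeeper ID, or persist the given one on first start."""
    id_path = os.path.join(workdir, ID_FILE_NAME)
    try:
        with open(id_path) as f:
            my_id = int(f.read())
    except FileNotFoundError:
        if given_id is None:
            raise RuntimeError('safekeeper id is not specified')
        my_id = given_id
        with open(id_path, 'w') as f:
            f.write(str(my_id))
            f.flush()
            os.fsync(f.fileno())
        return my_id
    if given_id is not None and given_id != my_id:
        raise RuntimeError(
            f"safekeeper already initialized with id {my_id}, can't set {given_id}")
    return my_id
```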
+ set_id(&mut conf, given_id)?; + if init { + return Ok(()); + } + let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| { error!("failed to bind to address {}: {}", conf.listen_http_addr, e); e @@ -260,3 +284,49 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { std::process::exit(111); }) } + +/// Determine safekeeper id and set it in config. +fn set_id(conf: &mut SafeKeeperConf, given_id: Option) -> Result<()> { + let id_file_path = conf.workdir.join(ID_FILE_NAME); + + let my_id: ZNodeId; + // If ID exists, read it in; otherwise set one passed + match fs::read(&id_file_path) { + Ok(id_serialized) => { + my_id = ZNodeId( + std::str::from_utf8(&id_serialized) + .context("failed to parse safekeeper id")? + .parse() + .context("failed to parse safekeeper id")?, + ); + if let Some(given_id) = given_id { + if given_id != my_id { + bail!( + "safekeeper already initialized with id {}, can't set {}", + my_id, + given_id + ); + } + } + info!("safekeeper ID {}", my_id); + } + Err(error) => match error.kind() { + ErrorKind::NotFound => { + my_id = if let Some(given_id) = given_id { + given_id + } else { + bail!("safekeeper id is not specified"); + }; + let mut f = File::create(&id_file_path)?; + f.write_all(my_id.to_string().as_bytes())?; + f.sync_all()?; + info!("initialized safekeeper ID {}", my_id); + } + _ => { + return Err(error.into()); + } + }, + } + conf.my_id = my_id; + Ok(()) +} diff --git a/walkeeper/src/control_file.rs b/walkeeper/src/control_file.rs index 6016e00d1d..8b4e618661 100644 --- a/walkeeper/src/control_file.rs +++ b/walkeeper/src/control_file.rs @@ -27,13 +27,6 @@ const CONTROL_FILE_NAME: &str = "safekeeper.control"; const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial"; pub const CHECKSUM_SIZE: usize = std::mem::size_of::(); -// A named boolean. -#[derive(Debug)] -pub enum CreateControlFile { - True, - False, -} - lazy_static! { static ref PERSIST_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!( "safekeeper_persist_control_file_seconds", @@ -94,28 +87,22 @@ impl FileStorage { pub fn load_control_file_conf( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, - create: CreateControlFile, ) -> Result { let path = conf.timeline_dir(zttid).join(CONTROL_FILE_NAME); - Self::load_control_file(path, create) + Self::load_control_file(path) } /// Read in the control file. /// If create=false and file doesn't exist, bails out. - pub fn load_control_file>( - control_file_path: P, - create: CreateControlFile, - ) -> Result { + pub fn load_control_file>(control_file_path: P) -> Result { info!( - "loading control file {}, create={:?}", + "loading control file {}", control_file_path.as_ref().display(), - create, ); let mut control_file = OpenOptions::new() .read(true) .write(true) - .create(matches!(create, CreateControlFile::True)) .open(&control_file_path) .with_context(|| { format!( @@ -124,41 +111,32 @@ impl FileStorage { ) })?; - // Empty file is legit on 'create', don't try to deser from it. 
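With `CreateControlFile` gone, `load_control_file` always expects a complete file: a payload followed by a little-endian CRC32C. A Python sketch of that integrity check (the third-party `crc32c` package stands in for the Rust crate):

```python
import crc32c  # assumption: PyPI package 'crc32c', API crc32c.crc32c(bytes) -> int

CHECKSUM_SIZE = 4  # size of a little-endian u32

def load_control_file(path: str) -> bytes:
    """Return the control file payload after verifying the trailing checksum."""
    with open(path, 'rb') as f:
        buf = f.read()
    payload, tail = buf[:-CHECKSUM_SIZE], buf[-CHECKSUM_SIZE:]
    expected = int.from_bytes(tail, 'little')
    calculated = crc32c.crc32c(payload)
    if calculated != expected:
        raise ValueError(
            f'safekeeper control file checksum mismatch: expected {expected} got {calculated}')
    return payload  # magic, format version, then the serialized SafeKeeperState
```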
- let state = if control_file.metadata().unwrap().len() == 0 { - if let CreateControlFile::False = create { - bail!("control file is empty"); - } - SafeKeeperState::new() - } else { - let mut buf = Vec::new(); - control_file - .read_to_end(&mut buf) - .context("failed to read control file")?; + let mut buf = Vec::new(); + control_file + .read_to_end(&mut buf) + .context("failed to read control file")?; - let calculated_checksum = crc32c::crc32c(&buf[..buf.len() - CHECKSUM_SIZE]); + let calculated_checksum = crc32c::crc32c(&buf[..buf.len() - CHECKSUM_SIZE]); - let expected_checksum_bytes: &[u8; CHECKSUM_SIZE] = - buf[buf.len() - CHECKSUM_SIZE..].try_into()?; - let expected_checksum = u32::from_le_bytes(*expected_checksum_bytes); + let expected_checksum_bytes: &[u8; CHECKSUM_SIZE] = + buf[buf.len() - CHECKSUM_SIZE..].try_into()?; + let expected_checksum = u32::from_le_bytes(*expected_checksum_bytes); - ensure!( - calculated_checksum == expected_checksum, + ensure!( + calculated_checksum == expected_checksum, + format!( + "safekeeper control file checksum mismatch: expected {} got {}", + expected_checksum, calculated_checksum + ) + ); + + let state = FileStorage::deser_sk_state(&mut &buf[..buf.len() - CHECKSUM_SIZE]) + .with_context(|| { format!( - "safekeeper control file checksum mismatch: expected {} got {}", - expected_checksum, calculated_checksum + "while reading control file {}", + control_file_path.as_ref().display(), ) - ); - - FileStorage::deser_sk_state(&mut &buf[..buf.len() - CHECKSUM_SIZE]).with_context( - || { - format!( - "while reading control file {}", - control_file_path.as_ref().display(), - ) - }, - )? - }; + })?; Ok(state) } } @@ -247,31 +225,38 @@ mod test { fn load_from_control_file( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, - create: CreateControlFile, ) -> Result<(FileStorage, SafeKeeperState)> { fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); Ok(( FileStorage::new(zttid, conf), - FileStorage::load_control_file_conf(conf, zttid, create)?, + FileStorage::load_control_file_conf(conf, zttid)?, )) } + fn create( + conf: &SafeKeeperConf, + zttid: &ZTenantTimelineId, + ) -> Result<(FileStorage, SafeKeeperState)> { + fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir"); + let state = SafeKeeperState::empty(); + let mut storage = FileStorage::new(zttid, conf); + storage.persist(&state)?; + Ok((storage, state)) + } + #[test] fn test_read_write_safekeeper_state() { let conf = stub_conf(); let zttid = ZTenantTimelineId::generate(); { - let (mut storage, mut state) = - load_from_control_file(&conf, &zttid, CreateControlFile::True) - .expect("failed to read state"); + let (mut storage, mut state) = create(&conf, &zttid).expect("failed to create state"); // change something - state.wal_start_lsn = Lsn(42); + state.commit_lsn = Lsn(42); storage.persist(&state).expect("failed to persist state"); } - let (_, state) = load_from_control_file(&conf, &zttid, CreateControlFile::False) - .expect("failed to read state"); - assert_eq!(state.wal_start_lsn, Lsn(42)); + let (_, state) = load_from_control_file(&conf, &zttid).expect("failed to read state"); + assert_eq!(state.commit_lsn, Lsn(42)); } #[test] @@ -279,11 +264,10 @@ mod test { let conf = stub_conf(); let zttid = ZTenantTimelineId::generate(); { - let (mut storage, mut state) = - load_from_control_file(&conf, &zttid, CreateControlFile::True) - .expect("failed to read state"); + let (mut storage, mut state) = create(&conf, &zttid).expect("failed to read 
state"); + // change something - state.wal_start_lsn = Lsn(42); + state.commit_lsn = Lsn(42); storage.persist(&state).expect("failed to persist state"); } let control_path = conf.timeline_dir(&zttid).join(CONTROL_FILE_NAME); @@ -291,7 +275,7 @@ mod test { data[0] += 1; // change the first byte of the file to fail checksum validation fs::write(&control_path, &data).expect("failed to write control file"); - match load_from_control_file(&conf, &zttid, CreateControlFile::False) { + match load_from_control_file(&conf, &zttid) { Err(err) => assert!(err .to_string() .contains("safekeeper control file checksum mismatch")), diff --git a/walkeeper/src/control_file_upgrade.rs b/walkeeper/src/control_file_upgrade.rs index 913bd02c1e..9effe42f8d 100644 --- a/walkeeper/src/control_file_upgrade.rs +++ b/walkeeper/src/control_file_upgrade.rs @@ -1,6 +1,6 @@ //! Code to deal with safekeeper control file upgrades use crate::safekeeper::{ - AcceptorState, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermSwitchEntry, + AcceptorState, Peers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermSwitchEntry, }; use anyhow::{bail, Result}; use serde::{Deserialize, Serialize}; @@ -26,7 +26,7 @@ struct SafeKeeperStateV1 { /// persistent acceptor state acceptor_state: AcceptorStateV1, /// information about server - server: ServerInfo, + server: ServerInfoV2, /// Unique id of the last *elected* proposer we dealed with. Not needed /// for correctness, exists for monitoring purposes. proposer_uuid: PgUuid, @@ -70,6 +70,39 @@ pub struct SafeKeeperStateV2 { pub wal_start_lsn: Lsn, } +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct ServerInfoV3 { + /// Postgres server version + pub pg_version: u32, + pub system_id: SystemId, + #[serde(with = "hex")] + pub tenant_id: ZTenantId, + /// Zenith timelineid + #[serde(with = "hex")] + pub timeline_id: ZTimelineId, + pub wal_seg_size: u32, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SafeKeeperStateV3 { + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfoV3, + /// Unique id of the last *elected* proposer we dealed with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// part of WAL acknowledged by quorum and available locally + pub commit_lsn: Lsn, + /// minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone) + pub truncate_lsn: Lsn, + // Safekeeper starts receiving WAL from this LSN, zeros before it ought to + // be skipped during decoding. 
+ pub wal_start_lsn: Lsn, +} + pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { // migrate to storing full term history if version == 1 { @@ -83,12 +116,20 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result }]), }; return Ok(SafeKeeperState { + tenant_id: oldstate.server.tenant_id, + timeline_id: oldstate.server.ztli, acceptor_state: ac, - server: oldstate.server.clone(), + server: ServerInfo { + pg_version: oldstate.server.pg_version, + system_id: oldstate.server.system_id, + wal_seg_size: oldstate.server.wal_seg_size, + }, proposer_uuid: oldstate.proposer_uuid, commit_lsn: oldstate.commit_lsn, - truncate_lsn: oldstate.truncate_lsn, - wal_start_lsn: oldstate.wal_start_lsn, + s3_wal_lsn: Lsn(0), + peer_horizon_lsn: oldstate.truncate_lsn, + remote_consistent_lsn: Lsn(0), + peers: Peers(vec![]), }); // migrate to hexing some zids } else if version == 2 { @@ -97,17 +138,40 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result let server = ServerInfo { pg_version: oldstate.server.pg_version, system_id: oldstate.server.system_id, - tenant_id: oldstate.server.tenant_id, - timeline_id: oldstate.server.ztli, wal_seg_size: oldstate.server.wal_seg_size, }; return Ok(SafeKeeperState { + tenant_id: oldstate.server.tenant_id, + timeline_id: oldstate.server.ztli, acceptor_state: oldstate.acceptor_state, server, proposer_uuid: oldstate.proposer_uuid, commit_lsn: oldstate.commit_lsn, - truncate_lsn: oldstate.truncate_lsn, - wal_start_lsn: oldstate.wal_start_lsn, + s3_wal_lsn: Lsn(0), + peer_horizon_lsn: oldstate.truncate_lsn, + remote_consistent_lsn: Lsn(0), + peers: Peers(vec![]), + }); + // migrate to moving ztenantid/ztli to the top and adding some lsns + } else if version == 3 { + info!("reading safekeeper control file version {}", version); + let oldstate = SafeKeeperStateV3::des(&buf[..buf.len()])?; + let server = ServerInfo { + pg_version: oldstate.server.pg_version, + system_id: oldstate.server.system_id, + wal_seg_size: oldstate.server.wal_seg_size, + }; + return Ok(SafeKeeperState { + tenant_id: oldstate.server.tenant_id, + timeline_id: oldstate.server.timeline_id, + acceptor_state: oldstate.acceptor_state, + server, + proposer_uuid: oldstate.proposer_uuid, + commit_lsn: oldstate.commit_lsn, + s3_wal_lsn: Lsn(0), + peer_horizon_lsn: oldstate.truncate_lsn, + remote_consistent_lsn: Lsn(0), + peers: Peers(vec![]), }); } bail!("unsupported safekeeper control file version {}", version) diff --git a/walkeeper/src/handler.rs b/walkeeper/src/handler.rs index d1ead5cb37..ead6fab9fb 100644 --- a/walkeeper/src/handler.rs +++ b/walkeeper/src/handler.rs @@ -13,6 +13,7 @@ use postgres_ffi::xlog_utils::PG_TLI; use regex::Regex; use std::str::FromStr; use std::sync::Arc; +use tracing::info; use zenith_utils::lsn::Lsn; use zenith_utils::postgres_backend; use zenith_utils::postgres_backend::PostgresBackend; @@ -20,7 +21,6 @@ use zenith_utils::pq_proto::{BeMessage, FeStartupPacket, RowDescriptor, INT4_OID use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId}; use crate::callmemaybe::CallmeEvent; -use crate::control_file::CreateControlFile; use tokio::sync::mpsc::UnboundedSender; /// Safekeeper handler of postgres commands @@ -101,29 +101,19 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler { fn process_query(&mut self, pgb: &mut PostgresBackend, query_string: &str) -> Result<()> { let cmd = parse_cmd(query_string)?; - // Is this command is ztimeline scoped? - match cmd { - SafekeeperPostgresCommand::StartWalPush { .. 
} - | SafekeeperPostgresCommand::StartReplication { .. } - | SafekeeperPostgresCommand::IdentifySystem - | SafekeeperPostgresCommand::JSONCtrl { .. } => { - let tenantid = self.ztenantid.context("tenantid is required")?; - let timelineid = self.ztimelineid.context("timelineid is required")?; - if self.timeline.is_none() { - // START_WAL_PUSH is the only command that initializes the timeline in production. - // There is also JSON_CTRL command, which should initialize the timeline for testing. - let create_control_file = match cmd { - SafekeeperPostgresCommand::StartWalPush { .. } - | SafekeeperPostgresCommand::JSONCtrl { .. } => CreateControlFile::True, - _ => CreateControlFile::False, - }; - self.timeline.set( - &self.conf, - ZTenantTimelineId::new(tenantid, timelineid), - create_control_file, - )?; - } - } + info!("got query {:?}", query_string); + + let create = !(matches!(cmd, SafekeeperPostgresCommand::StartReplication { .. }) + || matches!(cmd, SafekeeperPostgresCommand::IdentifySystem)); + + let tenantid = self.ztenantid.context("tenantid is required")?; + let timelineid = self.ztimelineid.context("timelineid is required")?; + if self.timeline.is_none() { + self.timeline.set( + &self.conf, + ZTenantTimelineId::new(tenantid, timelineid), + create, + )?; } match cmd { diff --git a/walkeeper/src/http/mod.rs b/walkeeper/src/http/mod.rs index c82d1c0362..4c0be17ecd 100644 --- a/walkeeper/src/http/mod.rs +++ b/walkeeper/src/http/mod.rs @@ -1,2 +1,3 @@ +pub mod models; pub mod routes; pub use routes::make_router; diff --git a/walkeeper/src/http/models.rs b/walkeeper/src/http/models.rs new file mode 100644 index 0000000000..8a6ed7a812 --- /dev/null +++ b/walkeeper/src/http/models.rs @@ -0,0 +1,9 @@ +use serde::{Deserialize, Serialize}; +use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId}; + +#[derive(Serialize, Deserialize)] +pub struct TimelineCreateRequest { + pub tenant_id: ZTenantId, + pub timeline_id: ZTimelineId, + pub peer_ids: Vec, +} diff --git a/walkeeper/src/http/routes.rs b/walkeeper/src/http/routes.rs index 11a29ac6d3..74f7f4a735 100644 --- a/walkeeper/src/http/routes.rs +++ b/walkeeper/src/http/routes.rs @@ -1,13 +1,15 @@ use hyper::{Body, Request, Response, StatusCode}; + use serde::Serialize; use serde::Serializer; use std::fmt::Display; use std::sync::Arc; +use zenith_utils::http::json::json_request; use zenith_utils::http::{RequestExt, RouterBuilder}; use zenith_utils::lsn::Lsn; +use zenith_utils::zid::ZNodeId; use zenith_utils::zid::ZTenantTimelineId; -use crate::control_file::CreateControlFile; use crate::safekeeper::Term; use crate::safekeeper::TermHistory; use crate::timeline::GlobalTimelines; @@ -18,9 +20,18 @@ use zenith_utils::http::json::json_response; use zenith_utils::http::request::parse_request_param; use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use super::models::TimelineCreateRequest; + +#[derive(Debug, Serialize)] +struct SafekeeperStatus { + id: ZNodeId, +} + /// Healthcheck handler. -async fn status_handler(_: Request) -> Result, ApiError> { - Ok(json_response(StatusCode::OK, "")?) +async fn status_handler(request: Request) -> Result, ApiError> { + let conf = get_conf(&request); + let status = SafekeeperStatus { id: conf.my_id }; + Ok(json_response(StatusCode::OK, status)?) 
} fn get_conf(request: &Request) -> &SafeKeeperConf { @@ -58,7 +69,11 @@ struct TimelineStatus { #[serde(serialize_with = "display_serialize")] commit_lsn: Lsn, #[serde(serialize_with = "display_serialize")] - truncate_lsn: Lsn, + s3_wal_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] + peer_horizon_lsn: Lsn, + #[serde(serialize_with = "display_serialize")] + remote_consistent_lsn: Lsn, #[serde(serialize_with = "display_serialize")] flush_lsn: Lsn, } @@ -70,8 +85,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result) -> Result, ApiError> { + let request_data: TimelineCreateRequest = json_request(&mut request).await?; + + let zttid = ZTenantTimelineId { + tenant_id: request_data.tenant_id, + timeline_id: request_data.timeline_id, + }; + GlobalTimelines::create(get_conf(&request), zttid, request_data.peer_ids) + .map_err(ApiError::from_err)?; + + Ok(json_response(StatusCode::CREATED, ())?) +} + /// Safekeeper http router. pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder { let router = endpoint::make_router(); @@ -102,4 +131,5 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder "/v1/timeline/:tenant_id/:timeline_id", timeline_status_handler, ) + .post("/v1/timeline", timeline_create_handler) } diff --git a/walkeeper/src/lib.rs b/walkeeper/src/lib.rs index 6c3e0b264e..dfd71e4de2 100644 --- a/walkeeper/src/lib.rs +++ b/walkeeper/src/lib.rs @@ -2,7 +2,7 @@ use std::path::PathBuf; use std::time::Duration; -use zenith_utils::zid::ZTenantTimelineId; +use zenith_utils::zid::{ZNodeId, ZTenantTimelineId}; pub mod callmemaybe; pub mod control_file; @@ -46,6 +46,7 @@ pub struct SafeKeeperConf { pub listen_http_addr: String, pub ttl: Option, pub recall_period: Duration, + pub my_id: ZNodeId, } impl SafeKeeperConf { @@ -69,6 +70,7 @@ impl Default for SafeKeeperConf { listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), ttl: None, recall_period: defaults::DEFAULT_RECALL_PERIOD, + my_id: ZNodeId(0), } } } diff --git a/walkeeper/src/safekeeper.rs b/walkeeper/src/safekeeper.rs index fa624bb18f..53fd6f5588 100644 --- a/walkeeper/src/safekeeper.rs +++ b/walkeeper/src/safekeeper.rs @@ -10,6 +10,8 @@ use std::cmp::min; use std::fmt; use std::io::Read; use tracing::*; +use zenith_utils::zid::ZNodeId; +use zenith_utils::zid::ZTenantTimelineId; use lazy_static::lazy_static; @@ -25,12 +27,13 @@ use zenith_utils::pq_proto::ZenithFeedback; use zenith_utils::zid::{ZTenantId, ZTimelineId}; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 3; +pub const SK_FORMAT_VERSION: u32 = 4; const SK_PROTOCOL_VERSION: u32 = 1; const UNKNOWN_SERVER_VERSION: u32 = 0; /// Consensus logical timestamp. pub type Term = u64; +const INVALID_TERM: Term = 0; #[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub struct TermSwitchEntry { @@ -128,18 +131,47 @@ pub struct ServerInfo { /// Postgres server version pub pg_version: u32, pub system_id: SystemId, - #[serde(with = "hex")] - pub tenant_id: ZTenantId, - /// Zenith timelineid - #[serde(with = "hex")] - pub timeline_id: ZTimelineId, pub wal_seg_size: u32, } +/// Data published by safekeeper to the peers +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PeerInfo { + /// LSN up to which safekeeper offloaded WAL to s3. + s3_wal_lsn: Lsn, + /// Term of the last entry. + term: Term, + /// LSN of the last record. + flush_lsn: Lsn, + /// Up to which LSN safekeeper regards its WAL as committed. 
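Driving the new safekeeper endpoints directly looks like this (a sketch: the port and the `/v1/status` route path are assumptions, and tenant/timeline ids are assumed to serialize as hex strings, node ids as integers):

```python
import uuid
import requests

SAFEKEEPER_HTTP = 'http://localhost:7676'  # illustrative port

# The reworked status handler now reports the node id.
status = requests.get(f'{SAFEKEEPER_HTTP}/v1/status').json()
print('safekeeper id:', status['id'])

# Explicit timeline creation, mirroring TimelineCreateRequest.
res = requests.post(f'{SAFEKEEPER_HTTP}/v1/timeline',
                    json={
                        'tenant_id': uuid.uuid4().hex,
                        'timeline_id': uuid.uuid4().hex,
                        'peer_ids': [1, 2, 3],
                    })
assert res.status_code == 201  # StatusCode::CREATED
```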
+ commit_lsn: Lsn, +} + +impl PeerInfo { + fn new() -> Self { + Self { + s3_wal_lsn: Lsn(0), + term: INVALID_TERM, + flush_lsn: Lsn(0), + commit_lsn: Lsn(0), + } + } +} + +// vector-based node id -> peer state map with very limited functionality we +// need. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Peers(pub Vec<(ZNodeId, PeerInfo)>); + /// Persistent information stored on safekeeper node /// On disk data is prefixed by magic and format version and followed by checksum. #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SafeKeeperState { + #[serde(with = "hex")] + pub tenant_id: ZTenantId, + /// Zenith timeline id + #[serde(with = "hex")] + pub timeline_id: ZTimelineId, /// persistent acceptor state pub acceptor_state: AcceptorState, /// information about server @@ -148,19 +180,33 @@ pub struct SafeKeeperState { /// for correctness, exists for monitoring purposes. #[serde(with = "hex")] pub proposer_uuid: PgUuid, - /// part of WAL acknowledged by quorum and available locally + /// Part of WAL acknowledged by quorum and available locally. Always points + /// to record boundary. pub commit_lsn: Lsn, - /// minimal LSN which may be needed for recovery of some safekeeper (end_lsn - /// of last record streamed to everyone) - pub truncate_lsn: Lsn, - // Safekeeper starts receiving WAL from this LSN, zeros before it ought to - // be skipped during decoding. - pub wal_start_lsn: Lsn, + /// First LSN not yet offloaded to s3. Useful to persist to avoid finding + /// out offloading progress on boot. + pub s3_wal_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver. + pub remote_consistent_lsn: Lsn, + // Peers and their state as we remember it. Knowing peers themselves is + // fundamental; but state is saved here only for informational purposes and + // obviously can be stale. (Currently not saved at all, but let's provision + // place to have less file version upgrades).
+ pub peers: Peers, } impl SafeKeeperState { - pub fn new() -> SafeKeeperState { + pub fn new(zttid: &ZTenantTimelineId, peers: Vec) -> SafeKeeperState { SafeKeeperState { + tenant_id: zttid.tenant_id, + timeline_id: zttid.timeline_id, acceptor_state: AcceptorState { term: 0, term_history: TermHistory::empty(), @@ -168,21 +214,20 @@ impl SafeKeeperState { server: ServerInfo { pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ system_id: 0, /* Postgres system identifier */ - tenant_id: ZTenantId::from([0u8; 16]), - timeline_id: ZTimelineId::from([0u8; 16]), wal_seg_size: 0, }, proposer_uuid: [0; 16], - commit_lsn: Lsn(0), /* part of WAL acknowledged by quorum */ - truncate_lsn: Lsn(0), /* minimal LSN which may be needed for recovery of some safekeeper */ - wal_start_lsn: Lsn(0), + commit_lsn: Lsn(0), + s3_wal_lsn: Lsn(0), + peer_horizon_lsn: Lsn(0), + remote_consistent_lsn: Lsn(0), + peers: Peers(peers.iter().map(|p| (*p, PeerInfo::new())).collect()), } } -} -impl Default for SafeKeeperState { - fn default() -> Self { - Self::new() + #[cfg(test)] + pub fn empty() -> Self { + SafeKeeperState::new(&ZTenantTimelineId::empty(), vec![]) } } @@ -421,6 +466,7 @@ lazy_static! { struct SafeKeeperMetrics { commit_lsn: Gauge, + // WAL-related metrics are in WalStorageMetrics } impl SafeKeeperMetrics { @@ -443,7 +489,7 @@ pub struct SafeKeeper { /// not-yet-flushed pairs of same named fields in s.* pub commit_lsn: Lsn, - pub truncate_lsn: Lsn, + pub peer_horizon_lsn: Lsn, pub s: SafeKeeperState, // persistent part pub control_store: CTRL, @@ -462,16 +508,14 @@ where wal_store: WAL, state: SafeKeeperState, ) -> SafeKeeper { - if state.server.timeline_id != ZTimelineId::from([0u8; 16]) - && ztli != state.server.timeline_id - { - panic!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.server.timeline_id); + if state.timeline_id != ZTimelineId::from([0u8; 16]) && ztli != state.timeline_id { + panic!("Calling SafeKeeper::new with inconsistent ztli ({}) and SafeKeeperState.server.timeline_id ({})", ztli, state.timeline_id); } SafeKeeper { - metrics: SafeKeeperMetrics::new(state.server.tenant_id, ztli, state.commit_lsn), + metrics: SafeKeeperMetrics::new(state.tenant_id, ztli, state.commit_lsn), commit_lsn: state.commit_lsn, - truncate_lsn: state.truncate_lsn, + peer_horizon_lsn: state.peer_horizon_lsn, s: state, control_store, wal_store, @@ -532,12 +576,24 @@ where msg.pg_version, self.s.server.pg_version ); } + if msg.tenant_id != self.s.tenant_id { + bail!( + "invalid tenant ID, got {}, expected {}", + msg.tenant_id, + self.s.tenant_id + ); + } + if msg.ztli != self.s.timeline_id { + bail!( + "invalid timeline ID, got {}, expected {}", + msg.ztli, + self.s.timeline_id + ); + } // set basic info about server, if not yet // TODO: verify that is doesn't change after self.s.server.system_id = msg.system_id; - self.s.server.tenant_id = msg.tenant_id; - self.s.server.timeline_id = msg.ztli; self.s.server.wal_seg_size = msg.wal_seg_size; self.control_store .persist(&self.s) @@ -568,7 +624,7 @@ where term: self.s.acceptor_state.term, vote_given: false as u64, flush_lsn: self.wal_store.flush_lsn(), - truncate_lsn: self.s.truncate_lsn, + truncate_lsn: self.s.peer_horizon_lsn, term_history: self.get_term_history(), }; if self.s.acceptor_state.term < msg.term { @@ -593,14 +649,16 @@ where /// Form AppendResponse from current state. 
fn append_response(&self) -> AppendResponse { - AppendResponse { + let ar = AppendResponse { term: self.s.acceptor_state.term, flush_lsn: self.wal_store.flush_lsn(), commit_lsn: self.s.commit_lsn, // will be filled by the upper code to avoid bothering safekeeper hs_feedback: HotStandbyFeedback::empty(), zenith_feedback: ZenithFeedback::empty(), - } + }; + trace!("formed AppendResponse {:?}", ar); + ar } fn handle_elected(&mut self, msg: &ProposerElected) -> Result<Option<AcceptorProposerMessage>> { @@ -655,10 +713,11 @@ where if !msg.wal_data.is_empty() { self.wal_store.write_wal(msg.h.begin_lsn, &msg.wal_data)?; - // If this was the first record we ever receieved, remember LSN to help - // find_end_of_wal skip the hole in the beginning. - if self.s.wal_start_lsn == Lsn(0) { - self.s.wal_start_lsn = msg.h.begin_lsn; + // If this was the first record we ever received, initialize + // commit_lsn to help find_end_of_wal skip the hole in the + // beginning. + if self.s.commit_lsn == Lsn(0) { + self.s.commit_lsn = msg.h.begin_lsn; sync_control_file = true; require_flush = true; } @@ -685,35 +744,36 @@ where .set(u64::from(self.commit_lsn) as f64); } - self.truncate_lsn = msg.h.truncate_lsn; + self.peer_horizon_lsn = msg.h.truncate_lsn; // Update truncate and commit LSN in control file. // To avoid negative impact on performance of extra fsync, do it only // when truncate_lsn delta exceeds WAL segment size. sync_control_file |= - self.s.truncate_lsn + (self.s.server.wal_seg_size as u64) < self.truncate_lsn; + self.s.peer_horizon_lsn + (self.s.server.wal_seg_size as u64) < self.peer_horizon_lsn; if sync_control_file { self.s.commit_lsn = self.commit_lsn; - self.s.truncate_lsn = self.truncate_lsn; + self.s.peer_horizon_lsn = self.peer_horizon_lsn; } if sync_control_file { self.control_store.persist(&self.s)?; } + trace!( + "processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}", + msg.wal_data.len(), + msg.h.end_lsn, + msg.h.commit_lsn, + msg.h.truncate_lsn, + require_flush, + ); + // If flush_lsn hasn't updated, AppendResponse is not very useful.
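The fsync throttling above compresses to one predicate: persist the control file only when `peer_horizon_lsn` has advanced by at least a full WAL segment since the last persisted value.

```python
def should_sync_control_file(persisted_horizon: int, current_horizon: int,
                             wal_seg_size: int) -> bool:
    """Mirror of: s.peer_horizon_lsn + wal_seg_size < peer_horizon_lsn."""
    return persisted_horizon + wal_seg_size < current_horizon
```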
if !require_flush { return Ok(None); } let resp = self.append_response(); - trace!( - "processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, resp {:?}", - msg.wal_data.len(), - msg.h.end_lsn, - msg.h.commit_lsn, - msg.h.truncate_lsn, - &resp, - ); Ok(Some(AcceptorProposerMessage::AppendResponse(resp))) } @@ -774,11 +834,11 @@ mod tests { #[test] fn test_voting() { let storage = InMemoryState { - persisted_state: SafeKeeperState::new(), + persisted_state: SafeKeeperState::empty(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::new()); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::empty()); // check voting for 1 is ok let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 }); @@ -806,11 +866,11 @@ mod tests { #[test] fn test_epoch_switch() { let storage = InMemoryState { - persisted_state: SafeKeeperState::new(), + persisted_state: SafeKeeperState::empty(), }; let wal_store = DummyWalStore { lsn: Lsn(0) }; let ztli = ZTimelineId::from([0u8; 16]); - let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::new()); + let mut sk = SafeKeeper::new(ztli, storage, wal_store, SafeKeeperState::empty()); let mut ar_hdr = AppendRequestHeader { term: 1, diff --git a/walkeeper/src/timeline.rs b/walkeeper/src/timeline.rs index c639e81b79..ea8308b95e 100644 --- a/walkeeper/src/timeline.rs +++ b/walkeeper/src/timeline.rs @@ -1,7 +1,7 @@ //! This module contains timeline id -> safekeeper state map with file-backed //! persistence and support for interaction between sending and receiving wal. -use anyhow::{Context, Result}; +use anyhow::{bail, Context, Result}; use lazy_static::lazy_static; @@ -9,22 +9,24 @@ use std::cmp::{max, min}; use std::collections::HashMap; use std::fs::{self}; -use std::sync::{Arc, Condvar, Mutex}; +use std::sync::{Arc, Condvar, Mutex, MutexGuard}; use std::time::Duration; use tokio::sync::mpsc::UnboundedSender; use tracing::*; use zenith_utils::lsn::Lsn; -use zenith_utils::zid::ZTenantTimelineId; +use zenith_utils::zid::{ZNodeId, ZTenantTimelineId}; use crate::callmemaybe::{CallmeEvent, SubscriptionStateKey}; -use crate::control_file::{self, CreateControlFile}; +use crate::control_file; +use crate::control_file::Storage as cf_storage; use crate::safekeeper::{ AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, SafeKeeperState, }; use crate::send_wal::HotStandbyFeedback; -use crate::wal_storage::{self, Storage}; +use crate::wal_storage; +use crate::wal_storage::Storage as wal_storage_iface; use crate::SafeKeeperConf; use zenith_utils::pq_proto::ZenithFeedback; @@ -87,21 +89,39 @@ struct SharedState { } impl SharedState { - /// Restore SharedState from control file. - /// If create=false and file doesn't exist, bails out. 
- fn create_restore( + /// Initialize timeline state, creating control file + fn create( conf: &SafeKeeperConf, zttid: &ZTenantTimelineId, - create: CreateControlFile, + peer_ids: Vec, ) -> Result { - let state = control_file::FileStorage::load_control_file_conf(conf, zttid, create) + let state = SafeKeeperState::new(zttid, peer_ids); + let control_store = control_file::FileStorage::new(zttid, conf); + let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); + let mut sk = SafeKeeper::new(zttid.timeline_id, control_store, wal_store, state); + sk.control_store.persist(&sk.s)?; + + Ok(Self { + notified_commit_lsn: Lsn(0), + sk, + replicas: Vec::new(), + active: false, + num_computes: 0, + pageserver_connstr: None, + }) + } + + /// Restore SharedState from control file. + /// If file doesn't exist, bails out. + fn restore(conf: &SafeKeeperConf, zttid: &ZTenantTimelineId) -> Result { + let state = control_file::FileStorage::load_control_file_conf(conf, zttid) .context("failed to load from control file")?; let control_store = control_file::FileStorage::new(zttid, conf); let wal_store = wal_storage::PhysicalStorage::new(zttid, conf); - info!("timeline {} created or restored", zttid.timeline_id); + info!("timeline {} restored", zttid.timeline_id); Ok(Self { notified_commit_lsn: Lsn(0), @@ -418,26 +438,13 @@ impl Timeline { // Utilities needed by various Connection-like objects pub trait TimelineTools { - fn set( - &mut self, - conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, - create: CreateControlFile, - ) -> Result<()>; + fn set(&mut self, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, create: bool) -> Result<()>; fn get(&self) -> &Arc; } impl TimelineTools for Option> { - fn set( - &mut self, - conf: &SafeKeeperConf, - zttid: ZTenantTimelineId, - create: CreateControlFile, - ) -> Result<()> { - // We will only set the timeline once. If it were to ever change, - // anyone who cloned the Arc would be out of date. - assert!(self.is_none()); + fn set(&mut self, conf: &SafeKeeperConf, zttid: ZTenantTimelineId, create: bool) -> Result<()> { *self = Some(GlobalTimelines::get(conf, zttid, create)?); Ok(()) } @@ -456,30 +463,73 @@ lazy_static! { pub struct GlobalTimelines; impl GlobalTimelines { + fn create_internal( + mut timelines: MutexGuard>>, + conf: &SafeKeeperConf, + zttid: ZTenantTimelineId, + peer_ids: Vec, + ) -> Result> { + match timelines.get(&zttid) { + Some(_) => bail!("timeline {} already exists", zttid), + None => { + // TODO: check directory existence + let dir = conf.timeline_dir(&zttid); + fs::create_dir_all(dir)?; + let shared_state = SharedState::create(conf, &zttid, peer_ids) + .context("failed to create shared state")?; + + let new_tli = Arc::new(Timeline::new(zttid, shared_state)); + timelines.insert(zttid, Arc::clone(&new_tli)); + Ok(new_tli) + } + } + } + + pub fn create( + conf: &SafeKeeperConf, + zttid: ZTenantTimelineId, + peer_ids: Vec, + ) -> Result> { + let timelines = TIMELINES.lock().unwrap(); + GlobalTimelines::create_internal(timelines, conf, zttid, peer_ids) + } + /// Get a timeline with control file loaded from the global TIMELINES map. - /// If control file doesn't exist and create=false, bails out. + /// If control file doesn't exist, bails out. 
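`GlobalTimelines` is a mutex-guarded map from `ZTenantTimelineId` to `Timeline`: `create` refuses duplicates, and `get` (below) falls back to creation when the control file is missing and `create` is set. The semantics in a Python sketch (`_restore_from_control_file` and `_fresh_state` stand in for `SharedState::restore` and `SharedState::create`):

```python
import threading

_lock = threading.Lock()
_timelines = {}  # zttid -> shared timeline state


def _restore_from_control_file(zttid):
    # Stand-in for SharedState::restore; raises FileNotFoundError if absent.
    raise FileNotFoundError


def _fresh_state(zttid, peer_ids):
    # Stand-in for SharedState::create: would persist a new control file here.
    return {'zttid': zttid, 'peers': peer_ids}


def create(zttid, peer_ids):
    with _lock:
        if zttid in _timelines:
            raise RuntimeError(f'timeline {zttid} already exists')
        _timelines[zttid] = _fresh_state(zttid, peer_ids)
        return _timelines[zttid]


def get(zttid, create_if_missing=False):
    with _lock:
        if zttid in _timelines:
            return _timelines[zttid]
        try:
            state = _restore_from_control_file(zttid)
        except FileNotFoundError:
            if not create_if_missing:
                raise
            state = _fresh_state(zttid, peer_ids=[])
        _timelines[zttid] = state
        return state
```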
pub fn get( conf: &SafeKeeperConf, zttid: ZTenantTimelineId, - create: CreateControlFile, + create: bool, ) -> Result> { let mut timelines = TIMELINES.lock().unwrap(); match timelines.get(&zttid) { Some(result) => Ok(Arc::clone(result)), None => { - if let CreateControlFile::True = create { - let dir = conf.timeline_dir(&zttid); - info!( - "creating timeline dir {}, create is {:?}", - dir.display(), - create - ); - fs::create_dir_all(dir)?; - } + let shared_state = + SharedState::restore(conf, &zttid).context("failed to restore shared state"); - let shared_state = SharedState::create_restore(conf, &zttid, create) - .context("failed to restore shared state")?; + let shared_state = match shared_state { + Ok(shared_state) => shared_state, + Err(error) => { + // TODO: always create timeline explicitly + if error + .root_cause() + .to_string() + .contains("No such file or directory") + && create + { + return GlobalTimelines::create_internal( + timelines, + conf, + zttid, + vec![], + ); + } else { + return Err(error); + } + } + }; let new_tli = Arc::new(Timeline::new(zttid, shared_state)); timelines.insert(zttid, Arc::clone(&new_tli)); diff --git a/walkeeper/src/wal_storage.rs b/walkeeper/src/wal_storage.rs index 73eccd0ae8..7cef525bee 100644 --- a/walkeeper/src/wal_storage.rs +++ b/walkeeper/src/wal_storage.rs @@ -301,7 +301,8 @@ impl Storage for PhysicalStorage { /// allows to postpone its initialization. fn init_storage(&mut self, state: &SafeKeeperState) -> Result<()> { if state.server.wal_seg_size == 0 { - // wal_seg_size is still unknown + // wal_seg_size is still unknown. This is dead path normally, should + // be used only in tests. return Ok(()); } @@ -315,9 +316,13 @@ impl Storage for PhysicalStorage { let wal_seg_size = state.server.wal_seg_size as usize; self.wal_seg_size = Some(wal_seg_size); - // we need to read WAL from disk to know which LSNs are stored on disk - self.write_lsn = - Lsn(find_end_of_wal(&self.timeline_dir, wal_seg_size, true, state.wal_start_lsn)?.0); + // Find out where stored WAL ends, starting at commit_lsn which is a + // known recent record boundary (unless we don't have WAL at all). 
+ self.write_lsn = if state.commit_lsn == Lsn(0) { + Lsn(0) + } else { + Lsn(find_end_of_wal(&self.timeline_dir, wal_seg_size, true, state.commit_lsn)?.0) + }; self.write_record_lsn = self.write_lsn; @@ -326,11 +331,13 @@ impl Storage for PhysicalStorage { self.update_flush_lsn(); info!( - "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, truncate_lsn={}", - self.zttid.timeline_id, self.flush_record_lsn, state.commit_lsn, state.truncate_lsn, + "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}", + self.zttid.timeline_id, self.flush_record_lsn, state.commit_lsn, state.peer_horizon_lsn, ); - if self.flush_record_lsn < state.commit_lsn || self.flush_record_lsn < state.truncate_lsn { - warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or truncate_lsn from control file", self.zttid.timeline_id); + if self.flush_record_lsn < state.commit_lsn + || self.flush_record_lsn < state.peer_horizon_lsn + { + warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or peer_horizon_lsn from control file", self.zttid.timeline_id); } Ok(()) diff --git a/zenith/src/main.rs b/zenith/src/main.rs index a2a762f5be..dd35427d5d 100644 --- a/zenith/src/main.rs +++ b/zenith/src/main.rs @@ -1,4 +1,4 @@ -use anyhow::{bail, Context, Result}; +use anyhow::{anyhow, bail, Context, Result}; use clap::{App, AppSettings, Arg, ArgMatches}; use control_plane::compute::ComputeControlPlane; use control_plane::local_env; @@ -9,7 +9,7 @@ use pageserver::config::defaults::{ DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR, DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR, }; -use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; use std::process::exit; use std::str::FromStr; use walkeeper::defaults::{ @@ -17,46 +17,53 @@ use walkeeper::defaults::{ DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT, }; use zenith_utils::auth::{Claims, Scope}; +use zenith_utils::lsn::Lsn; use zenith_utils::postgres_backend::AuthType; -use zenith_utils::zid::{ZTenantId, ZTimelineId}; +use zenith_utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId}; use zenith_utils::GIT_VERSION; -use pageserver::branches::BranchInfo; +use pageserver::timelines::TimelineInfo; -// Default name of a safekeeper node, if not specified on the command line. -const DEFAULT_SAFEKEEPER_NAME: &str = "single"; +// Default id of a safekeeper node, if not specified on the command line. +const DEFAULT_SAFEKEEPER_ID: ZNodeId = ZNodeId(1); +const DEFAULT_PAGESERVER_ID: ZNodeId = ZNodeId(1); +const DEFAULT_BRANCH_NAME: &str = "main"; fn default_conf() -> String { format!( r#" # Default built-in configuration, defined in main.rs [pageserver] +id = {pageserver_id} listen_pg_addr = '{pageserver_pg_addr}' listen_http_addr = '{pageserver_http_addr}' auth_type = '{pageserver_auth_type}' [[safekeepers]] -name = '{safekeeper_name}' +id = {safekeeper_id} pg_port = {safekeeper_pg_port} http_port = {safekeeper_http_port} "#, + pageserver_id = DEFAULT_PAGESERVER_ID, pageserver_pg_addr = DEFAULT_PAGESERVER_PG_ADDR, pageserver_http_addr = DEFAULT_PAGESERVER_HTTP_ADDR, pageserver_auth_type = AuthType::Trust, - safekeeper_name = DEFAULT_SAFEKEEPER_NAME, + safekeeper_id = DEFAULT_SAFEKEEPER_ID, safekeeper_pg_port = DEFAULT_SAFEKEEPER_PG_PORT, safekeeper_http_port = DEFAULT_SAFEKEEPER_HTTP_PORT, ) } /// -/// Branches tree element used as a value in the HashMap. 
+/// Timelines tree element used as a value in the HashMap. /// -struct BranchTreeEl { - /// `BranchInfo` received from the `pageserver` via the `branch_list` libpq API call. - pub info: BranchInfo, - /// Holds all direct children of this branch referenced using `timeline_id`. - pub children: Vec, +struct TimelineTreeEl { + /// `TimelineInfo` received from the `pageserver` via the `timeline_list` http API call. + pub info: TimelineInfo, + /// Name, recovered from zenith config mappings + pub name: Option, + /// Holds all direct children of this timeline referenced using `timeline_id`. + pub children: BTreeSet, } // Main entry point for the 'zenith' CLI utility @@ -67,29 +74,28 @@ struct BranchTreeEl { // * Providing CLI api to the pageserver // * TODO: export/import to/from usual postgres fn main() -> Result<()> { - #[rustfmt::skip] // rustfmt squashes these into a single line otherwise - let pg_node_arg = Arg::new("node") - .index(1) - .help("Node name") - .required(true); - - #[rustfmt::skip] - let safekeeper_node_arg = Arg::new("node") - .index(1) - .help("Node name") + let branch_name_arg = Arg::new("branch-name") + .long("branch-name") + .takes_value(true) + .help("Name of the branch to be created or used as an alias for other services") .required(false); - let timeline_arg = Arg::new("timeline") - .index(2) - .help("Branch name or a point-in time specification") - .required(false); + let pg_node_arg = Arg::new("node").help("Postgres node name").required(false); - let tenantid_arg = Arg::new("tenantid") - .long("tenantid") + let safekeeper_id_arg = Arg::new("id").help("safekeeper id").required(false); + + let tenant_id_arg = Arg::new("tenant-id") + .long("tenant-id") .help("Tenant id. Represented as a hexadecimal string 32 symbols length") .takes_value(true) .required(false); + let timeline_id_arg = Arg::new("timeline-id") + .long("timeline-id") + .help("Timeline id. Represented as a hexadecimal string 32 symbols length") + .takes_value(true) + .required(false); + let port_arg = Arg::new("port") .long("port") .required(false) @@ -111,6 +117,12 @@ fn main() -> Result<()> { .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more") .required(false); + let lsn_arg = Arg::new("lsn") + .long("lsn") + .help("Specify Lsn on the timeline to start from. 
By default, end of the timeline would be used.") + .takes_value(true) + .required(false); + let matches = App::new("Zenith CLI") .setting(AppSettings::ArgRequiredElseHelp) .version(GIT_VERSION) @@ -118,6 +130,7 @@ fn main() -> Result<()> { App::new("init") .about("Initialize a new Zenith repository") .arg(pageserver_config_args.clone()) + .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) .arg( Arg::new("config") .long("config") @@ -126,17 +139,32 @@ fn main() -> Result<()> { ) ) .subcommand( - App::new("branch") - .about("Create a new branch") - .arg(Arg::new("branchname").required(false).index(1)) - .arg(Arg::new("start-point").required(false).index(2)) - .arg(tenantid_arg.clone()), + App::new("timeline") + .about("Manage timelines") + .subcommand(App::new("list") + .about("List all timelines, available to this pageserver") + .arg(tenant_id_arg.clone())) + .subcommand(App::new("branch") + .about("Create a new timeline, using another timeline as a base, copying its data") + .arg(tenant_id_arg.clone()) + .arg(branch_name_arg.clone()) + .arg(Arg::new("ancestor-branch-name").long("ancestor-branch-name").takes_value(true) + .help("Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name.").required(false)) + .arg(Arg::new("ancestor-start-lsn").long("ancestor-start-lsn").takes_value(true) + .help("When using another timeline as base, use a specific Lsn in it instead of the latest one").required(false))) + .subcommand(App::new("create") + .about("Create a new blank timeline") + .arg(tenant_id_arg.clone()) + .arg(branch_name_arg.clone())) ).subcommand( App::new("tenant") .setting(AppSettings::ArgRequiredElseHelp) .about("Manage tenants") .subcommand(App::new("list")) - .subcommand(App::new("create").arg(Arg::new("tenantid").required(false).index(1))) + .subcommand(App::new("create") + .arg(tenant_id_arg.clone()) + .arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline")) + ) ) .subcommand( App::new("pageserver") @@ -154,16 +182,16 @@ fn main() -> Result<()> { .about("Manage safekeepers") .subcommand(App::new("start") .about("Start local safekeeper") - .arg(safekeeper_node_arg.clone()) + .arg(safekeeper_id_arg.clone()) ) .subcommand(App::new("stop") .about("Stop local safekeeper") - .arg(safekeeper_node_arg.clone()) + .arg(safekeeper_id_arg.clone()) .arg(stop_mode_arg.clone()) ) .subcommand(App::new("restart") .about("Restart local safekeeper") - .arg(safekeeper_node_arg.clone()) + .arg(safekeeper_id_arg.clone()) .arg(stop_mode_arg.clone()) ) ) @@ -171,12 +199,13 @@ fn main() -> Result<()> { App::new("pg") .setting(AppSettings::ArgRequiredElseHelp) .about("Manage postgres instances") - .subcommand(App::new("list").arg(tenantid_arg.clone())) + .subcommand(App::new("list").arg(tenant_id_arg.clone())) .subcommand(App::new("create") .about("Create a postgres compute node") .arg(pg_node_arg.clone()) - .arg(timeline_arg.clone()) - .arg(tenantid_arg.clone()) + .arg(branch_name_arg.clone()) + .arg(tenant_id_arg.clone()) + .arg(lsn_arg.clone()) .arg(port_arg.clone()) .arg( Arg::new("config-only") @@ -187,20 +216,21 @@ fn main() -> Result<()> { .subcommand(App::new("start") .about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files") .arg(pg_node_arg.clone()) - .arg(timeline_arg.clone()) - .arg(tenantid_arg.clone()) + .arg(tenant_id_arg.clone()) 
+ .arg(branch_name_arg.clone()) + .arg(timeline_id_arg.clone()) + .arg(lsn_arg.clone()) .arg(port_arg.clone())) .subcommand( App::new("stop") - .arg(pg_node_arg.clone()) - .arg(timeline_arg.clone()) - .arg(tenantid_arg.clone()) - .arg( - Arg::new("destroy") - .help("Also delete data directory (now optional, should be default in future)") - .long("destroy") - .required(false) - ) + .arg(pg_node_arg.clone()) + .arg(tenant_id_arg.clone()) + .arg( + Arg::new("destroy") + .help("Also delete data directory (now optional, should be default in future)") + .long("destroy") + .required(false) + ) ) ) @@ -222,75 +252,89 @@ fn main() -> Result<()> { }; // Check for 'zenith init' command first. - let subcmd_result = if sub_name == "init" { - handle_init(sub_args) + let subcommand_result = if sub_name == "init" { + handle_init(sub_args).map(Some) } else { // all other commands need an existing config - let env = match LocalEnv::load_config() { - Ok(conf) => conf, - Err(e) => { - eprintln!("Error loading config: {}", e); - exit(1); - } - }; + let mut env = LocalEnv::load_config().context("Error loading config")?; + let original_env = env.clone(); - match sub_name { - "tenant" => handle_tenant(sub_args, &env), - "branch" => handle_branch(sub_args, &env), + let subcommand_result = match sub_name { + "tenant" => handle_tenant(sub_args, &mut env), + "timeline" => handle_timeline(sub_args, &mut env), "start" => handle_start_all(sub_args, &env), "stop" => handle_stop_all(sub_args, &env), "pageserver" => handle_pageserver(sub_args, &env), "pg" => handle_pg(sub_args, &env), "safekeeper" => handle_safekeeper(sub_args, &env), _ => bail!("unexpected subcommand {}", sub_name), + }; + + if original_env != env { + subcommand_result.map(|()| Some(env)) + } else { + subcommand_result.map(|()| None) } }; - if let Err(e) = subcmd_result { - eprintln!("command failed: {:#}", e); - exit(1); - } + match subcommand_result { + Ok(Some(updated_env)) => updated_env.persist_config(&updated_env.base_data_dir)?, + Ok(None) => (), + Err(e) => { + eprintln!("command failed: {:?}", e); + exit(1); + } + } Ok(()) } /// -/// Prints branches list as a tree-like structure. +/// Prints timelines list as a tree-like structure. /// -fn print_branches_tree(branches: Vec) -> Result<()> { - let mut branches_hash: HashMap = HashMap::new(); +fn print_timelines_tree( + timelines: Vec, + mut timeline_name_mappings: HashMap, +) -> Result<()> { + let mut timelines_hash = timelines + .iter() + .map(|t| { + ( + t.timeline_id(), + TimelineTreeEl { + info: t.clone(), + children: BTreeSet::new(), + name: timeline_name_mappings + .remove(&ZTenantTimelineId::new(t.tenant_id(), t.timeline_id())), + }, + ) + }) + .collect::>(); - // Form a hash table of branch timeline_id -> BranchTreeEl. - for branch in &branches { - branches_hash.insert( - branch.timeline_id.to_string(), - BranchTreeEl { - info: branch.clone(), - children: Vec::new(), - }, - ); - } - - // Memorize all direct children of each branch. - for branch in &branches { - if let Some(tid) = &branch.ancestor_id { - branches_hash + // Memorize all direct children of each timeline. + for timeline in &timelines { + if let TimelineInfo::Local { + ancestor_timeline_id: Some(tid), + .. + } = timeline + { + timelines_hash .get_mut(tid) - .context("missing branch info in the HashMap")? + .context("missing timeline info in the HashMap")? .children - .push(branch.timeline_id.to_string()); + .insert(timeline.timeline_id()); } } - // Sort children by tid to bring some minimal order. 
- for branch in &mut branches_hash.values_mut() { - branch.children.sort(); - } - - for branch in branches_hash.values() { - // Start with root branches (no ancestors) first. - // Now there is 'main' branch only, but things may change. - if branch.info.ancestor_id.is_none() { - print_branch(0, &Vec::from([true]), branch, &branches_hash)?; + for timeline in timelines_hash.values() { + // Start with root local timelines (no ancestors) first. + if let TimelineInfo::Local { + ancestor_timeline_id, + .. + } = &timeline.info + { + if ancestor_timeline_id.is_none() { + print_timeline(0, &Vec::from([true]), timeline, &timelines_hash)?; + } } } @@ -298,27 +342,32 @@ fn print_branches_tree(branches: Vec) -> Result<()> { } /// -/// Recursively prints branch info with all its children. +/// Recursively prints timeline info with all its children. /// -fn print_branch( +fn print_timeline( nesting_level: usize, is_last: &[bool], - branch: &BranchTreeEl, - branches: &HashMap, + timeline: &TimelineTreeEl, + timelines: &HashMap, ) -> Result<()> { + let local_or_remote = match timeline.info { + TimelineInfo::Local { .. } => "(L)", + TimelineInfo::Remote { .. } => "(R)", + }; // Draw main padding - print!(" "); + print!("{} ", local_or_remote); if nesting_level > 0 { - let lsn = branch - .info - .ancestor_lsn - .as_ref() - .context("missing branch info in the HashMap")?; + let lsn_string = match &timeline.info { + TimelineInfo::Local { ancestor_lsn, .. } => ancestor_lsn + .map(|lsn| lsn.to_string()) + .unwrap_or_else(|| "Unknown local Lsn".to_string()), + TimelineInfo::Remote { .. } => "unknown Lsn (remote)".to_string(), + }; let mut br_sym = "┣━"; // Draw each nesting padding with proper style - // depending on whether its branch ended or not. + // depending on whether its timeline ended or not. if nesting_level > 1 { for l in &is_last[1..is_last.len() - 1] { if *l { @@ -329,73 +378,92 @@ fn print_branch( } } - // We are the last in this sub-branch + // We are the last in this sub-timeline if *is_last.last().unwrap() { br_sym = "┗━"; } - print!("{} @{}: ", br_sym, lsn); + print!("{} @{}: ", br_sym, lsn_string); } - // Finally print a branch name with new line - println!("{}", branch.info.name); + // Finally print a timeline id and name with new line + println!( + "{} [{}]", + timeline.name.as_deref().unwrap_or("_no_name_"), + timeline.info.timeline_id() + ); - let len = branch.children.len(); + let len = timeline.children.len(); let mut i: usize = 0; let mut is_last_new = Vec::from(is_last); is_last_new.push(false); - for child in &branch.children { + for child in &timeline.children { i += 1; - // Mark that the last padding is the end of the branch + // Mark that the last padding is the end of the timeline if i == len { if let Some(last) = is_last_new.last_mut() { *last = true; } } - print_branch( + print_timeline( nesting_level + 1, &is_last_new, - branches + timelines .get(child) - .context("missing branch info in the HashMap")?, - branches, + .context("missing timeline info in the HashMap")?, + timelines, )?; } Ok(()) } -/// Returns a map of timeline IDs to branch_name@lsn strings. +/// Returns a map of timeline IDs to timeline_id@lsn strings. /// Connects to the pageserver to query this information. 
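Since the reworked `print_timeline` threads an `is_last` vector through the recursion to choose between `┣━`, `┗━`, and the continuation bars, here is a compact, self-contained sketch of that drawing scheme. The node names and the `children` map are invented for illustration, and the root is printed bare rather than seeded with `[true]` as in the patch:

```rust
use std::collections::BTreeMap;

// `is_last` records, per nesting level, whether that ancestor was the last
// child of its parent; deeper levels use it to draw `┃` or blank padding.
fn print_node(name: &str, children: &BTreeMap<&str, Vec<&str>>, is_last: &mut Vec<bool>) {
    if let Some((&last, rest)) = is_last.split_last() {
        for &l in rest {
            print!("{}", if l { "   " } else { "┃  " });
        }
        print!("{}", if last { "┗━ " } else { "┣━ " });
    }
    println!("{}", name);

    let kids = children.get(name).map(|v| v.as_slice()).unwrap_or(&[]);
    for (i, &child) in kids.iter().enumerate() {
        is_last.push(i == kids.len() - 1);
        print_node(child, children, is_last);
        is_last.pop();
    }
}

fn main() {
    let mut children = BTreeMap::new();
    children.insert("main", vec!["dev", "staging"]);
    children.insert("dev", vec!["feature-x"]);
    print_node("main", &children, &mut Vec::new());
}
```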
-fn get_branch_infos( +fn get_timeline_infos( env: &local_env::LocalEnv, - tenantid: &ZTenantId, -) -> Result> { - let page_server = PageServerNode::from_env(env); - let branch_infos: Vec = page_server.branch_list(tenantid)?; - let branch_infos: HashMap = branch_infos + tenant_id: &ZTenantId, +) -> Result> { + Ok(PageServerNode::from_env(env) + .timeline_list(tenant_id)? .into_iter() - .map(|branch_info| (branch_info.timeline_id, branch_info)) - .collect(); - - Ok(branch_infos) + .map(|timeline_info| (timeline_info.timeline_id(), timeline_info)) + .collect()) } -// Helper function to parse --tenantid option, or get the default from config file -fn get_tenantid(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result { - if let Some(tenantid_cmd) = sub_match.value_of("tenantid") { - Ok(ZTenantId::from_str(tenantid_cmd)?) - } else if let Some(tenantid_conf) = env.default_tenantid { - Ok(tenantid_conf) +// Helper function to parse --tenant_id option, or get the default from config file +fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result { + if let Some(tenant_id_from_arguments) = parse_tenant_id(sub_match).transpose() { + tenant_id_from_arguments + } else if let Some(tenantid_conf) = env.default_tenant_id { + Ok(ZTenantId::from(tenantid_conf)) } else { - bail!("No tenantid. Use --tenantid, or set 'default_tenantid' in the config file"); + bail!("No tenant id. Use --tenant-id, or set 'default_tenant_id' in the config file"); } } -fn handle_init(init_match: &ArgMatches) -> Result<()> { +fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result> { + sub_match + .value_of("tenant-id") + .map(ZTenantId::from_str) + .transpose() + .context("Failed to parse tenant id from the argument string") +} + +fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result> { + sub_match + .value_of("timeline-id") + .map(ZTimelineId::from_str) + .transpose() + .context("Failed to parse timeline id from the argument string") +} + +fn handle_init(init_match: &ArgMatches) -> Result { + let initial_timeline_id_arg = parse_timeline_id(init_match)?; + // Create config file let toml_file: String = if let Some(config_path) = init_match.value_of("config") { // load and parse the file @@ -411,18 +479,29 @@ fn handle_init(init_match: &ArgMatches) -> Result<()> { env.init() .context("Failed to initialize zenith repository")?; + // default_tenantid was generated by the `env.init()` call above + let initial_tenant_id = ZTenantId::from(env.default_tenant_id.unwrap()); + // Call 'pageserver init'. 
let pageserver = PageServerNode::from_env(&env); - if let Err(e) = pageserver.init( - // default_tenantid was generated by the `env.init()` call above - Some(&env.default_tenantid.unwrap().to_string()), - &pageserver_config_overrides(init_match), - ) { - eprintln!("pageserver init failed: {}", e); - exit(1); - } + let initial_timeline_id = pageserver + .init( + Some(initial_tenant_id), + initial_timeline_id_arg, + &pageserver_config_overrides(init_match), + ) + .unwrap_or_else(|e| { + eprintln!("pageserver init failed: {}", e); + exit(1); + }); - Ok(()) + env.register_branch_mapping( + DEFAULT_BRANCH_NAME.to_owned(), + initial_tenant_id, + initial_timeline_id, + )?; + + Ok(env) } fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { @@ -433,7 +512,7 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> { .collect() } -fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { let pageserver = PageServerNode::from_env(env); match tenant_match.subcommand() { Some(("list", _)) => { @@ -442,13 +521,16 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result } } Some(("create", create_match)) => { - let tenantid = match create_match.value_of("tenantid") { - Some(tenantid) => ZTenantId::from_str(tenantid)?, - None => ZTenantId::generate(), - }; - println!("using tenant id {}", tenantid); - pageserver.tenant_create(tenantid)?; - println!("tenant successfully created on the pageserver"); + let initial_tenant_id = parse_tenant_id(create_match)?; + let new_tenant_id = pageserver + .tenant_create(initial_tenant_id)? + .ok_or_else(|| { + anyhow!("Tenant with id {:?} was already created", initial_tenant_id) + })?; + println!( + "tenant {} successfully created on the pageserver", + new_tenant_id + ); } Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), None => bail!("no tenant subcommand provided"), @@ -456,24 +538,94 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &local_env::LocalEnv) -> Result Ok(()) } -fn handle_branch(branch_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { +fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { let pageserver = PageServerNode::from_env(env); - let tenantid = get_tenantid(branch_match, env)?; + match timeline_match.subcommand() { + Some(("list", list_match)) => { + let tenant_id = get_tenant_id(list_match, env)?; + let timelines = pageserver.timeline_list(&tenant_id)?; + print_timelines_tree(timelines, env.timeline_name_mappings())?; + } + Some(("create", create_match)) => { + let tenant_id = get_tenant_id(create_match, env)?; + let new_branch_name = create_match + .value_of("branch-name") + .ok_or(anyhow!("No branch name provided"))?; + let timeline = pageserver + .timeline_create(tenant_id, None, None, None)? 
+ .ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?; + let new_timeline_id = timeline.timeline_id(); - if let Some(branchname) = branch_match.value_of("branchname") { - let startpoint_str = branch_match - .value_of("start-point") - .context("Missing start-point")?; - let branch = pageserver.branch_create(branchname, startpoint_str, &tenantid)?; - println!( - "Created branch '{}' at {:?} for tenant: {}", - branch.name, branch.latest_valid_lsn, tenantid, - ); - } else { - // No arguments, list branches for tenant - let branches = pageserver.branch_list(&tenantid)?; - print_branches_tree(branches)?; + let last_record_lsn = match timeline { + TimelineInfo::Local { + last_record_lsn, .. + } => last_record_lsn, + TimelineInfo::Remote { .. } => { + bail!( + "Timeline {} was created as remote, not local", + new_timeline_id + ) + } + }; + env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; + + println!( + "Created timeline '{}' at Lsn {} for tenant: {}", + timeline.timeline_id(), + last_record_lsn, + tenant_id, + ); + } + Some(("branch", branch_match)) => { + let tenant_id = get_tenant_id(branch_match, env)?; + let new_branch_name = branch_match + .value_of("branch-name") + .ok_or(anyhow!("No branch name provided"))?; + let ancestor_branch_name = branch_match + .value_of("ancestor-branch-name") + .unwrap_or(DEFAULT_BRANCH_NAME); + let ancestor_timeline_id = env + .get_branch_timeline_id(ancestor_branch_name, tenant_id) + .ok_or_else(|| { + anyhow!( + "Found no timeline id for branch name '{}'", + ancestor_branch_name + ) + })?; + + let start_lsn = branch_match + .value_of("ancestor-start-lsn") + .map(Lsn::from_str) + .transpose() + .context("Failed to parse ancestor start Lsn from the request")?; + let timeline = pageserver + .timeline_create(tenant_id, None, start_lsn, Some(ancestor_timeline_id))? + .ok_or_else(|| anyhow!("Failed to create new timeline for tenant {}", tenant_id))?; + let new_timeline_id = timeline.timeline_id(); + + let last_record_lsn = match timeline { + TimelineInfo::Local { + last_record_lsn, .. + } => last_record_lsn, + TimelineInfo::Remote { .. } => bail!( + "Timeline {} was created as remote, not local", + new_timeline_id + ), + }; + + env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?; + + println!( + "Created timeline '{}' at Lsn {} for tenant: {}. 
Ancestor timeline: '{}'", + timeline.timeline_id(), + last_record_lsn, + tenant_id, + ancestor_branch_name, + ); + } + Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name), + None => bail!("no tenant subcommand provided"), } Ok(()) @@ -487,63 +639,90 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { let mut cplane = ComputeControlPlane::load(env.clone())?; - // All subcommands take an optional --tenantid option - let tenantid = get_tenantid(sub_args, env)?; + // All subcommands take an optional --tenant-id option + let tenant_id = get_tenant_id(sub_args, env)?; match sub_name { "list" => { - let branch_infos = get_branch_infos(env, &tenantid).unwrap_or_else(|e| { - eprintln!("Failed to load branch info: {}", e); + let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| { + eprintln!("Failed to load timeline info: {}", e); HashMap::new() }); - println!("NODE\tADDRESS\t\tBRANCH\tLSN\t\tSTATUS"); + let timeline_name_mappings = env.timeline_name_mappings(); + + println!("NODE\tADDRESS\tTIMELINE\tBRANCH NAME\tLSN\t\tSTATUS"); for ((_, node_name), node) in cplane .nodes .iter() - .filter(|((node_tenantid, _), _)| node_tenantid == &tenantid) + .filter(|((node_tenant_id, _), _)| node_tenant_id == &tenant_id) { // FIXME: This shows the LSN at the end of the timeline. It's not the // right thing to do for read-only nodes that might be anchored at an // older point in time, or following but lagging behind the primary. - let lsn_str = branch_infos - .get(&node.timelineid) - .map(|bi| bi.latest_valid_lsn.to_string()) - .unwrap_or_else(|| "?".to_string()); + let lsn_str = timeline_infos + .get(&node.timeline_id) + .map(|bi| match bi { + TimelineInfo::Local { + last_record_lsn, .. + } => last_record_lsn.to_string(), + TimelineInfo::Remote { .. } => "? 
(remote)".to_string(), + }) + .unwrap_or_else(|| '?'.to_string()); + + let branch_name = timeline_name_mappings + .get(&ZTenantTimelineId::new(tenant_id, node.timeline_id)) + .map(|name| name.as_str()) + .unwrap_or("?"); println!( - "{}\t{}\t{}\t{}\t{}", + "{}\t{}\t{}\t{}\t{}\t{}", node_name, node.address, - node.timelineid, // FIXME: resolve human-friendly branch name + node.timeline_id, + branch_name, lsn_str, node.status(), ); } } "create" => { - let node_name = sub_args.value_of("node").unwrap_or("main"); - let timeline_name = sub_args.value_of("timeline").unwrap_or(node_name); + let branch_name = sub_args + .value_of("branch-name") + .unwrap_or(DEFAULT_BRANCH_NAME); + let node_name = sub_args + .value_of("node") + .map(ToString::to_string) + .unwrap_or_else(|| format!("{}_node", branch_name)); + + let lsn = sub_args + .value_of("lsn") + .map(Lsn::from_str) + .transpose() + .context("Failed to parse Lsn from the request")?; + let timeline_id = env + .get_branch_timeline_id(branch_name, tenant_id) + .ok_or_else(|| anyhow!("Found no timeline id for branch name '{}'", branch_name))?; let port: Option = match sub_args.value_of("port") { Some(p) => Some(p.parse()?), None => None, }; - cplane.new_node(tenantid, node_name, timeline_name, port)?; + cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port)?; } "start" => { - let node_name = sub_args.value_of("node").unwrap_or("main"); - let timeline_name = sub_args.value_of("timeline"); - let port: Option = match sub_args.value_of("port") { Some(p) => Some(p.parse()?), None => None, }; + let node_name = sub_args + .value_of("node") + .ok_or_else(|| anyhow!("No node name was provided to start"))?; - let node = cplane.nodes.get(&(tenantid, node_name.to_owned())); + let node = cplane.nodes.get(&(tenant_id, node_name.to_owned())); let auth_token = if matches!(env.pageserver.auth_type, AuthType::ZenithJWT) { - let claims = Claims::new(Some(tenantid), Scope::Tenant); + let claims = Claims::new(Some(tenant_id), Scope::Tenant); Some(env.generate_auth_token(&claims)?) 
} else { @@ -551,40 +730,49 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> { }; if let Some(node) = node { - if timeline_name.is_some() { - println!("timeline name ignored because node exists already"); - } println!("Starting existing postgres {}...", node_name); node.start(&auth_token)?; } else { + let branch_name = sub_args + .value_of("branch-name") + .unwrap_or(DEFAULT_BRANCH_NAME); + let timeline_id = env + .get_branch_timeline_id(branch_name, tenant_id) + .ok_or_else(|| { + anyhow!("Found no timeline id for branch name '{}'", branch_name) + })?; + let lsn = sub_args + .value_of("lsn") + .map(Lsn::from_str) + .transpose() + .context("Failed to parse Lsn from the request")?; // when used with custom port this results in non obvious behaviour // port is remembered from first start command, i e // start --port X // stop // start <-- will also use port X even without explicit port argument - let timeline_name = timeline_name.unwrap_or(node_name); println!( - "Starting new postgres {} on {}...", - node_name, timeline_name + "Starting new postgres {} on timeline {} ...", + node_name, timeline_id ); - let node = cplane.new_node(tenantid, node_name, timeline_name, port)?; + let node = cplane.new_node(tenant_id, node_name, timeline_id, lsn, port)?; node.start(&auth_token)?; } } "stop" => { - let node_name = sub_args.value_of("node").unwrap_or("main"); + let node_name = sub_args + .value_of("node") + .ok_or_else(|| anyhow!("No node name was provided to stop"))?; let destroy = sub_args.is_present("destroy"); let node = cplane .nodes - .get(&(tenantid, node_name.to_owned())) + .get(&(tenant_id, node_name.to_owned())) .with_context(|| format!("postgres {} is not found", node_name))?; node.stop(destroy)?; } - _ => { - bail!("Unexpected pg subcommand '{}'", sub_name) - } + _ => bail!("Unexpected pg subcommand '{}'", sub_name), } Ok(()) @@ -628,11 +816,11 @@ fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul Ok(()) } -fn get_safekeeper(env: &local_env::LocalEnv, name: &str) -> Result { - if let Some(node) = env.safekeepers.iter().find(|node| node.name == name) { +fn get_safekeeper(env: &local_env::LocalEnv, id: ZNodeId) -> Result { + if let Some(node) = env.safekeepers.iter().find(|node| node.id == id) { Ok(SafekeeperNode::from_env(env, node)) } else { - bail!("could not find safekeeper '{}'", name) + bail!("could not find safekeeper '{}'", id) } } @@ -643,8 +831,12 @@ fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Resul }; // All the commands take an optional safekeeper name argument - let node_name = sub_args.value_of("node").unwrap_or(DEFAULT_SAFEKEEPER_NAME); - let safekeeper = get_safekeeper(env, node_name)?; + let sk_id = if let Some(id_str) = sub_args.value_of("id") { + ZNodeId(id_str.parse().context("while parsing safekeeper id")?) 
+ } else { + DEFAULT_SAFEKEEPER_ID + }; + let safekeeper = get_safekeeper(env, sk_id)?; match sub_name { "start" => { @@ -697,7 +889,7 @@ fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.start() { - eprintln!("safekeeper '{}' start failed: {}", safekeeper.name, e); + eprintln!("safekeeper '{}' start failed: {}", safekeeper.id, e); exit(1); } } @@ -724,7 +916,7 @@ fn handle_stop_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result< for node in env.safekeepers.iter() { let safekeeper = SafekeeperNode::from_env(env, node); if let Err(e) = safekeeper.stop(immediate) { - eprintln!("safekeeper '{}' stop failed: {}", safekeeper.name, e); + eprintln!("safekeeper '{}' stop failed: {}", safekeeper.id, e); } } Ok(()) diff --git a/zenith_utils/Cargo.toml b/zenith_utils/Cargo.toml index b22fcbf748..daaf345f8f 100644 --- a/zenith_utils/Cargo.toml +++ b/zenith_utils/Cargo.toml @@ -37,3 +37,8 @@ bytes = "1.0.1" hex-literal = "0.3" tempfile = "3.2" webpki = "0.21" +criterion = "0.3" + +[[bench]] +name = "benchmarks" +harness = false diff --git a/zenith_utils/benches/benchmarks.rs b/zenith_utils/benches/benchmarks.rs new file mode 100644 index 0000000000..c945d5021c --- /dev/null +++ b/zenith_utils/benches/benchmarks.rs @@ -0,0 +1,22 @@ +#![allow(unused)] + +use criterion::{criterion_group, criterion_main, Criterion}; +use zenith_utils::zid; + +pub fn bench_zid_stringify(c: &mut Criterion) { + // Can only use public methods. + let ztl = zid::ZTenantTimelineId::generate(); + + c.bench_function("zid.to_string", |b| { + b.iter(|| { + // FIXME measurement overhead? + //for _ in 0..1000 { + // ztl.tenant_id.to_string(); + //} + ztl.tenant_id.to_string(); + }) + }); +} + +criterion_group!(benches, bench_zid_stringify); +criterion_main!(benches); diff --git a/zenith_utils/src/auth.rs b/zenith_utils/src/auth.rs index 274dd13bee..cbc4fcee61 100644 --- a/zenith_utils/src/auth.rs +++ b/zenith_utils/src/auth.rs @@ -5,9 +5,7 @@ // The second one is that we wanted to use ed25519 keys, but they are also not supported until next version. So we go with RSA keys for now. 
// Relevant issue: https://github.com/Keats/jsonwebtoken/issues/162 -use hex::{self, FromHex}; -use serde::de::Error; -use serde::{self, Deserializer, Serializer}; +use serde; use std::fs; use std::path::Path; @@ -17,7 +15,7 @@ use jsonwebtoken::{ }; use serde::{Deserialize, Serialize}; -use crate::zid::ZTenantId; +use crate::zid::{HexZTenantId, ZTenantId}; const JWT_ALGORITHM: Algorithm = Algorithm::RS256; @@ -28,44 +26,18 @@ pub enum Scope { PageServerApi, } -pub fn to_hex_option(value: &Option, serializer: S) -> Result -where - S: Serializer, -{ - match value { - Some(tid) => hex::serialize(tid, serializer), - None => Option::serialize(value, serializer), - } -} - -fn from_hex_option<'de, D>(deserializer: D) -> Result, D::Error> -where - D: Deserializer<'de>, -{ - let opt: Option = Option::deserialize(deserializer)?; - match opt { - Some(tid) => Ok(Some(ZTenantId::from_hex(tid).map_err(Error::custom)?)), - None => Ok(None), - } -} - #[derive(Debug, Serialize, Deserialize, Clone)] pub struct Claims { - // this custom serialize/deserialize_with is needed because Option is not transparent to serde - // so clearest option is serde(with = "hex") but it is not working, for details see https://github.com/serde-rs/serde/issues/1301 - #[serde( - default, - skip_serializing_if = "Option::is_none", - serialize_with = "to_hex_option", - deserialize_with = "from_hex_option" - )] - pub tenant_id: Option, + pub tenant_id: Option, pub scope: Scope, } impl Claims { pub fn new(tenant_id: Option, scope: Scope) -> Self { - Self { tenant_id, scope } + Self { + tenant_id: tenant_id.map(HexZTenantId::from), + scope, + } } } @@ -75,7 +47,7 @@ pub fn check_permission(claims: &Claims, tenantid: Option) -> Result< bail!("Attempt to access management api with tenant scope. Permission denied") } (Scope::Tenant, Some(tenantid)) => { - if claims.tenant_id.unwrap() != tenantid { + if ZTenantId::from(claims.tenant_id.unwrap()) != tenantid { bail!("Tenant id mismatch. Permission denied") } Ok(()) diff --git a/zenith_utils/src/zid.rs b/zenith_utils/src/zid.rs index 2e93ab596c..e047e38da7 100644 --- a/zenith_utils/src/zid.rs +++ b/zenith_utils/src/zid.rs @@ -2,13 +2,100 @@ use std::{fmt, str::FromStr}; use hex::FromHex; use rand::Rng; -use serde::{Deserialize, Serialize}; +use serde::{ + de::{self, Visitor}, + Deserialize, Serialize, +}; -// Zenith ID is a 128-bit random ID. -// Used to represent various identifiers. Provides handy utility methods and impls. +macro_rules! mutual_from { + ($id1:ident, $id2:ident) => { + impl From<$id1> for $id2 { + fn from(id1: $id1) -> Self { + Self(id1.0.into()) + } + } + + impl From<$id2> for $id1 { + fn from(id2: $id2) -> Self { + Self(id2.0.into()) + } + } + }; +} + +/// Zenith ID is a 128-bit random ID. +/// Used to represent various identifiers. Provides handy utility methods and impls. +/// +/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look +/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. +/// Use [`HexZId`] to serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`. #[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] struct ZId([u8; 16]); +/// [`ZId`] version that serializes and deserializes as a hex string. +/// Useful for various json serializations, where hex byte array from original id is not convenient. +/// +/// Plain `ZId` could be (de)serialized into hex string with `#[serde(with = "hex")]` attribute. 
+/// This however won't work on nested types like `Option` or `Vec`, see https://github.com/serde-rs/serde/issues/723 for the details. +/// Every separate type currently needs a new (de)serializing method for every type separately. +/// +/// To provide a generic way to serialize the ZId as a hex string where `#[serde(with = "hex")]` is not enough, this wrapper is created. +/// The default wrapper serialization is left unchanged due to +/// * byte array (de)serialization being faster and simpler +/// * byte deserialization being used in Safekeeper already, with those bytes coming from compute (see `ProposerGreeting` in safekeeper) +/// * current `HexZId`'s deserialization impl breaks on compute byte array deserialization, having it by default is dangerous +#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +struct HexZId([u8; 16]); + +impl Serialize for HexZId { + fn serialize(&self, ser: S) -> Result + where + S: serde::Serializer, + { + hex::encode(self.0).serialize(ser) + } +} + +impl<'de> Deserialize<'de> for HexZId { + fn deserialize(de: D) -> Result + where + D: serde::Deserializer<'de>, + { + de.deserialize_bytes(HexVisitor) + } +} + +struct HexVisitor; + +impl<'de> Visitor<'de> for HexVisitor { + type Value = HexZId; + + fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "A hexadecimal representation of a 128-bit random Zenith ID" + ) + } + + fn visit_bytes(self, hex_bytes: &[u8]) -> Result + where + E: de::Error, + { + ZId::from_hex(hex_bytes) + .map(HexZId::from) + .map_err(de::Error::custom) + } + + fn visit_str(self, hex_bytes_str: &str) -> Result + where + E: de::Error, + { + Self::visit_bytes(self, hex_bytes_str.as_bytes()) + } +} + +mutual_from!(ZId, HexZId); + impl ZId { pub fn get_from_buf(buf: &mut dyn bytes::Buf) -> ZId { let mut arr = [0u8; 16]; @@ -25,6 +112,17 @@ impl ZId { rand::thread_rng().fill(&mut tli_buf); ZId::from(tli_buf) } + + fn hex_encode(&self) -> String { + static HEX: &[u8] = b"0123456789abcdef"; + + let mut buf = vec![0u8; self.0.len() * 2]; + for (&b, chunk) in self.0.as_ref().iter().zip(buf.chunks_exact_mut(2)) { + chunk[0] = HEX[((b >> 4) & 0xf) as usize]; + chunk[1] = HEX[(b & 0xf) as usize]; + } + unsafe { String::from_utf8_unchecked(buf) } + } } impl FromStr for ZId { @@ -60,13 +158,13 @@ impl From<[u8; 16]> for ZId { impl fmt::Display for ZId { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(&hex::encode(self.0)) + f.write_str(&self.hex_encode()) } } impl fmt::Debug for ZId { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(&hex::encode(self.0)) + f.write_str(&self.hex_encode()) } } @@ -155,46 +253,80 @@ macro_rules! zid_newtype { /// is separate from PostgreSQL timelines, and doesn't have those /// limitations. A zenith timeline is identified by a 128-bit ID, which /// is usually printed out as a hex string. +/// +/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look +/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. +/// Use [`HexZTimelineId`] to serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`. #[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] pub struct ZTimelineId(ZId); -zid_newtype!(ZTimelineId); +/// A [`ZTimelineId`] version that gets (de)serialized as a hex string. +/// Use in complex types, where `#[serde(with = "hex")]` does not work. +/// See [`HexZId`] for more details. 
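The limitation described above is easy to reproduce: `#[serde(with = "hex")]` is a field attribute, so it cannot reach inside `Option<T>` or `Vec<T>`. A miniature of the wrapper approach, using a hypothetical 16-byte `HexId` (the real `HexZId` additionally accepts raw byte arrays via a visitor, which this sketch omits by deserializing from a `String` only):

```rust
use serde::{Deserialize, Serialize};

// Hypothetical miniature of the HexZId idea: a 16-byte id that serializes
// as a hex string, so it can be nested in Option<T> / Vec<T> without
// per-field serialize_with/deserialize_with plumbing.
#[derive(Debug, Clone, Copy, PartialEq)]
struct HexId([u8; 16]);

impl Serialize for HexId {
    fn serialize<S: serde::Serializer>(&self, ser: S) -> Result<S::Ok, S::Error> {
        hex::encode(self.0).serialize(ser)
    }
}

impl<'de> Deserialize<'de> for HexId {
    fn deserialize<D: serde::Deserializer<'de>>(de: D) -> Result<Self, D::Error> {
        let s = String::deserialize(de)?;
        let mut buf = [0u8; 16];
        hex::decode_to_slice(&s, &mut buf).map_err(serde::de::Error::custom)?;
        Ok(HexId(buf))
    }
}

#[derive(Debug, Serialize, Deserialize, PartialEq)]
struct Claims {
    tenant_id: Option<HexId>, // nesting just works, no custom attribute needed
}

fn main() {
    let claims = Claims { tenant_id: Some(HexId([0x11; 16])) };
    let json = serde_json::to_string(&claims).unwrap();
    assert_eq!(json, r#"{"tenant_id":"11111111111111111111111111111111"}"#);
    let back: Claims = serde_json::from_str(&json).unwrap();
    assert_eq!(claims, back);
    println!("{json}");
}
```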
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] +pub struct HexZTimelineId(HexZId); -// Zenith Tenant Id represents identifiar of a particular tenant. -// Is used for distinguishing requests and data belonging to different users. +impl std::fmt::Debug for HexZTimelineId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + ZTimelineId::from(*self).fmt(f) + } +} + +impl std::fmt::Display for HexZTimelineId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + ZTimelineId::from(*self).fmt(f) + } +} + +impl FromStr for HexZTimelineId { + type Err = ::Err; + + fn from_str(s: &str) -> Result { + Ok(HexZTimelineId::from(ZTimelineId::from_str(s)?)) + } +} + +zid_newtype!(ZTimelineId); +mutual_from!(ZTimelineId, HexZTimelineId); + +/// Zenith Tenant Id represents identifiar of a particular tenant. +/// Is used for distinguishing requests and data belonging to different users. +/// +/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look +/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`. +/// Use [`HexZTenantId`] to serialize it as hex string instead: `ad50847381e248feaac9876cc71ae418`. #[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)] pub struct ZTenantId(ZId); -zid_newtype!(ZTenantId); +/// A [`ZTenantId`] version that gets (de)serialized as a hex string. +/// Use in complex types, where `#[serde(with = "hex")]` does not work. +/// See [`HexZId`] for more details. +#[derive(Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd, Serialize, Deserialize)] +pub struct HexZTenantId(HexZId); -/// Serde routines for Option (de)serialization, using `T:Display` representations for inner values. -/// Useful for Option and Option to get their hex representations into serialized string and deserialize them back. -pub mod opt_display_serde { - use serde::{de, Deserialize, Deserializer, Serialize, Serializer}; - use std::{fmt::Display, str::FromStr}; - - pub fn serialize(id: &Option, ser: S) -> Result - where - S: Serializer, - Id: Display, - { - id.as_ref().map(ToString::to_string).serialize(ser) - } - - pub fn deserialize<'de, D, Id>(des: D) -> Result, D::Error> - where - D: Deserializer<'de>, - Id: FromStr, - ::Err: Display, - { - Ok(if let Some(s) = Option::::deserialize(des)? { - Some(Id::from_str(&s).map_err(de::Error::custom)?) - } else { - None - }) +impl std::fmt::Debug for HexZTenantId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + ZTenantId::from(*self).fmt(f) } } +impl std::fmt::Display for HexZTenantId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + ZTenantId::from(*self).fmt(f) + } +} + +impl FromStr for HexZTenantId { + type Err = ::Err; + + fn from_str(s: &str) -> Result { + Ok(HexZTenantId::from(ZTenantId::from_str(s)?)) + } +} + +zid_newtype!(ZTenantId); +mutual_from!(ZTenantId, HexZTenantId); + // A pair uniquely identifying Zenith instance. #[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash)] pub struct ZTenantTimelineId { @@ -213,6 +345,10 @@ impl ZTenantTimelineId { pub fn generate() -> Self { Self::new(ZTenantId::generate(), ZTimelineId::generate()) } + + pub fn empty() -> Self { + Self::new(ZTenantId::from([0u8; 16]), ZTimelineId::from([0u8; 16])) + } } impl fmt::Display for ZTenantTimelineId { @@ -221,6 +357,18 @@ impl fmt::Display for ZTenantTimelineId { } } +// Unique ID of a storage node (safekeeper or pageserver). Supposed to be issued +// by the console. 
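`#[serde(transparent)]` on the new node id means it crosses the wire as a bare integer, which is what lets the built-in config earlier in this patch write `id = {pageserver_id}` without quotes. A small sketch with a stand-in name:

```rust
use serde::{Deserialize, Serialize};

// With #[serde(transparent)], the newtype serializes as its inner u64,
// so `1` in JSON (or `id = 1` in TOML) round-trips without a wrapper object.
#[derive(Debug, PartialEq, Serialize, Deserialize)]
#[serde(transparent)]
struct NodeId(u64);

fn main() {
    let id = NodeId(1);
    assert_eq!(serde_json::to_string(&id).unwrap(), "1");
    let back: NodeId = serde_json::from_str("1").unwrap();
    assert_eq!(back, id);
}
```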
+#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Debug, Serialize, Deserialize)] +#[serde(transparent)] +pub struct ZNodeId(pub u64); + +impl fmt::Display for ZNodeId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } +} + #[cfg(test)] mod tests { use std::fmt::Display; @@ -231,16 +379,15 @@ mod tests { #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)] struct TestStruct + Display> { - #[serde(with = "opt_display_serde")] field: Option, } #[test] fn test_hex_serializations_tenant_id() { let original_struct = TestStruct { - field: Some(ZTenantId::from_array(hex!( + field: Some(HexZTenantId::from(ZTenantId::from_array(hex!( "11223344556677881122334455667788" - ))), + )))), }; let serialized_string = serde_json::to_string(&original_struct).unwrap(); @@ -249,7 +396,7 @@ mod tests { r#"{"field":"11223344556677881122334455667788"}"# ); - let deserialized_struct: TestStruct = + let deserialized_struct: TestStruct = serde_json::from_str(&serialized_string).unwrap(); assert_eq!(original_struct, deserialized_struct); } @@ -257,9 +404,9 @@ mod tests { #[test] fn test_hex_serializations_timeline_id() { let original_struct = TestStruct { - field: Some(ZTimelineId::from_array(hex!( + field: Some(HexZTimelineId::from(ZTimelineId::from_array(hex!( "AA223344556677881122334455667788" - ))), + )))), }; let serialized_string = serde_json::to_string(&original_struct).unwrap(); @@ -268,7 +415,7 @@ mod tests { r#"{"field":"aa223344556677881122334455667788"}"# ); - let deserialized_struct: TestStruct = + let deserialized_struct: TestStruct = serde_json::from_str(&serialized_string).unwrap(); assert_eq!(original_struct, deserialized_struct); } From 1fddb0556f9e3dea86857f62fdc42b2d0db3d6d0 Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Thu, 17 Mar 2022 00:01:17 +0300 Subject: [PATCH 44/55] deploy playbook fix - interaction with console (#1374) --- .circleci/ansible/deploy.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/ansible/deploy.yaml b/.circleci/ansible/deploy.yaml index c95524a8a5..2dd109f99a 100644 --- a/.circleci/ansible/deploy.yaml +++ b/.circleci/ansible/deploy.yaml @@ -119,7 +119,7 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID + curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID tags: - pageserver @@ -169,6 +169,6 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID + curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID tags: - safekeeper From b0b2093d00141696f0abb0560dea638e625ab284 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 17 Mar 2022 13:14:33 +0200 Subject: [PATCH 45/55] Improve comments and tidy up the code in pgdatadir_mapping.rs. 
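This patch documents `begin_record()` as a transaction-like interface: modifications are buffered inside the writer and become visible only when `finish()` commits them at a single LSN, with the last update to a key winning. A minimal sketch of that buffering pattern follows, with simplified stand-ins for `Key`, `Value`, and `Lsn` rather than the actual pageserver types:

```rust
use std::collections::HashMap;

// Simplified stand-ins; the real types are richer.
type Key = u64;
type Value = Vec<u8>;
type Lsn = u64;

#[derive(Default)]
struct Timeline {
    data: HashMap<Key, (Lsn, Value)>,
    last_record_lsn: Lsn,
}

struct Writer<'a> {
    tline: &'a mut Timeline,
    lsn: Lsn,
    pending_updates: HashMap<Key, Value>,
}

impl Timeline {
    // All modifications for one WAL record go through one writer.
    fn begin_record(&mut self, lsn: Lsn) -> Writer<'_> {
        Writer { tline: self, lsn, pending_updates: HashMap::new() }
    }
}

impl<'a> Writer<'a> {
    fn put(&mut self, key: Key, value: Value) {
        // Last update to the same key within one record wins.
        self.pending_updates.insert(key, value);
    }
    fn finish(self) {
        // Apply the buffered updates atomically at this writer's LSN.
        for (key, value) in self.pending_updates {
            self.tline.data.insert(key, (self.lsn, value));
        }
        self.tline.last_record_lsn = self.lsn;
    }
}

fn main() {
    let mut tline = Timeline::default();
    let mut writer = tline.begin_record(0x16);
    writer.put(1, b"page image".to_vec());
    writer.put(1, b"newer image".to_vec()); // overwrites the first put
    writer.finish();
    assert_eq!(tline.data[&1].1, b"newer image".to_vec());
    assert_eq!(tline.last_record_lsn, 0x16);
}
```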
--- pageserver/src/basebackup.rs | 4 +- pageserver/src/pgdatadir_mapping.rs | 680 +++++++++++++++------------- pageserver/src/walreceiver.rs | 4 +- 3 files changed, 360 insertions(+), 328 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 7882e7b2b2..0c4ca83272 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -71,7 +71,7 @@ impl<'a> Basebackup<'a> { // provide prev_lsn. (get_last_record_rlsn() might return it as // zero, though, if no WAL has been generated on this timeline // yet.) - let end_of_timeline = timeline.get_last_record_rlsn(); + let end_of_timeline = timeline.tline.get_last_record_rlsn(); if req_lsn == end_of_timeline.last { (end_of_timeline.prev, req_lsn) } else { @@ -79,7 +79,7 @@ impl<'a> Basebackup<'a> { } } else { // Backup was requested at end of the timeline. - let end_of_timeline = timeline.get_last_record_rlsn(); + let end_of_timeline = timeline.tline.get_last_record_rlsn(); (end_of_timeline.prev, end_of_timeline.last) }; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index c4661ad2d6..e4c8e8884c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -6,7 +6,6 @@ //! walingest.rs handles a few things like implicit relation creation and extension. //! Clarify that) //! - use crate::keyspace::{KeySpace, KeySpaceAccum, TARGET_FILE_SIZE_BYTES}; use crate::relish::*; use crate::repository::*; @@ -20,58 +19,34 @@ use std::collections::{HashMap, HashSet}; use std::ops::Range; use std::sync::atomic::{AtomicIsize, Ordering}; use std::sync::{Arc, RwLockReadGuard}; -use tracing::{debug, error, info, warn}; +use tracing::{debug, error, trace, warn}; use zenith_utils::bin_ser::BeSer; use zenith_utils::lsn::AtomicLsn; -use zenith_utils::lsn::{Lsn, RecordLsn}; +use zenith_utils::lsn::Lsn; -/// Block number within a relation or SRU. This matches PostgreSQL's BlockNumber type. +/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type. pub type BlockNumber = u32; pub struct DatadirTimeline where R: Repository, { + /// The underlying key-value store. Callers should not read or modify the + /// data in the underlying store directly. However, it is exposed to have + /// access to information like last-LSN, ancestor, and operations like + /// compaction. pub tline: Arc, - pub last_partitioning: AtomicLsn, - pub current_logical_size: AtomicIsize, - pub repartition_threshold: u64, + /// When did we last calculate the partitioning? + last_partitioning: AtomicLsn, + + /// Configuration: how often should the partitioning be recalculated. + repartition_threshold: u64, + + /// Current logical size of the "datadir", at the last LSN. + current_logical_size: AtomicIsize, } -#[derive(Debug, Serialize, Deserialize)] -pub struct DbDirectory { - // (spcnode, dbnode) -> (do relmapper and PG_VERSION files exist) - dbdirs: HashMap<(Oid, Oid), bool>, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct TwoPhaseDirectory { - xids: HashSet, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct RelDirectory { - // Set of relations that exist. 
(relfilenode, forknum) - // - // TODO: Store it as a btree or radix tree or something else that spans multiple - // key-value pairs, if you have a lot of relations - rels: HashSet<(Oid, u8)>, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct RelSizeEntry { - nblocks: u32, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct SlruSegmentDirectory { - // Set of SLRU segments that exist. - segments: HashSet, -} - -static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]); - impl DatadirTimeline { pub fn new(tline: Arc, repartition_threshold: u64) -> Self { DatadirTimeline { @@ -82,6 +57,9 @@ impl DatadirTimeline { } } + /// (Re-)calculate the logical size of the database at the latest LSN. + /// + /// This can be a slow operation. pub fn init_logical_size(&self) -> Result<()> { let last_lsn = self.tline.get_last_record_lsn(); self.current_logical_size.store( @@ -91,6 +69,31 @@ impl DatadirTimeline { Ok(()) } + /// Start updating a WAL record + /// + /// This provides a transaction-like interface to perform a bunch + /// of modifications atomically, with one LSN. + /// + /// To ingest a WAL record, call begin_record(lsn) to get a writer + /// object. Use the functions in the writer-object to modify the + /// repository state, updating all the pages and metadata that the + /// WAL record affects. When you're done, call writer.finish() to + /// commit the changes. + /// + /// Note that any pending modifications you make through the writer + /// won't be visible to calls to the get functions until you finish! + /// If you update the same page twice, the last update wins. + /// + pub fn begin_record(&self, lsn: Lsn) -> DatadirTimelineWriter { + DatadirTimelineWriter { + tline: self, + lsn, + pending_updates: HashMap::new(), + pending_deletions: Vec::new(), + pending_nblocks: 0, + } + } + //------------------------------------------------------------------------------ // Public GET functions //------------------------------------------------------------------------------ @@ -110,18 +113,6 @@ impl DatadirTimeline { self.tline.get(key, lsn) } - /// Look up given page version. - pub fn get_slru_page_at_lsn( - &self, - kind: SlruKind, - segno: u32, - blknum: BlockNumber, - lsn: Lsn, - ) -> Result { - let key = slru_block_to_key(kind, segno, blknum); - self.tline.get(key, lsn) - } - /// Get size of a relation file pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result { if (tag.forknum == pg_constants::FSM_FORKNUM @@ -140,6 +131,48 @@ impl DatadirTimeline { Ok(buf.get_u32_le()) } + /// Does relation exist? + pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result { + // fetch directory listing + let key = rel_dir_to_key(tag.spcnode, tag.dbnode); + let buf = self.tline.get(key, lsn)?; + let dir = RelDirectory::des(&buf)?; + + let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); + + Ok(exists) + } + + /// Get a list of all existing relations in given tablespace and database. + pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result> { + // fetch directory listing + let key = rel_dir_to_key(spcnode, dbnode); + let buf = self.tline.get(key, lsn)?; + let dir = RelDirectory::des(&buf)?; + + let rels: HashSet = + HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { + spcnode, + dbnode, + relnode: *relnode, + forknum: *forknum, + })); + + Ok(rels) + } + + /// Look up given SLRU page version. 
+ pub fn get_slru_page_at_lsn( + &self, + kind: SlruKind, + segno: u32, + blknum: BlockNumber, + lsn: Lsn, + ) -> Result { + let key = slru_block_to_key(kind, segno, blknum); + self.tline.get(key, lsn) + } + /// Get size of an SLRU segment pub fn get_slru_segment_size( &self, @@ -163,38 +196,6 @@ impl DatadirTimeline { Ok(exists) } - /// Does relation exist? - pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result { - // fetch directory listing - let key = rel_dir_to_key(tag.spcnode, tag.dbnode); - let buf = self.tline.get(key, lsn)?; - let dir = RelDirectory::des(&buf)?; - - let exists = dir.rels.get(&(tag.relnode, tag.forknum)).is_some(); - - info!("EXISTS: {} : {:?}", tag, exists); - - Ok(exists) - } - - /// Get a list of all existing relations in given tablespace and database. - pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result> { - // fetch directory listing - let key = rel_dir_to_key(spcnode, dbnode); - let buf = self.tline.get(key, lsn)?; - let dir = RelDirectory::des(&buf)?; - - let rels: HashSet = - HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag { - spcnode, - dbnode, - relnode: *relnode, - forknum: *forknum, - })); - - Ok(rels) - } - /// Get a list of SLRU segments pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result> { // fetch directory entry @@ -243,54 +244,18 @@ impl DatadirTimeline { self.tline.get(CHECKPOINT_KEY, lsn) } - //------------------------------------------------------------------------------ - // Public PUT functions, to update the repository with new page versions. - // - // These are called by the WAL receiver to digest WAL records. - //------------------------------------------------------------------------------ - - /// Atomically get both last and prev. - pub fn get_last_record_rlsn(&self) -> RecordLsn { - self.tline.get_last_record_rlsn() - } - - /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev. + /// Get the LSN of the last ingested WAL record. + /// + /// This is just a convenience wrapper that calls through to the underlying + /// repository. pub fn get_last_record_lsn(&self) -> Lsn { self.tline.get_last_record_lsn() } - pub fn get_prev_record_lsn(&self) -> Lsn { - self.tline.get_prev_record_lsn() - } - - pub fn get_disk_consistent_lsn(&self) -> Lsn { - self.tline.get_disk_consistent_lsn() - } - - /// This provides a "transaction-like" interface to updating the data - /// - /// To ingest a WAL record, call begin_record(lsn) to get a writer - /// object. Use the functions in the writer-object to modify the - /// repository state, updating all the pages and metadata that the - /// WAL record affects. When you're done, call writer.finish() to - /// commit the changes. - /// - /// Note that any pending modifications you make through the writer - /// won't be visible to calls to the get functions until you finish! - /// If you update the same page twice, the last update wins. - /// - pub fn begin_record(&self, lsn: Lsn) -> DatadirTimelineWriter { - DatadirTimelineWriter { - tline: self, - lsn, - pending_updates: HashMap::new(), - pending_deletions: Vec::new(), - pending_nblocks: 0, - } - } - - /// /// Check that it is valid to request operations with that lsn. + /// + /// This is just a convenience wrapper that calls through to the underlying + /// repository. 
pub fn check_lsn_is_in_scope( &self, lsn: Lsn, @@ -339,11 +304,15 @@ impl DatadirTimeline { Ok(total_size * pg_constants::BLCKSZ as usize) } + /// + /// Get a KeySpace that covers all the Keys that are in use at the given LSN. + /// Anything that's not listed maybe removed from the underlying storage (from + /// that LSN forwards). fn collect_keyspace(&self, lsn: Lsn) -> Result { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); - // Add dbdir + // The dbdir metadata always exists result.add_key(DBDIR_KEY); // Fetch list of database dirs and iterate them @@ -413,10 +382,18 @@ impl DatadirTimeline { } } +/// DatadirTimelineWriter represents an operation to ingest an atomic set of +/// updates to the repository. It is created by the 'begin_record' +/// function. It is called for each WAL record, so that all the modifications +/// by a one WAL record appear atomic pub struct DatadirTimelineWriter<'a, R: Repository> { tline: &'a DatadirTimeline, lsn: Lsn, + + // The modifications are not applied directly to the underyling key-value store. + // The put-functions add the modifications here, and they are flushed to the + // underlying key-value store by the 'finish' function. pending_updates: HashMap, pending_deletions: Vec>, pending_nblocks: isize, @@ -435,6 +412,10 @@ impl<'a, R: Repository> std::ops::Deref for DatadirTimelineWriter<'a, R> { /// Various functions to mutate the repository state. impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { + /// Initialize a completely new repository. + /// + /// This inserts the directory metadata entries that are assumed to + /// always exist. pub fn init_empty(&mut self) -> Result<()> { let buf = DbDirectory::ser(&DbDirectory { dbdirs: HashMap::new(), @@ -446,19 +427,14 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { })?; self.put(TWOPHASEDIR_KEY, Value::Image(buf.into())); - let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory { - segments: HashSet::new(), - })? - .into(); - self.put(slru_dir_to_key(SlruKind::Clog), Value::Image(buf.clone())); + let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into(); + let empty_dir = Value::Image(buf); + self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone()); self.put( slru_dir_to_key(SlruKind::MultiXactMembers), - Value::Image(buf.clone()), - ); - self.put( - slru_dir_to_key(SlruKind::MultiXactOffsets), - Value::Image(buf), + empty_dir.clone(), ); + self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir); Ok(()) } @@ -478,6 +454,7 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { Ok(()) } + // Same, but for an SLRU. pub fn put_slru_wal_record( &mut self, kind: SlruKind, @@ -589,42 +566,40 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { Ok(()) } - // When a new relish is created: + // When a new relation is created: // - create/update the directory entry to remember that it exists // - create relish header to indicate the size (0) - // When a relish is extended: + // When a relation is extended: // - update relish header with new size // - insert the block - // when a relish is truncated: + // when a relation is truncated: // - delete truncated blocks // - update relish header with size + /// Create a relation fork. + /// + /// 'nblocks' is the initial size. pub fn put_rel_creation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { - // It's possible that this is the first rel for this db in this tablespace. - // Create the reldir entry for it if so. 
- let buf = self.get(DBDIR_KEY)?; - let mut dbdir = DbDirectory::des(&buf)?; - + // It's possible that this is the first rel for this db in this + // tablespace. Create the reldir entry for it if so. + let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY)?)?; let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let mut rel_dir; - if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { - // update dbdir + let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { + // Didn't exist. Update dbdir dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false); let buf = DbDirectory::ser(&dbdir)?; self.put(DBDIR_KEY, Value::Image(buf.into())); - // Create RelDirectory - rel_dir = RelDirectory { - rels: HashSet::new(), - }; + // and create the RelDirectory + RelDirectory::default() } else { - let buf = self.get(rel_dir_key)?; - rel_dir = RelDirectory::des(&buf)?; - } + // reldir already exists, fetch it + RelDirectory::des(&self.get(rel_dir_key)?)? + }; - // Add it to the directory entry + // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { bail!("rel {} already exists", rel); } @@ -648,15 +623,54 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { /// Truncate relation pub fn put_rel_truncation(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { - // Put size let size_key = rel_size_to_key(rel); + // Fetch the old size first + let old_size = self.get(size_key)?.get_u32_le(); + + // Update the entry with the new size. + let buf = nblocks.to_le_bytes(); + self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); + + // Update logical database size. + self.pending_nblocks -= old_size as isize - nblocks as isize; + Ok(()) + } + + /// Extend relation + pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> { + // Put size + let size_key = rel_size_to_key(rel); let old_size = self.get(size_key)?.get_u32_le(); let buf = nblocks.to_le_bytes(); self.put(size_key, Value::Image(Bytes::from(buf.to_vec()))); - self.pending_nblocks -= old_size as isize - nblocks as isize; + self.pending_nblocks += nblocks as isize - old_size as isize; + Ok(()) + } + + /// Drop a relation. 
+    pub fn put_rel_drop(&mut self, rel: RelTag) -> Result<()> {
+        // Remove it from the directory entry
+        let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
+        let buf = self.get(dir_key)?;
+        let mut dir = RelDirectory::des(&buf)?;
+
+        if dir.rels.remove(&(rel.relnode, rel.forknum)) {
+            self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
+        } else {
+            warn!("dropped rel {} did not exist in rel directory", rel);
+        }
+
+        // update logical size
+        let size_key = rel_size_to_key(rel);
+        let old_size = self.get(size_key)?.get_u32_le();
+        self.pending_nblocks -= old_size as isize;
+
+        // Delete size entry, as well as all blocks
+        self.delete(rel_key_range(rel));
+
         Ok(())
     }
 
@@ -703,50 +717,6 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> {
         Ok(())
     }
 
-    /// Extend relation
-    pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
-        // Put size
-        let size_key = rel_size_to_key(rel);
-
-        let old_size = self.get(size_key)?.get_u32_le();
-
-        let buf = nblocks.to_le_bytes();
-        self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
-
-        self.pending_nblocks += nblocks as isize - old_size as isize;
-        Ok(())
-    }
-
-    /// This method is used for marking dropped relations and truncated SLRU files and aborted two phase records
-    pub fn put_rel_drop(&mut self, rel: RelTag) -> Result<()> {
-        // Remove it from the directory entry
-        let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
-        let buf = self.get(dir_key)?;
-        let mut dir = RelDirectory::des(&buf)?;
-
-        if dir.rels.remove(&(rel.relnode, rel.forknum)) {
-            self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
-        } else {
-            warn!("dropped rel {} did not exist in rel directory", rel);
-        }
-
-        // update logical size
-        let size_key = rel_size_to_key(rel);
-        let old_size = self.get(size_key)?.get_u32_le();
-        self.pending_nblocks -= old_size as isize;
-
-        // Delete size entry, as well as all blocks
-        self.delete(rel_key_range(rel));
-
-        Ok(())
-    }
-
-    /// This method is used for marking dropped relations and truncated SLRU files and aborted two phase records
-    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<()> {
-        // TODO
-        Ok(())
-    }
-
     /// This method is used for marking truncated SLRU files
     pub fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> Result<()> {
         // Remove it from the directory entry
@@ -768,6 +738,12 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> {
         Ok(())
     }
 
+    /// Drop a relmapper file (pg_filenode.map)
+    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<()> {
+        // TODO
+        Ok(())
+    }
+
     /// Drop a twophase file
     pub fn drop_twophase_file(&mut self, xid: TransactionId) -> Result<()> {
         // Remove it from the directory entry
@@ -804,7 +780,7 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> {
         writer.finish_write(self.lsn);
 
         if last_partitioning == Lsn(0)
-            || self.lsn.0 - last_partitioning.0 > TARGET_FILE_SIZE_BYTES / 8
+            || self.lsn.0 - last_partitioning.0 > self.tline.repartition_threshold
         {
             let keyspace = self.tline.collect_keyspace(self.lsn)?;
             let partitioning = keyspace.partition(TARGET_FILE_SIZE_BYTES);
@@ -825,9 +801,11 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> {
 
     // Internal helper functions to batch the modifications
 
     fn get(&self, key: Key) -> Result<Bytes> {
-        // Note: we don't check pending_deletions.
It is an error to request a value - // that has been removed, deletion only avoids leaking storage. - + // Have we already updated the same key? Read the pending updated + // version in that case. + // + // Note: we don't check pending_deletions. It is an error to request a + // value that has been removed, deletion only avoids leaking storage. if let Some(value) = self.pending_updates.get(&key) { if let Value::Image(img) = value { Ok(img.clone()) @@ -850,42 +828,86 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> { } fn delete(&mut self, key_range: Range) { - info!("DELETE {}-{}", key_range.start, key_range.end); + trace!("DELETE {}-{}", key_range.start, key_range.end); self.pending_deletions.push(key_range); } } -// Utilities to pack stuff in Key +//--- Metadata structs stored in key-value pairs in the repository. +#[derive(Debug, Serialize, Deserialize)] +struct DbDirectory { + // (spcnode, dbnode) -> (do relmapper and PG_VERSION files exist) + dbdirs: HashMap<(Oid, Oid), bool>, +} + +#[derive(Debug, Serialize, Deserialize)] +struct TwoPhaseDirectory { + xids: HashSet, +} + +#[derive(Debug, Serialize, Deserialize, Default)] +struct RelDirectory { + // Set of relations that exist. (relfilenode, forknum) + // + // TODO: Store it as a btree or radix tree or something else that spans multiple + // key-value pairs, if you have a lot of relations + rels: HashSet<(Oid, u8)>, +} + +#[derive(Debug, Serialize, Deserialize)] +struct RelSizeEntry { + nblocks: u32, +} + +#[derive(Debug, Serialize, Deserialize, Default)] +struct SlruSegmentDirectory { + // Set of SLRU segments that exist. + segments: HashSet, +} + +static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; pg_constants::BLCKSZ as usize]); + +// Layout of the Key address space // -// Key space: +// The Key struct, used to address the underlying key-value store, consists of +// 18 bytes, split into six fields. See 'Key' in repository.rs. We need to map +// all the data and metadata keys into those 18 bytes. // -// blocky stuff: relations and SLRUs +// Principles for the mapping: // -// DbDir () -> (dbnode, spcnode) +// - Things that are often accessed or modified together, should be close to +// each other in the key space. For example, if a relation is extended by one +// block, we create a new key-value pair for the block data, and update the +// relation size entry. Because of that, the RelSize key comes after all the +// RelBlocks of a relation: the RelSize and the last RelBlock are always next +// to each other. 
 //
+// The key space is divided into four major sections, identified by the first
+// byte, and they form a hierarchy:
+//
+// 00 Relation data and metadata
+//
+//   DbDir () -> (dbnode, spcnode)
 //   Filenodemap
-//
 //   RelDir -> relnode forknum
-//
 //   RelBlocks
-//
 //   RelSize
 //
-// Slrus
-//
-//   SlruDir  kind
+// 01 SLRUs
 //
+//   SlruDir  kind
 //   SlruSegBlocks segno
-//
 //   SlruSegSize
 //
-// pg_twophase
+// 02 pg_twophase
 //
-// controlfile
-// checkpoint
+// 03 misc
+//    controlfile
+//    checkpoint
+//
+// Below is a full list of the keyspace allocation:
 //
-
 // DbDir:
 // 00 00000000 00000000 00000000 00 00000000
 //
@@ -922,6 +944,8 @@ impl<'a, R: Repository> DatadirTimelineWriter<'a, R> {
 // Checkpoint:
 // 03 00000000 00000000 00000000 00 00000001
 
+//-- Section 01: relation data and metadata
+
 const DBDIR_KEY: Key = Key {
     field1: 0x00,
     field2: 0,
     field3: 0,
     field4: 0,
     field5: 0,
     field6: 0,
 };
 
-const TWOPHASEDIR_KEY: Key = Key {
-    field1: 0x02,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 0,
-};
-
-const CONTROLFILE_KEY: Key = Key {
-    field1: 0x03,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 0,
-};
-
-const CHECKPOINT_KEY: Key = Key {
-    field1: 0x03,
-    field2: 0,
-    field3: 0,
-    field4: 0,
-    field5: 0,
-    field6: 1,
-};
-
-pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
     Key {
         field1: 0x00,
-        field2: rel.spcnode,
-        field3: rel.dbnode,
-        field4: rel.relnode,
-        field5: rel.forknum,
-        field6: blknum,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0,
+        field5: 0,
+        field6: 0,
+    }..Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0xffffffff,
+        field5: 0xff,
+        field6: 0xffffffff,
     }
 }
 
-pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
+fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
+    Key {
+        field1: 0x00,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0,
+        field5: 0,
+        field6: 0,
+    }
+}
+
+fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
     Key {
         field1: 0x00,
         field2: spcnode,
@@ -980,7 +995,18 @@ pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
     }
 }
 
-pub fn rel_size_to_key(rel: RelTag) -> Key {
+fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+    Key {
+        field1: 0x00,
+        field2: rel.spcnode,
+        field3: rel.dbnode,
+        field4: rel.relnode,
+        field5: rel.forknum,
+        field6: blknum,
+    }
+}
+
+fn rel_size_to_key(rel: RelTag) -> Key {
     Key {
         field1: 0x00,
         field2: rel.spcnode,
@@ -991,37 +1017,7 @@ pub fn rel_size_to_key(rel: RelTag) -> Key {
     }
 }
 
-pub fn slru_dir_to_key(kind: SlruKind) -> Key {
-    Key {
-        field1: 0x01,
-        field2: match kind {
-            SlruKind::Clog => 0x00,
-            SlruKind::MultiXactMembers => 0x01,
-            SlruKind::MultiXactOffsets => 0x02,
-        },
-        field3: 0,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }
-}
-
-pub fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
-    Key {
-        field1: 0x01,
-        field2: match kind {
-            SlruKind::Clog => 0x00,
-            SlruKind::MultiXactMembers => 0x01,
-            SlruKind::MultiXactOffsets => 0x02,
-        },
-        field3: 1,
-        field4: segno,
-        field5: 0,
-        field6: blknum,
-    }
-}
-
-pub fn rel_key_range(rel: RelTag) -> Range<Key> {
+fn rel_key_range(rel: RelTag) -> Range<Key> {
     Key {
         field1: 0x00,
         field2: rel.spcnode,
@@ -1039,7 +1035,39 @@ pub fn rel_key_range(rel: RelTag) -> Range<Key> {
     }
 }
 
-pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
+//-- Section 02: SLRUs
+
+fn slru_dir_to_key(kind: SlruKind) -> Key {
+    Key {
+        field1: 0x01,
+        field2: match kind {
+            SlruKind::Clog => 0x00,
+            SlruKind::MultiXactMembers => 0x01,
+            SlruKind::MultiXactOffsets => 0x02,
+        },
+        field3: 0,
+        field4: 0,
+        field5: 0,
+        field6: 0,
+    }
+}
+
+fn slru_block_to_key(kind: SlruKind, segno: u32, blknum: BlockNumber) -> Key {
+    Key {
+        field1: 0x01,
+        field2: match kind {
+            SlruKind::Clog => 0x00,
+            SlruKind::MultiXactMembers => 0x01,
+            SlruKind::MultiXactOffsets => 0x02,
+        },
+        field3: 1,
+        field4: segno,
+        field5: 0,
+        field6: blknum,
+    }
+}
+
+fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
     Key {
         field1: 0x01,
         field2: match kind {
@@ -1054,7 +1082,7 @@ pub fn slru_segment_size_to_key(kind: SlruKind, segno: u32) -> Key {
     }
 }
 
-pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
+fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
     let field2 = match kind {
         SlruKind::Clog => 0x00,
         SlruKind::MultiXactMembers => 0x01,
@@ -1078,18 +1106,18 @@ pub fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range<Key> {
     }
 }
 
-pub fn relmap_file_key(spcnode: Oid, dbnode: Oid) -> Key {
-    Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }
-}
+//-- Section 03: pg_twophase
 
-pub fn twophase_file_key(xid: TransactionId) -> Key {
+const TWOPHASEDIR_KEY: Key = Key {
+    field1: 0x02,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 0,
+};
+
+fn twophase_file_key(xid: TransactionId) -> Key {
     Key {
         field1: 0x02,
         field2: 0,
@@ -1100,7 +1128,7 @@ pub fn twophase_file_key(xid: TransactionId) -> Key {
     }
 }
 
-pub fn twophase_key_range(xid: TransactionId) -> Range<Key> {
+fn twophase_key_range(xid: TransactionId) -> Range<Key> {
     let (next_xid, overflowed) = xid.overflowing_add(1);
 
     Key {
@@ -1120,6 +1148,28 @@ pub fn twophase_key_range(xid: TransactionId) -> Range<Key> {
     }
 }
 
+//-- Section 04: Control file and checkpoint
+const CONTROLFILE_KEY: Key = Key {
+    field1: 0x03,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 0,
+};
+
+const CHECKPOINT_KEY: Key = Key {
+    field1: 0x03,
+    field2: 0,
+    field3: 0,
+    field4: 0,
+    field5: 0,
+    field6: 1,
+};
+
+// Reverse mappings for a few Keys.
+// These are needed by WAL redo manager.
+
 pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> {
     Ok(match key.field1 {
         0x00 => (
@@ -1153,27 +1203,9 @@ pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
     })
 }
 
-pub fn dbdir_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
-    Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0,
-        field5: 0,
-        field6: 0,
-    }..Key {
-        field1: 0x00,
-        field2: spcnode,
-        field3: dbnode,
-        field4: 0xffffffff,
-        field5: 0xff,
-        field6: 0xffffffff,
-    }
-}
-
-///
-/// Tests that should work the same with any Repository/Timeline implementation.
-///
+//
+//-- Tests that should work the same with any Repository/Timeline implementation.
+//
 
 #[cfg(test)]
 pub fn create_test_timeline(
diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/walreceiver.rs
index 993768fbac..b57d498c7c 100644
--- a/pageserver/src/walreceiver.rs
+++ b/pageserver/src/walreceiver.rs
@@ -6,7 +6,7 @@
 //! We keep one WAL receiver active per timeline.
 
 use crate::config::PageServerConf;
-use crate::repository::Repository;
+use crate::repository::{Repository, Timeline};
 use crate::tenant_mgr;
 use crate::thread_mgr;
 use crate::thread_mgr::ThreadKind;
@@ -303,7 +303,7 @@ fn walreceiver_main(
     // The last LSN we processed. It is not guaranteed to survive pageserver crash.
let write_lsn = u64::from(last_lsn); // `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data - let flush_lsn = u64::from(timeline.get_disk_consistent_lsn()); + let flush_lsn = u64::from(timeline.tline.get_disk_consistent_lsn()); // The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash // Used by safekeepers to remove WAL preceding `remote_consistent_lsn`. let apply_lsn = u64::from(timeline_synced_disk_consistent_lsn); From a7544eead59b4039ce18fcfc226b9e175f6521ed Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Thu, 17 Mar 2022 16:46:58 +0300 Subject: [PATCH 46/55] Remove the last non-borrowed string from `BeMessage` (#1376) --- proxy/src/auth.rs | 4 ++-- zenith_utils/src/pq_proto.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 5e6357fe80..e8fe65c081 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -200,7 +200,7 @@ async fn handle_new_user( client .write_message_noflush(&Be::AuthenticationOk)? .write_message_noflush(&BeParameterStatusMessage::encoding())? - .write_message(&Be::NoticeResponse(greeting)) + .write_message(&Be::NoticeResponse(&greeting)) .await?; // Wait for web console response (see `mgmt`) @@ -208,7 +208,7 @@ async fn handle_new_user( }) .await?; - client.write_message_noflush(&Be::NoticeResponse("Connecting to database.".into()))?; + client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?; Ok(db_info) } diff --git a/zenith_utils/src/pq_proto.rs b/zenith_utils/src/pq_proto.rs index 355b38fc95..cb69418c07 100644 --- a/zenith_utils/src/pq_proto.rs +++ b/zenith_utils/src/pq_proto.rs @@ -425,7 +425,7 @@ pub enum BeMessage<'a> { ReadyForQuery, RowDescription(&'a [RowDescriptor<'a>]), XLogData(XLogDataBody<'a>), - NoticeResponse(String), + NoticeResponse(&'a str), KeepAlive(WalSndKeepAlive), } From 3da14d56f2901dcd2352d171e70a1e9a405609ed Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 17 Mar 2022 17:12:13 +0200 Subject: [PATCH 47/55] Fix materialized page caching. --- pageserver/src/layered_repository.rs | 60 ++++++++++++++++++++-------- pageserver/src/page_cache.rs | 18 ++++----- 2 files changed, 51 insertions(+), 27 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 99671c350a..6d962b0cc7 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -35,6 +35,7 @@ use std::time::Instant; use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME}; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; +use crate::page_cache; use crate::remote_storage::{schedule_timeline_checkpoint_upload, schedule_timeline_download}; use crate::repository::{ GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncState, TimelineWriter, @@ -876,9 +877,25 @@ impl Timeline for LayeredTimeline { fn get(&self, key: Key, lsn: Lsn) -> Result { debug_assert!(lsn <= self.get_last_record_lsn()); + // Check the page cache. We will get back the most recent page with lsn <= `lsn`. + // The cached image can be returned directly if there is no WAL between the cached image + // and requested LSN. The cached image can also be used to reduce the amount of WAL needed + // for redo. 
+ let cached_page_img = match self.lookup_cached_page(&key, lsn) { + Some((cached_lsn, cached_img)) => { + match cached_lsn.cmp(&lsn) { + Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check + Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image + Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn + } + Some((cached_lsn, cached_img)) + } + None => None, + }; + let mut reconstruct_state = ValueReconstructState { records: Vec::new(), - img: None, // FIXME: check page cache and put the img here + img: cached_page_img, }; self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?; @@ -1246,6 +1263,21 @@ impl LayeredTimeline { } } + fn lookup_cached_page(&self, key: &Key, lsn: Lsn) -> Option<(Lsn, Bytes)> { + let cache = page_cache::get(); + + // FIXME: It's pointless to check the cache for things that are not 8kB pages. + // We should look at the key to determine if it's a cacheable object + let (lsn, read_guard) = cache.lookup_materialized_page( + self.tenantid, + self.timelineid, + key, + lsn, + )?; + let img = Bytes::from(read_guard.to_vec()); + Some((lsn, img)) + } + fn get_ancestor_timeline(&self) -> Result> { let ancestor_entry = self .ancestor_timeline @@ -1962,26 +1994,22 @@ impl LayeredTimeline { None }; - //let last_rec_lsn = data.records.last().unwrap().0; + let last_rec_lsn = data.records.last().unwrap().0; let img = self.walredo_mgr .request_redo(key, request_lsn, base_img, data.records)?; - // FIXME: page caching - /* - if let RelishTag::Relation(rel_tag) = &rel { - let cache = page_cache::get(); - cache.memorize_materialized_page( - self.tenantid, - self.timelineid, - *rel_tag, - rel_blknum, - last_rec_lsn, - &img, - ); - } - */ + if img.len() == page_cache::PAGE_SZ { + let cache = page_cache::get(); + cache.memorize_materialized_page( + self.tenantid, + self.timelineid, + key, + last_rec_lsn, + &img, + ); + } Ok(img) } diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index b0c8d3a5d7..bedabf2749 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -53,7 +53,8 @@ use zenith_utils::{ }; use crate::layered_repository::writeback_ephemeral_file; -use crate::{config::PageServerConf, relish::RelTag}; +use crate::config::PageServerConf; +use crate::repository::Key; static PAGE_CACHE: OnceCell = OnceCell::new(); const TEST_PAGE_CACHE_SIZE: usize = 10; @@ -108,8 +109,7 @@ enum CacheKey { struct MaterializedPageHashKey { tenant_id: ZTenantId, timeline_id: ZTimelineId, - rel_tag: RelTag, - blknum: u32, + key: Key, } #[derive(Clone)] @@ -294,16 +294,14 @@ impl PageCache { &self, tenant_id: ZTenantId, timeline_id: ZTimelineId, - rel_tag: RelTag, - blknum: u32, + key: &Key, lsn: Lsn, ) -> Option<(Lsn, PageReadGuard)> { let mut cache_key = CacheKey::MaterializedPage { hash_key: MaterializedPageHashKey { tenant_id, timeline_id, - rel_tag, - blknum, + key: *key, }, lsn, }; @@ -326,8 +324,7 @@ impl PageCache { &self, tenant_id: ZTenantId, timeline_id: ZTimelineId, - rel_tag: RelTag, - blknum: u32, + key: Key, lsn: Lsn, img: &[u8], ) { @@ -335,8 +332,7 @@ impl PageCache { hash_key: MaterializedPageHashKey { tenant_id, timeline_id, - rel_tag, - blknum, + key, }, lsn, }; From 80fc1338339f6c7533234466a8637fc1069061c9 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 17 Mar 2022 15:47:55 +0200 Subject: [PATCH 48/55] Add sequential scan tests --- ...est_small_seqscans.py => test_seqscans.py} | 20 +++++++++++-------- 1 file changed, 12 
insertions(+), 8 deletions(-) rename test_runner/performance/{test_small_seqscans.py => test_seqscans.py} (65%) diff --git a/test_runner/performance/test_small_seqscans.py b/test_runner/performance/test_seqscans.py similarity index 65% rename from test_runner/performance/test_small_seqscans.py rename to test_runner/performance/test_seqscans.py index b98018ad97..35c723d0c1 100644 --- a/test_runner/performance/test_small_seqscans.py +++ b/test_runner/performance/test_seqscans.py @@ -1,8 +1,5 @@ # Test sequential scan speed # -# The test table is large enough (3-4 MB) that it doesn't fit in the compute node -# cache, so the seqscans go to the page server. But small enough that it fits -# into memory in the page server. from contextlib import closing from dataclasses import dataclass from fixtures.zenith_fixtures import ZenithEnv @@ -12,11 +9,16 @@ from fixtures.compare_fixtures import PgCompare import pytest -@pytest.mark.parametrize('rows', [ - pytest.param(100000), - pytest.param(1000000, marks=pytest.mark.slow), +@pytest.mark.parametrize('rows,iters,workers', [ + # The test table is large enough (3-4 MB) that it doesn't fit in the compute node + # cache, so the seqscans go to the page server. But small enough that it fits + # into memory in the page server. + pytest.param(100000, 100, 0), + # Also test with a larger table, with and without parallelism + pytest.param(10000000, 1, 0, marks=pytest.mark.slow), + pytest.param(10000000, 1, 4, marks=pytest.mark.slow) ]) -def test_small_seqscans(zenith_with_baseline: PgCompare, rows: int): +def test_seqscans(zenith_with_baseline: PgCompare, rows: int, iters: int, workers: int): env = zenith_with_baseline with closing(env.pg.connect()) as conn: @@ -36,6 +38,8 @@ def test_small_seqscans(zenith_with_baseline: PgCompare, rows: int): assert int(shared_buffers) < int(table_size) env.zenbenchmark.record("table_size", table_size, 'bytes', MetricReport.TEST_PARAM) + cur.execute(f"set max_parallel_workers_per_gather = {workers}") + with env.record_duration('run'): - for i in range(1000): + for i in range(iters): cur.execute('select count(*) from t;') From 13ec0ce7b26a318099f19a91c8dcdb4ac0972064 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Thu, 17 Mar 2022 19:40:08 +0200 Subject: [PATCH 49/55] fix formatting --- pageserver/src/layered_repository.rs | 8 ++------ pageserver/src/page_cache.rs | 2 +- test_runner/performance/test_seqscans.py | 20 +++++++++++--------- 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 6d962b0cc7..ea00c7beb4 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1268,12 +1268,8 @@ impl LayeredTimeline { // FIXME: It's pointless to check the cache for things that are not 8kB pages. 
// We should look at the key to determine if it's a cacheable object - let (lsn, read_guard) = cache.lookup_materialized_page( - self.tenantid, - self.timelineid, - key, - lsn, - )?; + let (lsn, read_guard) = + cache.lookup_materialized_page(self.tenantid, self.timelineid, key, lsn)?; let img = Bytes::from(read_guard.to_vec()); Some((lsn, img)) } diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index bedabf2749..90ab5622bd 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -52,8 +52,8 @@ use zenith_utils::{ zid::{ZTenantId, ZTimelineId}, }; -use crate::layered_repository::writeback_ephemeral_file; use crate::config::PageServerConf; +use crate::layered_repository::writeback_ephemeral_file; use crate::repository::Key; static PAGE_CACHE: OnceCell = OnceCell::new(); diff --git a/test_runner/performance/test_seqscans.py b/test_runner/performance/test_seqscans.py index 35c723d0c1..f4d28e3f93 100644 --- a/test_runner/performance/test_seqscans.py +++ b/test_runner/performance/test_seqscans.py @@ -9,15 +9,17 @@ from fixtures.compare_fixtures import PgCompare import pytest -@pytest.mark.parametrize('rows,iters,workers', [ - # The test table is large enough (3-4 MB) that it doesn't fit in the compute node - # cache, so the seqscans go to the page server. But small enough that it fits - # into memory in the page server. - pytest.param(100000, 100, 0), - # Also test with a larger table, with and without parallelism - pytest.param(10000000, 1, 0, marks=pytest.mark.slow), - pytest.param(10000000, 1, 4, marks=pytest.mark.slow) -]) +@pytest.mark.parametrize( + 'rows,iters,workers', + [ + # The test table is large enough (3-4 MB) that it doesn't fit in the compute node + # cache, so the seqscans go to the page server. But small enough that it fits + # into memory in the page server. + pytest.param(100000, 100, 0), + # Also test with a larger table, with and without parallelism + pytest.param(10000000, 1, 0, marks=pytest.mark.slow), + pytest.param(10000000, 1, 4, marks=pytest.mark.slow) + ]) def test_seqscans(zenith_with_baseline: PgCompare, rows: int, iters: int, workers: int): env = zenith_with_baseline From d383ed4e680ece6087681b0afc85082baed9b91c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 18 Mar 2022 11:40:49 +0200 Subject: [PATCH 50/55] Add missing fsyncs --- pageserver/src/layered_repository.rs | 72 ++++++++++++++++++---------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index ea00c7beb4..9b655fcf17 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1450,39 +1450,44 @@ impl LayeredTimeline { Ok(()) } + /// Flush one frozen in-memory layer to disk, as a new delta layer. fn flush_frozen_layer(&self, frozen_layer: Arc) -> Result<()> { - // Do we have a frozen in-memory layer that we need to write out? let new_delta = frozen_layer.write_to_disk()?; + let new_delta_path = new_delta.path(); + + // Sync the new layer to disk. + // + // We must also fsync the timeline dir to ensure the directory entries for + // new layer files are durable + // + // TODO: If we're running inside 'flush_frozen_layers' and there are multiple + // files to flush, it might be better to first write them all, and then fsync + // them all in parallel. 
+ par_fsync::par_fsync(&[ + new_delta_path.clone(), + self.conf.timeline_path(&self.timelineid, &self.tenantid), + ])?; // Finally, replace the frozen in-memory layer with the new on-disk layers - let mut layers = self.layers.lock().unwrap(); - let l = layers.frozen_layers.pop_front(); + { + let mut layers = self.layers.lock().unwrap(); + let l = layers.frozen_layers.pop_front(); - // Only one thread may call this function at a time (for this - // timeline). If two threads tried to flush the same frozen - // layer to disk at the same time, that would not work. - assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer)); + // Only one thread may call this function at a time (for this + // timeline). If two threads tried to flush the same frozen + // layer to disk at the same time, that would not work. + assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer)); - // Add the new delta layer to the LayerMap - let mut layer_paths = vec![new_delta.path()]; - layers.insert_historic(Arc::new(new_delta)); + // Add the new delta layer to the LayerMap + layers.insert_historic(Arc::new(new_delta)); - drop(layers); - - // Sync layers - if !layer_paths.is_empty() { - // We must fsync the timeline dir to ensure the directory entries for - // new layer files are durable - layer_paths.push(self.conf.timeline_path(&self.timelineid, &self.tenantid)); - - // Fsync all the layer files and directory using multiple threads to - // minimize latency. - par_fsync::par_fsync(&layer_paths)?; - - layer_paths.pop().unwrap(); + // release lock on 'layers' } - // Compute new 'disk_consistent_lsn' + // Update the metadata file, with new 'disk_consistent_lsn' + // + // TODO: This perhaps should be done in 'flush_frozen_layers', after flushing + // *all* the layers, to avoid fsyncing the file multiple times. let disk_consistent_lsn; disk_consistent_lsn = Lsn(frozen_layer.get_lsn_range().end.0 - 1); @@ -1538,7 +1543,7 @@ impl LayeredTimeline { schedule_timeline_checkpoint_upload( self.tenantid, self.timelineid, - layer_paths, + vec![new_delta_path], metadata, ); } @@ -1678,10 +1683,25 @@ impl LayeredTimeline { } let image_layer = image_layer_writer.finish()?; + // Sync the new layer to disk before adding it to the layer map, to make sure + // we don't garbage collect something based on the new layer, before it has + // reached the disk. + // + // We must also fsync the timeline dir to ensure the directory entries for + // new layer files are durable + // + // Compaction creates multiple image layers. It would be better to create them all + // and fsync them all in parallel. + par_fsync::par_fsync(&[ + image_layer.path(), + self.conf.timeline_path(&self.timelineid, &self.tenantid), + ])?; + + // FIXME: Do we need to do something to upload it to remote storage here? + let mut layers = self.layers.lock().unwrap(); layers.insert_historic(Arc::new(image_layer)); drop(layers); - // FIXME: need to fsync? 
Ok(()) } From 12141523f667d293b8aaf694535fd71d067826c3 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 18 Mar 2022 11:45:45 +0200 Subject: [PATCH 51/55] Improve comments --- pageserver/src/layered_repository.rs | 18 +++++++++++++----- pageserver/src/relish.rs | 27 --------------------------- pageserver/src/thread_mgr.rs | 2 +- 3 files changed, 14 insertions(+), 33 deletions(-) diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 9b655fcf17..53dfd371e1 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -1815,8 +1815,13 @@ impl LayeredTimeline { Ok(()) } + /// Update information about which layer files need to be retained on + /// garbage collection. This is separate from actually performing the GC, + /// and is updated more frequently, so that compaction can remove obsolete + /// page versions more aggressively. /// - /// Garbage collect layer files on a timeline that are no longer needed. + /// TODO: that's wishful thinking, compaction doesn't actually do that + /// currently. /// /// The caller specifies how much history is needed with the two arguments: /// @@ -1833,16 +1838,19 @@ impl LayeredTimeline { /// the latest LSN subtracted by a constant, and doesn't do anything smart /// to figure out what read-only nodes might actually need.) /// - /// Currently, we don't make any attempt at removing unneeded page versions - /// within a layer file. We can only remove the whole file if it's fully - /// obsolete. - /// fn update_gc_info(&self, retain_lsns: Vec, cutoff: Lsn) { let mut gc_info = self.gc_info.write().unwrap(); gc_info.retain_lsns = retain_lsns; gc_info.cutoff = cutoff; } + /// + /// Garbage collect layer files on a timeline that are no longer needed. + /// + /// Currently, we don't make any attempt at removing unneeded page versions + /// within a layer file. We can only remove the whole file if it's fully + /// obsolete. + /// fn gc(&self) -> Result { let now = Instant::now(); let mut result: GcResult = Default::default(); diff --git a/pageserver/src/relish.rs b/pageserver/src/relish.rs index 521e07e50f..46ff468f2f 100644 --- a/pageserver/src/relish.rs +++ b/pageserver/src/relish.rs @@ -1,30 +1,3 @@ -//! -//! FIXME: relishes are obsolete -//! -//! Zenith stores PostgreSQL relations, and some other files, in the -//! repository. The relations (i.e. tables and indexes) take up most -//! of the space in a typical installation, while the other files are -//! small. We call each relation and other file that is stored in the -//! repository a "relish". It comes from "rel"-ish, as in "kind of a -//! rel", because it covers relations as well as other things that are -//! not relations, but are treated similarly for the purposes of the -//! storage layer. -//! -//! This source file contains the definition of the RelishTag struct, -//! which uniquely identifies a relish. -//! -//! Relishes come in two flavors: blocky and non-blocky. Relations and -//! SLRUs are blocky, that is, they are divided into 8k blocks, and -//! the repository tracks their size. Other relishes are non-blocky: -//! the content of the whole relish is stored as one blob. Block -//! number must be passed as 0 for all operations on a non-blocky -//! relish. The one "block" that you store in a non-blocky relish can -//! have arbitrary size, but they are expected to be small, or you -//! will have performance issues. -//! -//! All relishes are versioned by LSN in the repository. -//! 
- use serde::{Deserialize, Serialize}; use std::cmp::Ordering; use std::fmt; diff --git a/pageserver/src/thread_mgr.rs b/pageserver/src/thread_mgr.rs index 46aa391241..7a4e82ad86 100644 --- a/pageserver/src/thread_mgr.rs +++ b/pageserver/src/thread_mgr.rs @@ -98,7 +98,7 @@ pub enum ThreadKind { // Thread that handles GC of a tenant GarbageCollector, - // FIXME + // Thread that flushes frozen in-memory layers to disk LayerFlushThread, // Thread for synchronizing pageserver relish data with the remote storage. From 35584f7242db6e54a8d0b3291eeb21da2cfdb21a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 18 Mar 2022 11:51:52 +0200 Subject: [PATCH 52/55] Bump magic IDs, to distinguish old file format from new --- pageserver/src/layered_repository/delta_layer.rs | 2 +- pageserver/src/layered_repository/image_layer.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index 82dd516990..fd4a21cc14 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -62,7 +62,7 @@ use zenith_utils::bin_ser::BeSer; use zenith_utils::lsn::Lsn; // Magic constant to identify a Zenith delta file -pub const DELTA_FILE_MAGIC: u32 = 0x5A616E01; +pub const DELTA_FILE_MAGIC: u32 = 0x5A616E11; /// Mapping from (key, lsn) -> page/WAL record /// byte ranges in VALUES_CHAPTER diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index 2fa6bb8eee..ecea5b4fcf 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -44,8 +44,7 @@ use zenith_utils::bin_ser::BeSer; use zenith_utils::lsn::Lsn; // Magic constant to identify a Zenith image layer file -// FIXME: bump all magics -pub const IMAGE_FILE_MAGIC: u32 = 0x5A616E01 + 1; +pub const IMAGE_FILE_MAGIC: u32 = 0x5A616E11 + 1; /// Mapping from (key, lsn) -> page/WAL record /// byte ranges in VALUES_CHAPTER @@ -101,7 +100,8 @@ pub struct ImageLayerInner { /// If false, the 'index' has not been loaded into memory yet. loaded: bool, - /// If None, the 'image_type' has not been loaded into memory yet. FIXME + /// The underlying (virtual) file handle. None if the layer hasn't been loaded + /// yet. 
book: Option>, /// offset of each value From 8c4d270cde29c4af441b8583e8dca882270ec9cf Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 18 Mar 2022 11:52:17 +0200 Subject: [PATCH 53/55] Fix InMemoryLayer::dump --- .../src/layered_repository/inmemory_layer.rs | 51 ++++++++++++------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index dc1177e76c..145dbeecb1 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -12,10 +12,14 @@ use crate::layered_repository::storage_layer::{ }; use crate::layered_repository::utils; use crate::repository::{Key, Value}; +use crate::walrecord; use crate::{ZTenantId, ZTimelineId}; use anyhow::Result; use log::*; use std::collections::HashMap; +// avoid binding to Write (conflicts with std::io::Write) +// while being able to use std::fmt::Write's methods +use std::fmt::Write as _; use std::ops::Range; use std::path::PathBuf; use std::sync::RwLock; @@ -194,27 +198,36 @@ impl Layer for InMemoryLayer { println!( "----- in-memory layer for tli {} LSNs {}-{} ----", - self.timelineid, - self.start_lsn, - end_str, - //inner.dropped, + self.timelineid, self.start_lsn, end_str, ); - // FIXME - /* - for (blknum, versions) in page_versions { - for (lsn, off) in versions.as_slice() { - let pv = inner.read_pv(*off); - let pv_description = match pv { - Ok(PageVersion::Page(_img)) => "page", - Ok(PageVersion::Wal(_rec)) => "wal", - Err(_err) => "INVALID", - }; - - println!("blk {} at {}: {}\n", blknum, lsn, pv_description); - } - } - */ + let mut buf = Vec::new(); + for (key, vec_map) in inner.index.iter() { + for (lsn, pos) in vec_map.as_slice() { + let mut desc = String::new(); + let len = utils::read_blob_buf(&inner.file, *pos, &mut buf)?; + let val = Value::des(&buf[0..len]); + match val { + Ok(Value::Image(img)) => { + write!(&mut desc, " img {} bytes", img.len())?; + } + Ok(Value::WalRecord(rec)) => { + let wal_desc = walrecord::describe_wal_record(&rec); + write!( + &mut desc, + " rec {} bytes will_init: {} {}", + buf.len(), + rec.will_init(), + wal_desc + )?; + } + Err(err) => { + write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?; + } + } + println!(" key {} at {}: {}", key, lsn, desc); + } + } Ok(()) } From d75692122093b4784e8b0a951b41e7aef2f94657 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 18 Mar 2022 14:18:25 +0200 Subject: [PATCH 54/55] RFC fixes, per comments in the PR --- docs/rfcs/014-storage-lsm.md | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/docs/rfcs/014-storage-lsm.md b/docs/rfcs/014-storage-lsm.md index f91ccda6c0..c6f8eb5914 100644 --- a/docs/rfcs/014-storage-lsm.md +++ b/docs/rfcs/014-storage-lsm.md @@ -7,11 +7,13 @@ existing files are never modified. That fits well with storing the files on S3. Currently, we create a lot of small files. That is mostly a problem -with S3, because each GET/PUT operation is expensive. Currently, the -files "archived" together into larger checkpoint files before they're -uploaded to S3, but garbage collecting data from the archive files -would be difficult and we have not implemented it. This proposal -addresses that problem. +with S3, because each GET/PUT operation is expensive, and LIST +operation only returns 1000 objects at a time, and isn't free +either. 
Currently, the files are "archived" together into larger +checkpoint files before they're uploaded to S3 to alleviate that +problem, but garbage collecting data from the archive files would be +difficult and we have not implemented it. This proposal addresses that +problem. # Overview @@ -98,7 +100,8 @@ the overall key space, and a larger range of LSNs. This speeds up searches. When you're looking for a given page, you need to check all the files in L0, to see if they contain a page version for the requested page. But in L1, you only need to check the files whose key range covers -the requested page. +the requested page. This is particularly important at cold start, when +checking a file means downloading it from S3. Partitioning by key range also helps with garbage collection. If only a part of the database is updated, we will accumulate more files for @@ -133,13 +136,6 @@ we partition the data into the files? for how PebblesDB does this, and for why that's important) - Greedy algorithm -# Next steps - -- Allow delta layers to cover a range keys instead of a single segment. - -- Implement a two-level LSM tree (or three-leveled, if you count the -"memtable"), by adding L0. - # Additional Reading [1] Paper on PebblesDB and how it does partitioning. From a39de2997ff159451b6da9f94dbb99ed0bf71a90 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 16 Mar 2022 18:01:24 +0300 Subject: [PATCH 55/55] Optimize reading versions for delta_layer Store blob size in layer metadata for all layers types Heikki: This is a squashed version of PR #1369 --- pageserver/src/layered_repository.rs | 1 - .../src/layered_repository/delta_layer.rs | 118 ++++++++++-------- .../src/layered_repository/image_layer.rs | 47 ++++--- .../src/layered_repository/inmemory_layer.rs | 37 +++--- .../src/layered_repository/storage_layer.rs | 34 +++++ pageserver/src/layered_repository/utils.rs | 53 -------- pageserver/src/repository.rs | 7 ++ 7 files changed, 157 insertions(+), 140 deletions(-) delete mode 100644 pageserver/src/layered_repository/utils.rs diff --git a/pageserver/src/layered_repository.rs b/pageserver/src/layered_repository.rs index 53dfd371e1..ca4dc7d6fe 100644 --- a/pageserver/src/layered_repository.rs +++ b/pageserver/src/layered_repository.rs @@ -62,7 +62,6 @@ mod layer_map; pub mod metadata; mod par_fsync; mod storage_layer; -mod utils; use delta_layer::{DeltaLayer, DeltaLayerWriter}; use ephemeral_file::is_ephemeral_file; diff --git a/pageserver/src/layered_repository/delta_layer.rs b/pageserver/src/layered_repository/delta_layer.rs index fd4a21cc14..56fd86b4c0 100644 --- a/pageserver/src/layered_repository/delta_layer.rs +++ b/pageserver/src/layered_repository/delta_layer.rs @@ -33,9 +33,8 @@ use crate::config::PageServerConf; use crate::layered_repository::filename::{DeltaFileName, PathOrConf}; use crate::layered_repository::storage_layer::{ - Layer, ValueReconstructResult, ValueReconstructState, + BlobRef, Layer, ValueReconstructResult, ValueReconstructState, }; -use crate::layered_repository::utils; use crate::repository::{Key, Value}; use crate::virtual_file::VirtualFile; use crate::walrecord; @@ -122,7 +121,7 @@ pub struct DeltaLayerInner { /// Indexed by block number and LSN. The value is an offset into the /// chapter where the page version is stored. /// - index: HashMap>, + index: HashMap>, book: Option>, } @@ -170,22 +169,36 @@ impl Layer for DeltaLayer { // Scan the page versions backwards, starting from `lsn`. 
if let Some(vec_map) = inner.index.get(&key) { let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, pos) in slice.iter().rev() { - let val = Value::des(&utils::read_blob_from_chapter(&values_reader, *pos)?)?; - match val { - Value::Image(img) => { - reconstruct_state.img = Some((*entry_lsn, img)); - need_image = false; - break; - } - Value::WalRecord(rec) => { - let will_init = rec.will_init(); - reconstruct_state.records.push((*entry_lsn, rec)); - if will_init { - // This WAL record initializes the page, so no need to go further back + let mut size = 0usize; + let mut first_pos = 0u64; + for (_entry_lsn, blob_ref) in slice.iter().rev() { + size += blob_ref.size(); + first_pos = blob_ref.pos(); + if blob_ref.will_init() { + break; + } + } + if size != 0 { + let mut buf = vec![0u8; size]; + values_reader.read_exact_at(&mut buf, first_pos)?; + for (entry_lsn, blob_ref) in slice.iter().rev() { + let offs = (blob_ref.pos() - first_pos) as usize; + let val = Value::des(&buf[offs..offs + blob_ref.size()])?; + match val { + Value::Image(img) => { + reconstruct_state.img = Some((*entry_lsn, img)); need_image = false; break; } + Value::WalRecord(rec) => { + let will_init = rec.will_init(); + reconstruct_state.records.push((*entry_lsn, rec)); + if will_init { + // This WAL record initializes the page, so no need to go further back + need_image = false; + break; + } + } } } } @@ -205,9 +218,6 @@ impl Layer for DeltaLayer { fn iter(&self) -> Box> + '_> { let inner = self.load().unwrap(); - let mut pairs: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); - pairs.sort_by_key(|x| x.0); - match DeltaValueIter::new(inner) { Ok(iter) => Box::new(iter), Err(err) => Box::new(std::iter::once(Err(err))), @@ -274,14 +284,14 @@ impl Layer for DeltaLayer { let book = Book::new(file)?; let chapter = book.chapter_reader(VALUES_CHAPTER)?; - let mut values: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); + let mut values: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); values.sort_by_key(|k| k.0); for (key, versions) in values { - for (lsn, off) in versions.as_slice() { + for (lsn, blob_ref) in versions.as_slice() { let mut desc = String::new(); - - let buf = utils::read_blob_from_chapter(&chapter, *off)?; + let mut buf = vec![0u8; blob_ref.size()]; + chapter.read_exact_at(&mut buf, blob_ref.pos())?; let val = Value::des(&buf); match val { @@ -468,7 +478,7 @@ pub struct DeltaLayerWriter { key_start: Key, lsn_range: Range, - index: HashMap>, + index: HashMap>, values_writer: ChapterWriter>, end_offset: u64, @@ -529,10 +539,13 @@ impl DeltaLayerWriter { // Remember the offset and size metadata. The metadata is written // to a separate chapter, in `finish`. let off = self.end_offset; - let len = utils::write_blob(&mut self.values_writer, &Value::ser(&val)?)?; - self.end_offset += len; + let buf = Value::ser(&val)?; + let len = buf.len(); + self.values_writer.write_all(&buf)?; + self.end_offset += len as u64; let vec_map = self.index.entry(key).or_default(); - let old = vec_map.append_or_update_last(lsn, off).unwrap().0; + let blob_ref = BlobRef::new(off, len, val.will_init()); + let old = vec_map.append_or_update_last(lsn, blob_ref).unwrap().0; if old.is_some() { // We already had an entry for this LSN. That's odd.. bail!( @@ -637,14 +650,13 @@ impl DeltaLayerWriter { /// That takes up quite a lot of memory. Should do this in a more streaming /// fashion. 
/// -struct DeltaValueIter<'a> { - all_offsets: Vec<(Key, Lsn, u64)>, +struct DeltaValueIter { + all_offsets: Vec<(Key, Lsn, BlobRef)>, next_idx: usize, - - inner: RwLockReadGuard<'a, DeltaLayerInner>, + data: Vec, } -impl<'a> Iterator for DeltaValueIter<'a> { +impl Iterator for DeltaValueIter { type Item = Result<(Key, Lsn, Value)>; fn next(&mut self) -> Option { @@ -652,38 +664,40 @@ impl<'a> Iterator for DeltaValueIter<'a> { } } -impl<'a> DeltaValueIter<'a> { - fn new(inner: RwLockReadGuard<'a, DeltaLayerInner>) -> Result { - let mut index: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); +impl DeltaValueIter { + fn new(inner: RwLockReadGuard) -> Result { + let mut index: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); index.sort_by_key(|x| x.0); - let mut all_offsets: Vec<(Key, Lsn, u64)> = Vec::new(); + let mut all_offsets: Vec<(Key, Lsn, BlobRef)> = Vec::new(); for (key, vec_map) in index.iter() { - for (lsn, off) in vec_map.as_slice().iter() { - all_offsets.push((**key, *lsn, *off)); + for (lsn, blob_ref) in vec_map.as_slice().iter() { + all_offsets.push((**key, *lsn, *blob_ref)); } } - Ok(DeltaValueIter { + let values_reader = inner + .book + .as_ref() + .expect("should be loaded in load call above") + .chapter_reader(VALUES_CHAPTER)?; + let file_size = values_reader.len() as usize; + let mut layer = DeltaValueIter { all_offsets, - inner, next_idx: 0, - }) + data: vec![0u8; file_size], + }; + values_reader.read_exact_at(&mut layer.data, 0)?; + + Ok(layer) } fn next_res(&mut self) -> Result> { if self.next_idx < self.all_offsets.len() { - let (key, lsn, off) = self.all_offsets[self.next_idx]; - - let values_reader = self - .inner - .book - .as_ref() - .expect("should be loaded in load call above") - .chapter_reader(VALUES_CHAPTER)?; - - let val = Value::des(&utils::read_blob_from_chapter(&values_reader, off)?)?; - + let (key, lsn, blob_ref) = self.all_offsets[self.next_idx]; + let offs = blob_ref.pos() as usize; + let size = blob_ref.size(); + let val = Value::des(&self.data[offs..offs + size])?; self.next_idx += 1; Ok(Some((key, lsn, val))) } else { diff --git a/pageserver/src/layered_repository/image_layer.rs b/pageserver/src/layered_repository/image_layer.rs index ecea5b4fcf..948e5b1433 100644 --- a/pageserver/src/layered_repository/image_layer.rs +++ b/pageserver/src/layered_repository/image_layer.rs @@ -21,9 +21,8 @@ use crate::config::PageServerConf; use crate::layered_repository::filename::{ImageFileName, PathOrConf}; use crate::layered_repository::storage_layer::{ - Layer, ValueReconstructResult, ValueReconstructState, + BlobRef, Layer, ValueReconstructResult, ValueReconstructState, }; -use crate::layered_repository::utils; use crate::repository::{Key, Value}; use crate::virtual_file::VirtualFile; use crate::{ZTenantId, ZTimelineId}; @@ -105,7 +104,7 @@ pub struct ImageLayerInner { book: Option>, /// offset of each value - index: HashMap, + index: HashMap, } impl Layer for ImageLayer { @@ -142,20 +141,24 @@ impl Layer for ImageLayer { let inner = self.load()?; - if let Some(offset) = inner.index.get(&key) { + if let Some(blob_ref) = inner.index.get(&key) { let chapter = inner .book .as_ref() .unwrap() .chapter_reader(VALUES_CHAPTER)?; - let blob = utils::read_blob_from_chapter(&chapter, *offset).with_context(|| { - format!( - "failed to read value from data file {} at offset {}", - self.filename().display(), - offset - ) - })?; + let mut blob = vec![0; blob_ref.size()]; + chapter + .read_exact_at(&mut blob, blob_ref.pos()) + .with_context(|| { + format!( + "failed 
to read {} bytes from data file {} at offset {}", + blob_ref.size(), + self.filename().display(), + blob_ref.pos() + ) + })?; let value = Bytes::from(blob); reconstruct_state.img = Some((self.lsn, value)); @@ -215,11 +218,16 @@ impl Layer for ImageLayer { let inner = self.load()?; - let mut index_vec: Vec<(&Key, &u64)> = inner.index.iter().collect(); - index_vec.sort_by_key(|x| x.1); + let mut index_vec: Vec<(&Key, &BlobRef)> = inner.index.iter().collect(); + index_vec.sort_by_key(|x| x.1.pos()); - for (key, offset) in index_vec { - println!("key: {} offset {}", key, offset); + for (key, blob_ref) in index_vec { + println!( + "key: {} size {} offset {}", + key, + blob_ref.size(), + blob_ref.pos() + ); } Ok(()) @@ -385,7 +393,7 @@ pub struct ImageLayerWriter { values_writer: Option>>, end_offset: u64, - index: HashMap, + index: HashMap, finished: bool, } @@ -446,10 +454,11 @@ impl ImageLayerWriter { let off = self.end_offset; if let Some(writer) = &mut self.values_writer { - let len = utils::write_blob(writer, img)?; - self.end_offset += len; + let len = img.len(); + writer.write_all(img)?; + self.end_offset += len as u64; - let old = self.index.insert(key, off); + let old = self.index.insert(key, BlobRef::new(off, len, true)); assert!(old.is_none()); } else { panic!() diff --git a/pageserver/src/layered_repository/inmemory_layer.rs b/pageserver/src/layered_repository/inmemory_layer.rs index 145dbeecb1..1e2f4f52df 100644 --- a/pageserver/src/layered_repository/inmemory_layer.rs +++ b/pageserver/src/layered_repository/inmemory_layer.rs @@ -8,9 +8,8 @@ use crate::config::PageServerConf; use crate::layered_repository::delta_layer::{DeltaLayer, DeltaLayerWriter}; use crate::layered_repository::ephemeral_file::EphemeralFile; use crate::layered_repository::storage_layer::{ - Layer, ValueReconstructResult, ValueReconstructState, + BlobRef, Layer, ValueReconstructResult, ValueReconstructState, }; -use crate::layered_repository::utils; use crate::repository::{Key, Value}; use crate::walrecord; use crate::{ZTenantId, ZTimelineId}; @@ -20,7 +19,9 @@ use std::collections::HashMap; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods use std::fmt::Write as _; +use std::io::Write; use std::ops::Range; +use std::os::unix::fs::FileExt; use std::path::PathBuf; use std::sync::RwLock; use zenith_utils::bin_ser::BeSer; @@ -53,7 +54,7 @@ pub struct InMemoryLayerInner { /// by block number and LSN. The value is an offset into the /// ephemeral file where the page version is stored. /// - index: HashMap>, + index: HashMap>, /// The values are stored in a serialized format in this file. /// Each serialized Value is preceded by a 'u32' length field. @@ -122,7 +123,7 @@ impl Layer for InMemoryLayer { // Scan the page versions backwards, starting from `lsn`. 
if let Some(vec_map) = inner.index.get(&key) { let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, pos) in slice.iter().rev() { + for (entry_lsn, blob_ref) in slice.iter().rev() { match &reconstruct_state.img { Some((cached_lsn, _)) if entry_lsn <= cached_lsn => { return Ok(ValueReconstructResult::Complete) @@ -130,7 +131,9 @@ impl Layer for InMemoryLayer { _ => {} } - let value = Value::des(&utils::read_blob(&inner.file, *pos)?)?; + let mut buf = vec![0u8; blob_ref.size()]; + inner.file.read_exact_at(&mut buf, blob_ref.pos())?; + let value = Value::des(&buf)?; match value { Value::Image(img) => { reconstruct_state.img = Some((*entry_lsn, img)); @@ -203,10 +206,11 @@ impl Layer for InMemoryLayer { let mut buf = Vec::new(); for (key, vec_map) in inner.index.iter() { - for (lsn, pos) in vec_map.as_slice() { + for (lsn, blob_ref) in vec_map.as_slice() { let mut desc = String::new(); - let len = utils::read_blob_buf(&inner.file, *pos, &mut buf)?; - let val = Value::des(&buf[0..len]); + buf.resize(blob_ref.size(), 0); + inner.file.read_exact_at(&mut buf, blob_ref.pos())?; + let val = Value::des(&buf); match val { Ok(Value::Image(img)) => { write!(&mut desc, " img {} bytes", img.len())?; @@ -276,11 +280,14 @@ impl InMemoryLayer { inner.assert_writeable(); let off = inner.end_offset; - let len = utils::write_blob(&mut inner.file, &Value::ser(&val)?)?; - inner.end_offset += len; + let buf = Value::ser(&val)?; + let len = buf.len(); + inner.file.write_all(&buf)?; + inner.end_offset += len as u64; let vec_map = inner.index.entry(key).or_default(); - let old = vec_map.append_or_update_last(lsn, off).unwrap().0; + let blob_ref = BlobRef::new(off, len, val.will_init()); + let old = vec_map.append_or_update_last(lsn, blob_ref).unwrap().0; if old.is_some() { // We already had an entry for this LSN. That's odd.. warn!("Key {} at {} already exists", key, lsn); @@ -348,13 +355,13 @@ impl InMemoryLayer { self.start_lsn..inner.end_lsn.unwrap(), )?; - let mut buf = Vec::new(); let mut do_steps = || -> Result<()> { for (key, vec_map) in inner.index.iter() { // Write all page versions - for (lsn, pos) in vec_map.as_slice() { - let len = utils::read_blob_buf(&inner.file, *pos, &mut buf)?; - let val = Value::des(&buf[0..len])?; + for (lsn, blob_ref) in vec_map.as_slice() { + let mut buf = vec![0u8; blob_ref.size()]; + inner.file.read_exact_at(&mut buf, blob_ref.pos())?; + let val = Value::des(&buf)?; delta_layer_writer.put_value(*key, *lsn, val)?; } } diff --git a/pageserver/src/layered_repository/storage_layer.rs b/pageserver/src/layered_repository/storage_layer.rs index c5314350c8..5847f9cb75 100644 --- a/pageserver/src/layered_repository/storage_layer.rs +++ b/pageserver/src/layered_repository/storage_layer.rs @@ -7,6 +7,7 @@ use crate::walrecord::ZenithWalRecord; use crate::{ZTenantId, ZTimelineId}; use anyhow::Result; use bytes::Bytes; +use serde::{Deserialize, Serialize}; use std::ops::Range; use std::path::PathBuf; @@ -144,3 +145,36 @@ pub trait Layer: Send + Sync { /// Dump summary of the contents of the layer to stdout fn dump(&self) -> Result<()>; } + +// Flag indicating that this version initialize the page +const WILL_INIT: u64 = 1; + +/// +/// Struct representing reference to BLOB in layers. Reference contains BLOB offset and size. +/// For WAL records (delta layer) it also contains `will_init` flag which helps to determine range of records +/// which needs to be applied without reading/deserializing records themselves. 
+/// +#[derive(Debug, Serialize, Deserialize, Copy, Clone)] +pub struct BlobRef(u64); + +impl BlobRef { + pub fn will_init(&self) -> bool { + (self.0 & WILL_INIT) != 0 + } + + pub fn pos(&self) -> u64 { + self.0 >> 32 + } + + pub fn size(&self) -> usize { + ((self.0 & 0xFFFFFFFF) >> 1) as usize + } + + pub fn new(pos: u64, size: usize, will_init: bool) -> BlobRef { + let mut blob_ref = (pos << 32) | ((size as u64) << 1); + if will_init { + blob_ref |= WILL_INIT; + } + BlobRef(blob_ref) + } +} diff --git a/pageserver/src/layered_repository/utils.rs b/pageserver/src/layered_repository/utils.rs deleted file mode 100644 index b3aa8c7ef4..0000000000 --- a/pageserver/src/layered_repository/utils.rs +++ /dev/null @@ -1,53 +0,0 @@ -// Utilities for reading and writing Values -use std::io::{Error, Write}; -use std::os::unix::fs::FileExt; - -use bookfile::BoundedReader; - -pub fn read_blob_buf(file: &F, off: u64, buf: &mut Vec) -> Result { - // read length - let mut len_buf = [0u8; 4]; - file.read_exact_at(&mut len_buf, off)?; - - let len = u32::from_ne_bytes(len_buf) as usize; - - buf.resize(len, 0); - file.read_exact_at(&mut buf.as_mut_slice(), off + 4)?; - - Ok(len) -} - -pub fn read_blob(file: &F, off: u64) -> Result, Error> { - let mut buf: Vec = Vec::new(); - let _ = read_blob_buf(file, off, &mut buf); - Ok(buf) -} - -pub fn read_blob_from_chapter( - file: &BoundedReader<&F>, - off: u64, -) -> Result, Error> { - // read length - let mut len_buf = [0u8; 4]; - file.read_exact_at(&mut len_buf, off)?; - - let len = u32::from_ne_bytes(len_buf); - - let mut buf: Vec = Vec::new(); - buf.resize(len as usize, 0); - file.read_exact_at(&mut buf.as_mut_slice(), off + 4)?; - - Ok(buf) -} - -pub fn write_blob(writer: &mut W, buf: &[u8]) -> Result { - let val_len = buf.len() as u32; - - // write the 'length' field and kind byte. - let lenbuf = u32::to_ne_bytes(val_len); - - writer.write_all(&lenbuf)?; - writer.write_all(buf)?; - - Ok(4 + val_len as u64) -} diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 56bd5208ca..4eb352f68d 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -154,6 +154,13 @@ impl Value { pub fn is_image(&self) -> bool { matches!(self, Value::Image(_)) } + + pub fn will_init(&self) -> bool { + match self { + Value::Image(_) => true, + Value::WalRecord(rec) => rec.will_init(), + } + } } ///
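As a closing illustration of the BlobRef packing introduced by this last patch — bit 0 carries `will_init`, bits 1..32 the value size, bits 32..64 the offset within the layer file — here is a minimal, self-contained round-trip sketch. The struct is restated without the serde derives so it compiles on its own; apart from the packing layout, which follows the storage_layer.rs code above, everything here is illustrative scaffolding:

// Self-contained restatement of the BlobRef packing from storage_layer.rs.
const WILL_INIT: u64 = 1;

#[derive(Debug, Copy, Clone)]
struct BlobRef(u64);

impl BlobRef {
    fn new(pos: u64, size: usize, will_init: bool) -> BlobRef {
        // High 32 bits: offset in the layer file. Bits 1..32: blob size.
        // Bit 0: set when this value fully initializes the page.
        let mut blob_ref = (pos << 32) | ((size as u64) << 1);
        if will_init {
            blob_ref |= WILL_INIT;
        }
        BlobRef(blob_ref)
    }

    fn pos(&self) -> u64 {
        self.0 >> 32
    }

    fn size(&self) -> usize {
        ((self.0 & 0xFFFF_FFFF) >> 1) as usize
    }

    fn will_init(&self) -> bool {
        (self.0 & WILL_INIT) != 0
    }
}

fn main() {
    // A 100-byte WAL record at offset 4096 that fully initializes its page.
    let rec = BlobRef::new(4096, 100, true);
    assert_eq!(rec.pos(), 4096);
    assert_eq!(rec.size(), 100);
    assert!(rec.will_init());

    // A non-initializing record placed right after the first one.
    let next = BlobRef::new(rec.pos() + rec.size() as u64, 8192, false);
    assert_eq!(next.pos(), 4196);
    assert!(!next.will_init());
}

This layout caps a single value at 2 GiB and a layer file's value area at 4 GiB, which is ample for 8 kB pages and layer files sized around TARGET_FILE_SIZE_BYTES; in exchange, each index entry stays a single u64, and the backward `will_init` scan in get_value_reconstruct_data can compute one contiguous read without deserializing any record.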