wip: avoid memory allocation during WAL ingestion

This commit is contained in:
Thang Pham
2022-06-28 14:55:30 -04:00
parent 4a05413a4c
commit eb424f851b
5 changed files with 126 additions and 42 deletions

View File

@@ -62,6 +62,7 @@ pub fn import_timeline_from_postgres_datadir<R: Repository>(
// We're done importing all the data files.
modification.commit()?;
drop(modification);
// We expect the Postgres server to be shut down cleanly.
let pg_control = pg_control.context("pg_control file not found")?;
@@ -269,7 +270,8 @@ fn import_wal<R: Repository>(
let mut nrecords = 0;
while last_lsn <= endpoint {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest.ingest_record(tline, recdata, lsn)?;
let mut modification = tline.begin_modification(lsn);
walingest.ingest_record(recdata, lsn, &mut modification)?;
last_lsn = lsn;
nrecords += 1;
@@ -384,7 +386,8 @@ pub fn import_wal_from_tar<R: Repository, Reader: Read>(
while last_lsn <= end_lsn {
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
walingest.ingest_record(tline, recdata, lsn)?;
let mut modification = tline.begin_modification(lsn);
walingest.ingest_record(recdata, lsn, &mut modification)?;
last_lsn = lsn;
debug!("imported record at {} (end {})", lsn, end_lsn);

View File

@@ -99,6 +99,7 @@ impl<R: Repository> DatadirTimeline<R> {
pending_updates: HashMap::new(),
pending_deletions: Vec::new(),
pending_nblocks: 0,
writer: self.tline.writer(),
}
}
@@ -532,7 +533,7 @@ pub struct DatadirModification<'a, R: Repository> {
/// in the state in 'tline' yet.
pub tline: &'a DatadirTimeline<R>,
lsn: Lsn,
pub lsn: Lsn,
// The modifications are not applied directly to the underlying key-value store.
// The put-functions add the modifications here, and they are flushed to the
@@ -540,9 +541,17 @@ pub struct DatadirModification<'a, R: Repository> {
pending_updates: HashMap<Key, Value>,
pending_deletions: Vec<Range<Key>>,
pending_nblocks: isize,
writer: Box<dyn TimelineWriter<'a> + 'a>,
}
impl<'a, R: Repository> DatadirModification<'a, R> {
pub fn clear(&mut self) {
self.pending_updates.clear();
self.pending_deletions.clear();
self.pending_nblocks = 0;
}
/// Initialize a completely new repository.
///
/// This inserts the directory metadata entries that are assumed to
@@ -905,19 +914,17 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
/// Finish this atomic update, writing all the updated keys to the
/// underlying timeline.
///
pub fn commit(self) -> Result<()> {
let writer = self.tline.tline.writer();
pub fn commit(&self) -> Result<()> {
let pending_nblocks = self.pending_nblocks;
for (key, value) in self.pending_updates {
writer.put(key, self.lsn, value)?;
for (key, value) in &self.pending_updates {
self.writer.put(key.clone(), self.lsn, value.clone())?;
}
for key_range in self.pending_deletions {
writer.delete(key_range.clone(), self.lsn)?;
for key_range in &self.pending_deletions {
self.writer.delete(key_range.clone(), self.lsn)?;
}
writer.finish_write(self.lsn);
self.writer.finish_write(self.lsn);
if pending_nblocks != 0 {
self.tline.current_logical_size.fetch_add(

View File

@@ -78,12 +78,10 @@ impl<'a, R: Repository> WalIngest<'a, R> {
///
pub fn ingest_record(
&mut self,
timeline: &DatadirTimeline<R>,
recdata: Bytes,
lsn: Lsn,
modification: &mut DatadirModification<R>,
) -> Result<()> {
let mut modification = timeline.begin_modification(lsn);
let mut decoded = decode_wal_record(recdata).context("failed decoding wal record")?;
let mut buf = decoded.record.clone();
buf.advance(decoded.main_data_offset);
@@ -98,7 +96,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
if decoded.xl_rmid == pg_constants::RM_HEAP_ID
|| decoded.xl_rmid == pg_constants::RM_HEAP2_ID
{
self.ingest_heapam_record(&mut buf, &mut modification, &mut decoded)?;
self.ingest_heapam_record(&mut buf, modification, &mut decoded)?;
}
// Handle other special record types
if decoded.xl_rmid == pg_constants::RM_SMGR_ID
@@ -106,19 +104,19 @@ impl<'a, R: Repository> WalIngest<'a, R> {
== pg_constants::XLOG_SMGR_CREATE
{
let create = XlSmgrCreate::decode(&mut buf);
self.ingest_xlog_smgr_create(&mut modification, &create)?;
self.ingest_xlog_smgr_create(modification, &create)?;
} else if decoded.xl_rmid == pg_constants::RM_SMGR_ID
&& (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== pg_constants::XLOG_SMGR_TRUNCATE
{
let truncate = XlSmgrTruncate::decode(&mut buf);
self.ingest_xlog_smgr_truncate(&mut modification, &truncate)?;
self.ingest_xlog_smgr_truncate(modification, &truncate)?;
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID {
if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== pg_constants::XLOG_DBASE_CREATE
{
let createdb = XlCreateDatabase::decode(&mut buf);
self.ingest_xlog_dbase_create(&mut modification, &createdb)?;
self.ingest_xlog_dbase_create(modification, &createdb)?;
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
== pg_constants::XLOG_DBASE_DROP
{
@@ -137,7 +135,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
self.put_slru_page_image(
&mut modification,
modification,
SlruKind::Clog,
segno,
rpageno,
@@ -146,7 +144,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
} else {
assert!(info == pg_constants::CLOG_TRUNCATE);
let xlrec = XlClogTruncate::decode(&mut buf);
self.ingest_clog_truncate_record(&mut modification, &xlrec)?;
self.ingest_clog_truncate_record(modification, &xlrec)?;
}
} else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
@@ -154,7 +152,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
self.ingest_xact_record(
&mut modification,
modification,
&parsed_xact,
info == pg_constants::XLOG_XACT_COMMIT,
)?;
@@ -164,7 +162,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
let parsed_xact =
XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
self.ingest_xact_record(
&mut modification,
modification,
&parsed_xact,
info == pg_constants::XLOG_XACT_COMMIT_PREPARED,
)?;
@@ -187,7 +185,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
self.put_slru_page_image(
&mut modification,
modification,
SlruKind::MultiXactOffsets,
segno,
rpageno,
@@ -198,7 +196,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
self.put_slru_page_image(
&mut modification,
modification,
SlruKind::MultiXactMembers,
segno,
rpageno,
@@ -206,14 +204,14 @@ impl<'a, R: Repository> WalIngest<'a, R> {
)?;
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
let xlrec = XlMultiXactCreate::decode(&mut buf);
self.ingest_multixact_create_record(&mut modification, &xlrec)?;
self.ingest_multixact_create_record(modification, &xlrec)?;
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
let xlrec = XlMultiXactTruncate::decode(&mut buf);
self.ingest_multixact_truncate_record(&mut modification, &xlrec)?;
self.ingest_multixact_truncate_record(modification, &xlrec)?;
}
} else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
let xlrec = XlRelmapUpdate::decode(&mut buf);
self.ingest_relmap_page(&mut modification, &xlrec, &decoded)?;
self.ingest_relmap_page(modification, &xlrec, &decoded)?;
} else if decoded.xl_rmid == pg_constants::RM_XLOG_ID {
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
if info == pg_constants::XLOG_NEXTOID {
@@ -248,7 +246,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
// Iterate through all the blocks that the record modifies, and
// "put" a separate copy of the record for each block.
for blk in decoded.blocks.iter() {
self.ingest_decoded_block(&mut modification, lsn, &decoded, blk)?;
self.ingest_decoded_block(modification, lsn, &decoded, blk)?;
}
// If checkpoint data was updated, store the new version in the repository

View File

@@ -150,20 +150,34 @@ pub async fn handle_walreceiver_connection(
waldecoder.feed_bytes(data);
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
let _enter = info_span!("processing record", lsn = %lsn).entered();
// let mut n_records = 0;
// timer = std::time::Instant::now();
{
let mut modification = timeline.begin_modification(last_rec_lsn);
// It is important to deal with the aligned records as lsn in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are
// at risk of hitting a deadlock.
ensure!(lsn.is_aligned());
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
let _enter = info_span!("processing record", lsn = %lsn).entered();
walingest.ingest_record(&timeline, recdata, lsn)?;
// It is important to deal with the aligned records as lsn in getPage@LSN is
// aligned and can be several bytes bigger. Without this alignment we are
// at risk of hitting a deadlock.
ensure!(lsn.is_aligned());
fail_point!("walreceiver-after-ingest");
modification.clear();
modification.lsn = lsn;
walingest.ingest_record(recdata, lsn, &mut modification)?;
last_rec_lsn = lsn;
fail_point!("walreceiver-after-ingest");
last_rec_lsn = lsn;
// n_records += 1;
}
}
// info!(
// "Processing {n_records} records took {}us",
// timer.elapsed().as_micros()
// );
if !caught_up && endlsn >= end_of_wal {
info!("caught up at LSN {endlsn}");

View File

@@ -1,8 +1,11 @@
import os
import statistics
from typing import List
import time
import pytest
from fixtures.compare_fixtures import PgCompare
from fixtures.benchmark_fixture import MetricReport
from fixtures.compare_fixtures import NeonCompare, PgCompare
from fixtures.pg_stats import PgStatTable
from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix
@@ -27,8 +30,14 @@ def test_compare_pg_stats_rw_with_pgbench_default(neon_with_baseline: PgCompare,
env.flush()
with env.record_pg_stats(pg_stats_rw):
env.pg_bin.run_capture(
['pgbench', f'-T{duration}', f'--random-seed={seed}', '-Mprepared', env.pg.connstr()])
env.pg_bin.run_capture([
'pgbench',
f'-T{duration}',
f'--random-seed={seed}',
'-Mprepared',
'-r',
env.pg.connstr()
])
env.flush()
@@ -51,6 +60,7 @@ def test_compare_pg_stats_wo_with_pgbench_simple_update(neon_with_baseline: PgCo
'-N',
f'-T{duration}',
f'--random-seed={seed}',
'-r',
'-Mprepared',
env.pg.connstr()
])
@@ -60,12 +70,12 @@ def test_compare_pg_stats_wo_with_pgbench_simple_update(neon_with_baseline: PgCo
@pytest.mark.parametrize("seed", get_seeds_matrix())
@pytest.mark.parametrize("scale", get_scales_matrix())
@pytest.mark.parametrize("duration", get_durations_matrix(5))
def test_compare_pg_stats_ro_with_pgbench_select_only(neon_with_baseline: PgCompare,
def test_compare_pg_stats_ro_with_pgbench_select_only(neon_compare: NeonCompare,
seed: int,
scale: int,
duration: int,
pg_stats_ro: List[PgStatTable]):
env = neon_with_baseline
env = neon_compare
# initialize pgbench
env.pg_bin.run_capture(['pgbench', f'-s{scale}', '-i', env.pg.connstr()])
env.flush()
@@ -77,6 +87,7 @@ def test_compare_pg_stats_ro_with_pgbench_select_only(neon_with_baseline: PgComp
f'-T{duration}',
f'--random-seed={seed}',
'-Mprepared',
'-r',
env.pg.connstr()
])
env.flush()
@@ -99,3 +110,54 @@ def test_compare_pg_stats_wal_with_pgbench_default(neon_with_baseline: PgCompare
env.pg_bin.run_capture(
['pgbench', f'-T{duration}', f'--random-seed={seed}', '-Mprepared', env.pg.connstr()])
env.flush()
@pytest.mark.parametrize("duration", get_durations_matrix(5))
def test_compare_pg_stats_ro_with_simple_read(neon_with_baseline: PgCompare,
duration: int,
pg_stats_ro: List[PgStatTable]):
env = neon_with_baseline
# initialize test table
with env.pg.connect().cursor() as cur:
cur.execute("CREATE TABLE foo as select generate_series(1,100000)")
start = time.time()
read_latencies = []
with env.record_pg_stats(pg_stats_ro):
with env.pg.connect().cursor() as cur:
while time.time() - start < duration:
t = time.time()
cur.execute("SELECT * from foo")
read_latencies.append(time.time() - t)
env.zenbenchmark.record("read_latency_max",
max(read_latencies),
's',
MetricReport.LOWER_IS_BETTER)
env.zenbenchmark.record("read_latency_avg",
statistics.mean(read_latencies),
's',
MetricReport.LOWER_IS_BETTER)
env.zenbenchmark.record("read_latency_stdev",
statistics.stdev(read_latencies),
's',
MetricReport.LOWER_IS_BETTER)
@pytest.mark.parametrize("duration", get_durations_matrix(10))
def test_compare_pg_stats_wo_with_simple_write(neon_compare: NeonCompare,
duration: int,
pg_stats_wo: List[PgStatTable]):
env = neon_compare
with env.pg.connect().cursor() as cur:
cur.execute(
"CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')"
)
start = time.time()
with env.record_pg_stats(pg_stats_wo):
with env.pg.connect().cursor() as cur:
while time.time() - start < duration:
cur.execute("INSERT INTO foo SELECT FROM generate_series(1,100000)")