mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-03 13:30:38 +00:00
Major storage format rewrite.
This is a backwards-incompatible change. The new pageserver cannot read repositories created with an old pageserver binary, or vice versa. Simplify Repository to a value-store ------------------------------------ Move the responsibility of tracking relation metadata, like which relations exist and what are their sizes, from Repository to a new module, pgdatadir_mapping.rs. The interface to Repository is now a simple key-value PUT/GET operations. It's still not any old key-value store though. A Repository is still responsible from handling branching, and every GET operation comes with an LSN. Mapping from Postgres data directory to keys/values --------------------------------------------------- All the data is now stored in the key-value store. The 'pgdatadir_mapping.rs' module handles mapping from PostgreSQL objects like relation pages and SLRUs, to key-value pairs. The key to the Repository key-value store is a Key struct, which consists of a few integer fields. It's wide enough to store a full RelFileNode, fork and block number, and to distinguish those from metadata keys. 'pgdatadir_mapping.rs' is also responsible for maintaining a "partitioning" of the keyspace. Partitioning means splitting the keyspace so that each partition holds a roughly equal number of keys. The partitioning is used when new image layer files are created, so that each image layer file is roughly the same size. The partitioning is also responsible for reclaiming space used by deleted keys. The Repository implementation doesn't have any explicit support for deleting keys. Instead, the deleted keys are simply omitted from the partitioning, and when a new image layer is created, the omitted keys are not copied over to the new image layer. We might want to implement tombstone keys in the future, to reclaim space faster, but this will work for now. Changes to low-level layer file code ------------------------------------ The concept of a "segment" is gone. Each layer file can now store an arbitrary range of Keys. Checkpointing, compaction ------------------------- The background tasks are somewhat different now. Whenever checkpoint_distance is reached, the WAL receiver thread "freezes" the current in-memory layer, and creates a new one. This is a quick operation and doesn't perform any I/O yet. It then launches a background "layer flushing thread" to write the frozen layer to disk, as a new L0 delta layer. This mechanism takes care of durability. It replaces the checkpointing thread. Compaction is a new background operation that takes a bunch of L0 delta layers, and reshuffles the data in them. It runs in a separate compaction thread. Deployment ---------- This also contains changes to the ansible scripts that enable having multiple different pageservers running at the same time in the staging environment. We will use that to keep an old version of the pageserver running, for clusters created with the old version, at the same time with a new pageserver with the new binary. Author: Heikki Linnakangas Author: Konstantin Knizhnik <knizhnik@zenith.tech> Author: Andrey Taranik <andrey@zenith.tech> Reviewed-by: Matthias Van De Meent <matthias@zenith.tech> Reviewed-by: Bojan Serafimov <bojan@zenith.tech> Reviewed-by: Konstantin Knizhnik <knizhnik@zenith.tech> Reviewed-by: Anton Shyrabokau <antons@zenith.tech> Reviewed-by: Dhammika Pathirana <dham@zenith.tech> Reviewed-by: Kirill Bulatov <kirill@zenith.tech> Reviewed-by: Anastasia Lubennikova <anastasia@zenith.tech> Reviewed-by: Alexey Kondratov <alexey@zenith.tech>
This commit is contained in:
committed by
Alexey Kondratov
parent
55de0b88f5
commit
07342f7519
2
.circleci/ansible/.gitignore
vendored
Normal file
2
.circleci/ansible/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
zenith_install.tar.gz
|
||||
.zenith_current_version
|
||||
@@ -1,14 +1,11 @@
|
||||
- name: Upload Zenith binaries
|
||||
hosts: pageservers:safekeepers
|
||||
hosts: storage
|
||||
gather_facts: False
|
||||
remote_user: admin
|
||||
vars:
|
||||
force_deploy: false
|
||||
|
||||
tasks:
|
||||
|
||||
- name: get latest version of Zenith binaries
|
||||
ignore_errors: true
|
||||
register: current_version_file
|
||||
set_fact:
|
||||
current_version: "{{ lookup('file', '.zenith_current_version') | trim }}"
|
||||
@@ -16,48 +13,13 @@
|
||||
- pageserver
|
||||
- safekeeper
|
||||
|
||||
- name: set zero value for current_version
|
||||
when: current_version_file is failed
|
||||
set_fact:
|
||||
current_version: "0"
|
||||
tags:
|
||||
- pageserver
|
||||
- safekeeper
|
||||
|
||||
- name: get deployed version from content of remote file
|
||||
ignore_errors: true
|
||||
ansible.builtin.slurp:
|
||||
src: /usr/local/.zenith_current_version
|
||||
register: remote_version_file
|
||||
tags:
|
||||
- pageserver
|
||||
- safekeeper
|
||||
|
||||
- name: decode remote file content
|
||||
when: remote_version_file is succeeded
|
||||
set_fact:
|
||||
remote_version: "{{ remote_version_file['content'] | b64decode | trim }}"
|
||||
tags:
|
||||
- pageserver
|
||||
- safekeeper
|
||||
|
||||
- name: set zero value for remote_version
|
||||
when: remote_version_file is failed
|
||||
set_fact:
|
||||
remote_version: "0"
|
||||
tags:
|
||||
- pageserver
|
||||
- safekeeper
|
||||
|
||||
- name: inform about versions
|
||||
debug: msg="Version to deploy - {{ current_version }}, version on storage node - {{ remote_version }}"
|
||||
debug: msg="Version to deploy - {{ current_version }}"
|
||||
tags:
|
||||
- pageserver
|
||||
- safekeeper
|
||||
|
||||
|
||||
- name: upload and extract Zenith binaries to /usr/local
|
||||
when: current_version > remote_version or force_deploy
|
||||
ansible.builtin.unarchive:
|
||||
owner: root
|
||||
group: root
|
||||
@@ -74,14 +36,24 @@
|
||||
hosts: pageservers
|
||||
gather_facts: False
|
||||
remote_user: admin
|
||||
vars:
|
||||
force_deploy: false
|
||||
|
||||
tasks:
|
||||
|
||||
- name: upload init script
|
||||
when: console_mgmt_base_url is defined
|
||||
ansible.builtin.template:
|
||||
src: scripts/init_pageserver.sh
|
||||
dest: /tmp/init_pageserver.sh
|
||||
owner: root
|
||||
group: root
|
||||
mode: '0755'
|
||||
become: true
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
- name: init pageserver
|
||||
when: current_version > remote_version or force_deploy
|
||||
shell:
|
||||
cmd: sudo -u pageserver /usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data
|
||||
cmd: /tmp/init_pageserver.sh
|
||||
args:
|
||||
creates: "/storage/pageserver/data/tenants"
|
||||
environment:
|
||||
@@ -107,7 +79,6 @@
|
||||
# - pageserver
|
||||
|
||||
- name: upload systemd service definition
|
||||
when: current_version > remote_version or force_deploy
|
||||
ansible.builtin.template:
|
||||
src: systemd/pageserver.service
|
||||
dest: /etc/systemd/system/pageserver.service
|
||||
@@ -119,7 +90,6 @@
|
||||
- pageserver
|
||||
|
||||
- name: start systemd service
|
||||
when: current_version > remote_version or force_deploy
|
||||
ansible.builtin.systemd:
|
||||
daemon_reload: yes
|
||||
name: pageserver
|
||||
@@ -130,7 +100,7 @@
|
||||
- pageserver
|
||||
|
||||
- name: post version to console
|
||||
when: (current_version > remote_version or force_deploy) and console_mgmt_base_url is defined
|
||||
when: console_mgmt_base_url is defined
|
||||
shell:
|
||||
cmd: |
|
||||
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
|
||||
@@ -142,22 +112,18 @@
|
||||
hosts: safekeepers
|
||||
gather_facts: False
|
||||
remote_user: admin
|
||||
vars:
|
||||
force_deploy: false
|
||||
|
||||
tasks:
|
||||
|
||||
# in the future safekeepers should discover pageservers byself
|
||||
# but currently use first pageserver that was discovered
|
||||
- name: set first pageserver var for safekeepers
|
||||
when: current_version > remote_version or force_deploy
|
||||
set_fact:
|
||||
first_pageserver: "{{ hostvars[groups['pageservers'][0]]['inventory_hostname'] }}"
|
||||
tags:
|
||||
- safekeeper
|
||||
|
||||
- name: upload systemd service definition
|
||||
when: current_version > remote_version or force_deploy
|
||||
ansible.builtin.template:
|
||||
src: systemd/safekeeper.service
|
||||
dest: /etc/systemd/system/safekeeper.service
|
||||
@@ -169,7 +135,6 @@
|
||||
- safekeeper
|
||||
|
||||
- name: start systemd service
|
||||
when: current_version > remote_version or force_deploy
|
||||
ansible.builtin.systemd:
|
||||
daemon_reload: yes
|
||||
name: safekeeper
|
||||
@@ -180,7 +145,7 @@
|
||||
- safekeeper
|
||||
|
||||
- name: post version to console
|
||||
when: (current_version > remote_version or force_deploy) and console_mgmt_base_url is defined
|
||||
when: console_mgmt_base_url is defined
|
||||
shell:
|
||||
cmd: |
|
||||
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
|
||||
|
||||
@@ -1,7 +1,16 @@
|
||||
[pageservers]
|
||||
zenith-1-ps-1 bucket_name=zenith-storage-oregon bucket_region=us-west-2
|
||||
zenith-1-ps-1 console_region_id=1
|
||||
|
||||
[safekeepers]
|
||||
zenith-1-sk-1
|
||||
zenith-1-sk-2
|
||||
zenith-1-sk-3
|
||||
zenith-1-sk-1 console_region_id=1
|
||||
zenith-1-sk-2 console_region_id=1
|
||||
zenith-1-sk-3 console_region_id=1
|
||||
|
||||
[storage:children]
|
||||
pageservers
|
||||
safekeepers
|
||||
|
||||
[storage:vars]
|
||||
console_mgmt_base_url = http://console-release.local
|
||||
bucket_name = zenith-storage-oregon
|
||||
bucket_region = us-west-2
|
||||
|
||||
30
.circleci/ansible/scripts/init_pageserver.sh
Normal file
30
.circleci/ansible/scripts/init_pageserver.sh
Normal file
@@ -0,0 +1,30 @@
|
||||
#!/bin/sh
|
||||
|
||||
# get instance id from meta-data service
|
||||
INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
|
||||
|
||||
# store fqdn hostname in var
|
||||
HOST=$(hostname -f)
|
||||
|
||||
|
||||
cat <<EOF | tee /tmp/payload
|
||||
{
|
||||
"version": 1,
|
||||
"host": "${HOST}",
|
||||
"port": 6400,
|
||||
"region_id": {{ console_region_id }},
|
||||
"instance_id": "${INSTANCE_ID}",
|
||||
"http_host": "${HOST}",
|
||||
"http_port": 9898
|
||||
}
|
||||
EOF
|
||||
|
||||
# check if pageserver already registered or not
|
||||
if ! curl -sf -X PATCH -d '{}' {{ console_mgmt_base_url }}/api/v1/pageservers/${INSTANCE_ID} -o /dev/null; then
|
||||
|
||||
# not registered, so register it now
|
||||
ID=$(curl -sf -X POST {{ console_mgmt_base_url }}/api/v1/pageservers -d@/tmp/payload | jq -r '.ID')
|
||||
|
||||
# init pageserver
|
||||
sudo -u pageserver /usr/local/bin/pageserver -c "id=${ID}" -c "pg_distrib_dir='/usr/local'" --init -D /storage/pageserver/data
|
||||
fi
|
||||
@@ -1,7 +1,17 @@
|
||||
[pageservers]
|
||||
zenith-us-stage-ps-1 bucket_name=zenith-staging-storage-us-east-1 bucket_region=us-east-1
|
||||
#zenith-us-stage-ps-1 console_region_id=27
|
||||
zenith-us-stage-ps-2 console_region_id=27
|
||||
|
||||
[safekeepers]
|
||||
zenith-us-stage-sk-1
|
||||
zenith-us-stage-sk-2
|
||||
zenith-us-stage-sk-3
|
||||
zenith-us-stage-sk-1 console_region_id=27
|
||||
zenith-us-stage-sk-2 console_region_id=27
|
||||
zenith-us-stage-sk-3 console_region_id=27
|
||||
|
||||
[storage:children]
|
||||
pageservers
|
||||
safekeepers
|
||||
|
||||
[storage:vars]
|
||||
console_mgmt_base_url = http://console-staging.local
|
||||
bucket_name = zenith-staging-storage-us-east-1
|
||||
bucket_region = us-east-1
|
||||
|
||||
@@ -618,7 +618,7 @@ jobs:
|
||||
ssh-add ssh-key
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
|
||||
ansible-playbook deploy.yaml -i production.hosts -e console_mgmt_base_url=http://console-release.local
|
||||
ansible-playbook deploy.yaml -i production.hosts
|
||||
rm -f zenith_install.tar.gz .zenith_current_version
|
||||
|
||||
deploy-release-proxy:
|
||||
|
||||
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -1467,6 +1467,7 @@ dependencies = [
|
||||
"hex-literal",
|
||||
"humantime",
|
||||
"hyper",
|
||||
"itertools",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"nix",
|
||||
|
||||
@@ -21,7 +21,7 @@ NOTE:It has nothing to do with PostgreSQL pg_basebackup.
|
||||
|
||||
### Branch
|
||||
|
||||
We can create branch at certain LSN using `zenith branch` command.
|
||||
We can create branch at certain LSN using `zenith timeline branch` command.
|
||||
Each Branch lives in a corresponding timeline[] and has an ancestor[].
|
||||
|
||||
|
||||
@@ -36,17 +36,25 @@ A checkpoint record in the WAL marks a point in the WAL sequence at which it is
|
||||
NOTE: This is an overloaded term.
|
||||
|
||||
Whenever enough WAL has been accumulated in memory, the page server []
|
||||
writes out the changes from in-memory layers into new layer files[]. This process
|
||||
is called "checkpointing". The page server only creates layer files for
|
||||
relations that have been modified since the last checkpoint.
|
||||
writes out the changes from the in-memory layer into a new delta layer file. This process
|
||||
is called "checkpointing".
|
||||
|
||||
Configuration parameter `checkpoint_distance` defines the distance
|
||||
from current LSN to perform checkpoint of in-memory layers.
|
||||
Default is `DEFAULT_CHECKPOINT_DISTANCE`.
|
||||
Set this parameter to `0` to force checkpoint of every layer.
|
||||
|
||||
Configuration parameter `checkpoint_period` defines the interval between checkpoint iterations.
|
||||
Default is `DEFAULT_CHECKPOINT_PERIOD`.
|
||||
### Compaction
|
||||
|
||||
A background operation on layer files. Compaction takes a number of L0
|
||||
layer files, each of which covers the whole key space and a range of
|
||||
LSN, and reshuffles the data in them into L1 files so that each file
|
||||
covers the whole LSN range, but only part of the key space.
|
||||
|
||||
Compaction should also opportunistically leave obsolete page versions
|
||||
from the L1 files, and materialize other page versions for faster
|
||||
access. That hasn't been implemented as of this writing, though.
|
||||
|
||||
|
||||
### Compute node
|
||||
|
||||
Stateless Postgres node that stores data in pageserver.
|
||||
@@ -54,10 +62,10 @@ Stateless Postgres node that stores data in pageserver.
|
||||
### Garbage collection
|
||||
|
||||
The process of removing old on-disk layers that are not needed by any timeline anymore.
|
||||
|
||||
### Fork
|
||||
|
||||
Each of the separate segmented file sets in which a relation is stored. The main fork is where the actual data resides. There also exist two secondary forks for metadata: the free space map and the visibility map.
|
||||
Each PostgreSQL fork is considered a separate relish.
|
||||
|
||||
### Layer
|
||||
|
||||
@@ -72,15 +80,15 @@ are immutable. See pageserver/src/layered_repository/README.md for more.
|
||||
### Layer file (on-disk layer)
|
||||
|
||||
Layered repository on-disk format is based on immutable files. The
|
||||
files are called "layer files". Each file corresponds to one RELISH_SEG_SIZE
|
||||
segment of a PostgreSQL relation fork. There are two kinds of layer
|
||||
files: image files and delta files. An image file contains a
|
||||
"snapshot" of the segment at a particular LSN, and a delta file
|
||||
contains WAL records applicable to the segment, in a range of LSNs.
|
||||
files are called "layer files". There are two kinds of layer files:
|
||||
image files and delta files. An image file contains a "snapshot" of a
|
||||
range of keys at a particular LSN, and a delta file contains WAL
|
||||
records applicable to a range of keys, in a range of LSNs.
|
||||
|
||||
### Layer map
|
||||
|
||||
The layer map tracks what layers exist for all the relishes in a timeline.
|
||||
The layer map tracks what layers exist in a timeline.
|
||||
|
||||
### Layered repository
|
||||
|
||||
Zenith repository implementation that keeps data in layers.
|
||||
@@ -149,14 +157,6 @@ and create new databases and accounts (control plane API in our case).
|
||||
|
||||
The generic term in PostgreSQL for all objects in a database that have a name and a list of attributes defined in a specific order.
|
||||
|
||||
### Relish
|
||||
|
||||
We call each relation and other file that is stored in the
|
||||
repository a "relish". It comes from "rel"-ish, as in "kind of a
|
||||
rel", because it covers relations as well as other things that are
|
||||
not relations, but are treated similarly for the purposes of the
|
||||
storage layer.
|
||||
|
||||
### Replication slot
|
||||
|
||||
|
||||
@@ -173,27 +173,18 @@ One repository corresponds to one Tenant.
|
||||
|
||||
How much history do we need to keep around for PITR and read-only nodes?
|
||||
|
||||
### Segment (PostgreSQL)
|
||||
|
||||
NOTE: This is an overloaded term.
|
||||
### Segment
|
||||
|
||||
A physical file that stores data for a given relation. File segments are
|
||||
limited in size by a compile-time setting (1 gigabyte by default), so if a
|
||||
relation exceeds that size, it is split into multiple segments.
|
||||
|
||||
### Segment (Layered Repository)
|
||||
|
||||
NOTE: This is an overloaded term.
|
||||
|
||||
Segment is a RELISH_SEG_SIZE slice of relish (identified by a SegmentTag).
|
||||
|
||||
### SLRU
|
||||
|
||||
SLRUs include pg_clog, pg_multixact/members, and
|
||||
pg_multixact/offsets. There are other SLRUs in PostgreSQL, but
|
||||
they don't need to be stored permanently (e.g. pg_subtrans),
|
||||
or we do not support them in zenith yet (pg_commit_ts).
|
||||
Each SLRU segment is considered a separate relish[].
|
||||
|
||||
### Tenant (Multitenancy)
|
||||
Tenant represents a single customer, interacting with Zenith.
|
||||
|
||||
145
docs/rfcs/014-storage-lsm.md
Normal file
145
docs/rfcs/014-storage-lsm.md
Normal file
@@ -0,0 +1,145 @@
|
||||
# Why LSM trees?
|
||||
|
||||
In general, an LSM tree has the nice property that random updates are
|
||||
fast, but the disk writes are sequential. When a new file is created,
|
||||
it is immutable. New files are created and old ones are deleted, but
|
||||
existing files are never modified. That fits well with storing the
|
||||
files on S3.
|
||||
|
||||
Currently, we create a lot of small files. That is mostly a problem
|
||||
with S3, because each GET/PUT operation is expensive, and LIST
|
||||
operation only returns 1000 objects at a time, and isn't free
|
||||
either. Currently, the files are "archived" together into larger
|
||||
checkpoint files before they're uploaded to S3 to alleviate that
|
||||
problem, but garbage collecting data from the archive files would be
|
||||
difficult and we have not implemented it. This proposal addresses that
|
||||
problem.
|
||||
|
||||
|
||||
# Overview
|
||||
|
||||
|
||||
```
|
||||
^ LSN
|
||||
|
|
||||
| Memtable: +-----------------------------+
|
||||
| | |
|
||||
| +-----------------------------+
|
||||
|
|
||||
|
|
||||
| L0: +-----------------------------+
|
||||
| | |
|
||||
| +-----------------------------+
|
||||
|
|
||||
| +-----------------------------+
|
||||
| | |
|
||||
| +-----------------------------+
|
||||
|
|
||||
| +-----------------------------+
|
||||
| | |
|
||||
| +-----------------------------+
|
||||
|
|
||||
| +-----------------------------+
|
||||
| | |
|
||||
| +-----------------------------+
|
||||
|
|
||||
|
|
||||
| L1: +-------+ +-----+ +--+ +-+
|
||||
| | | | | | | | |
|
||||
| | | | | | | | |
|
||||
| +-------+ +-----+ +--+ +-+
|
||||
|
|
||||
| +----+ +-----+ +--+ +----+
|
||||
| | | | | | | | |
|
||||
| | | | | | | | |
|
||||
| +----+ +-----+ +--+ +----+
|
||||
|
|
||||
+--------------------------------------------------------------> Page ID
|
||||
|
||||
|
||||
+---+
|
||||
| | Layer file
|
||||
+---+
|
||||
```
|
||||
|
||||
|
||||
# Memtable
|
||||
|
||||
When new WAL arrives, it is first put into the Memtable. Despite the
|
||||
name, the Memtable is not a purely in-memory data structure. It can
|
||||
spill to a temporary file on disk if the system is low on memory, and
|
||||
is accessed through a buffer cache.
|
||||
|
||||
If the page server crashes, the Memtable is lost. It is rebuilt by
|
||||
processing again the WAL that's newer than the latest layer in L0.
|
||||
|
||||
The size of the Memtable is configured by the "checkpoint distance"
|
||||
setting. Because anything that hasn't been flushed to disk and
|
||||
uploaded to S3 yet needs to be kept in the safekeeper, the "checkpoint
|
||||
distance" also determines the amount of WAL that needs to kept in the
|
||||
safekeeper.
|
||||
|
||||
# L0
|
||||
|
||||
When the Memtable fills up, it is written out to a new file in L0. The
|
||||
files are immutable; when a file is created, it is never
|
||||
modified. Each file in L0 is roughly 1 GB in size (*). Like the
|
||||
Memtable, each file in L0 covers the whole key range.
|
||||
|
||||
When enough files have been accumulated in L0, compaction
|
||||
starts. Compaction processes all the files in L0 and reshuffles the
|
||||
data to create a new set of files in L1.
|
||||
|
||||
|
||||
(*) except in corner cases like if we want to shut down the page
|
||||
server and want to flush out the memtable to disk even though it's not
|
||||
full yet.
|
||||
|
||||
|
||||
# L1
|
||||
|
||||
L1 consists of ~ 1 GB files like L0. But each file covers only part of
|
||||
the overall key space, and a larger range of LSNs. This speeds up
|
||||
searches. When you're looking for a given page, you need to check all
|
||||
the files in L0, to see if they contain a page version for the requested
|
||||
page. But in L1, you only need to check the files whose key range covers
|
||||
the requested page. This is particularly important at cold start, when
|
||||
checking a file means downloading it from S3.
|
||||
|
||||
Partitioning by key range also helps with garbage collection. If only a
|
||||
part of the database is updated, we will accumulate more files for
|
||||
the hot part in L1, and old files can be removed without affecting the
|
||||
cold part.
|
||||
|
||||
|
||||
# Image layers
|
||||
|
||||
So far, we've only talked about delta layers. In addition to the delta
|
||||
layers, we create image layers, when "enough" WAL has been accumulated
|
||||
for some part of the database. Each image layer covers a 1 GB range of
|
||||
key space. It contains images of the pages at a single LSN, a snapshot
|
||||
if you will.
|
||||
|
||||
The exact heuristic for what "enough" means is not clear yet. Maybe
|
||||
create a new image layer when 10 GB of WAL has been accumulated for a
|
||||
1 GB segment.
|
||||
|
||||
The image layers limit the number of layers that a search needs to
|
||||
check. That put a cap on read latency, and it also allows garbage
|
||||
collecting layers that are older than the GC horizon.
|
||||
|
||||
|
||||
# Partitioning scheme
|
||||
|
||||
When compaction happens and creates a new set of files in L1, how do
|
||||
we partition the data into the files?
|
||||
|
||||
- Goal is that each file is ~ 1 GB in size
|
||||
- Try to match partition boundaries at relation boundaries. (See [1]
|
||||
for how PebblesDB does this, and for why that's important)
|
||||
- Greedy algorithm
|
||||
|
||||
# Additional Reading
|
||||
|
||||
[1] Paper on PebblesDB and how it does partitioning.
|
||||
https://www.cs.utexas.edu/~rak/papers/sosp17-pebblesdb.pdf
|
||||
@@ -68,11 +68,11 @@ S3.
|
||||
|
||||
The unit is # of bytes.
|
||||
|
||||
#### checkpoint_period
|
||||
#### compaction_period
|
||||
|
||||
The pageserver checks whether `checkpoint_distance` has been reached
|
||||
every `checkpoint_period` seconds. Default is 1 s, which should be
|
||||
fine.
|
||||
Every `compaction_period` seconds, the page server checks if
|
||||
maintenance operations, like compaction, are needed on the layer
|
||||
files. Default is 1 s, which should be fine.
|
||||
|
||||
#### gc_horizon
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ bytes = { version = "1.0.1", features = ['serde'] }
|
||||
byteorder = "1.4.3"
|
||||
futures = "0.3.13"
|
||||
hyper = "0.14"
|
||||
itertools = "0.10.3"
|
||||
lazy_static = "1.4.0"
|
||||
log = "0.4.14"
|
||||
clap = "3.0"
|
||||
|
||||
@@ -20,8 +20,9 @@ use std::sync::Arc;
|
||||
use std::time::SystemTime;
|
||||
use tar::{Builder, EntryType, Header};
|
||||
|
||||
use crate::relish::*;
|
||||
use crate::reltag::SlruKind;
|
||||
use crate::repository::Timeline;
|
||||
use crate::DatadirTimelineImpl;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::*;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
@@ -31,7 +32,7 @@ use zenith_utils::lsn::Lsn;
|
||||
/// used for constructing tarball.
|
||||
pub struct Basebackup<'a> {
|
||||
ar: Builder<&'a mut dyn Write>,
|
||||
timeline: &'a Arc<dyn Timeline>,
|
||||
timeline: &'a Arc<DatadirTimelineImpl>,
|
||||
pub lsn: Lsn,
|
||||
prev_record_lsn: Lsn,
|
||||
}
|
||||
@@ -46,7 +47,7 @@ pub struct Basebackup<'a> {
|
||||
impl<'a> Basebackup<'a> {
|
||||
pub fn new(
|
||||
write: &'a mut dyn Write,
|
||||
timeline: &'a Arc<dyn Timeline>,
|
||||
timeline: &'a Arc<DatadirTimelineImpl>,
|
||||
req_lsn: Option<Lsn>,
|
||||
) -> Result<Basebackup<'a>> {
|
||||
// Compute postgres doesn't have any previous WAL files, but the first
|
||||
@@ -64,13 +65,13 @@ impl<'a> Basebackup<'a> {
|
||||
// prev_lsn to Lsn(0) if we cannot provide the correct value.
|
||||
let (backup_prev, backup_lsn) = if let Some(req_lsn) = req_lsn {
|
||||
// Backup was requested at a particular LSN. Wait for it to arrive.
|
||||
timeline.wait_lsn(req_lsn)?;
|
||||
timeline.tline.wait_lsn(req_lsn)?;
|
||||
|
||||
// If the requested point is the end of the timeline, we can
|
||||
// provide prev_lsn. (get_last_record_rlsn() might return it as
|
||||
// zero, though, if no WAL has been generated on this timeline
|
||||
// yet.)
|
||||
let end_of_timeline = timeline.get_last_record_rlsn();
|
||||
let end_of_timeline = timeline.tline.get_last_record_rlsn();
|
||||
if req_lsn == end_of_timeline.last {
|
||||
(end_of_timeline.prev, req_lsn)
|
||||
} else {
|
||||
@@ -78,7 +79,7 @@ impl<'a> Basebackup<'a> {
|
||||
}
|
||||
} else {
|
||||
// Backup was requested at end of the timeline.
|
||||
let end_of_timeline = timeline.get_last_record_rlsn();
|
||||
let end_of_timeline = timeline.tline.get_last_record_rlsn();
|
||||
(end_of_timeline.prev, end_of_timeline.last)
|
||||
};
|
||||
|
||||
@@ -115,21 +116,24 @@ impl<'a> Basebackup<'a> {
|
||||
}
|
||||
|
||||
// Gather non-relational files from object storage pages.
|
||||
for obj in self.timeline.list_nonrels(self.lsn)? {
|
||||
match obj {
|
||||
RelishTag::Slru { slru, segno } => {
|
||||
self.add_slru_segment(slru, segno)?;
|
||||
}
|
||||
RelishTag::FileNodeMap { spcnode, dbnode } => {
|
||||
self.add_relmap_file(spcnode, dbnode)?;
|
||||
}
|
||||
RelishTag::TwoPhase { xid } => {
|
||||
self.add_twophase_file(xid)?;
|
||||
}
|
||||
_ => {}
|
||||
for kind in [
|
||||
SlruKind::Clog,
|
||||
SlruKind::MultiXactOffsets,
|
||||
SlruKind::MultiXactMembers,
|
||||
] {
|
||||
for segno in self.timeline.list_slru_segments(kind, self.lsn)? {
|
||||
self.add_slru_segment(kind, segno)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Create tablespace directories
|
||||
for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn)? {
|
||||
self.add_dbdir(spcnode, dbnode, has_relmap_file)?;
|
||||
}
|
||||
for xid in self.timeline.list_twophase_files(self.lsn)? {
|
||||
self.add_twophase_file(xid)?;
|
||||
}
|
||||
|
||||
// Generate pg_control and bootstrap WAL segment.
|
||||
self.add_pgcontrol_file()?;
|
||||
self.ar.finish()?;
|
||||
@@ -141,28 +145,14 @@ impl<'a> Basebackup<'a> {
|
||||
// Generate SLRU segment files from repository.
|
||||
//
|
||||
fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
|
||||
let seg_size = self
|
||||
.timeline
|
||||
.get_relish_size(RelishTag::Slru { slru, segno }, self.lsn)?;
|
||||
|
||||
let nblocks = match seg_size {
|
||||
Some(seg_size) => seg_size,
|
||||
None => {
|
||||
trace!(
|
||||
"SLRU segment {}/{:>04X} was truncated",
|
||||
slru.to_str(),
|
||||
segno
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
let nblocks = self.timeline.get_slru_segment_size(slru, segno, self.lsn)?;
|
||||
|
||||
let mut slru_buf: Vec<u8> =
|
||||
Vec::with_capacity(nblocks as usize * pg_constants::BLCKSZ as usize);
|
||||
for blknum in 0..nblocks {
|
||||
let img =
|
||||
self.timeline
|
||||
.get_page_at_lsn(RelishTag::Slru { slru, segno }, blknum, self.lsn)?;
|
||||
let img = self
|
||||
.timeline
|
||||
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?;
|
||||
ensure!(img.len() == pg_constants::BLCKSZ as usize);
|
||||
|
||||
slru_buf.extend_from_slice(&img);
|
||||
@@ -177,16 +167,26 @@ impl<'a> Basebackup<'a> {
|
||||
}
|
||||
|
||||
//
|
||||
// Extract pg_filenode.map files from repository
|
||||
// Along with them also send PG_VERSION for each database.
|
||||
// Include database/tablespace directories.
|
||||
//
|
||||
fn add_relmap_file(&mut self, spcnode: u32, dbnode: u32) -> anyhow::Result<()> {
|
||||
let img = self.timeline.get_page_at_lsn(
|
||||
RelishTag::FileNodeMap { spcnode, dbnode },
|
||||
0,
|
||||
self.lsn,
|
||||
)?;
|
||||
let path = if spcnode == pg_constants::GLOBALTABLESPACE_OID {
|
||||
// Each directory contains a PG_VERSION file, and the default database
|
||||
// directories also contain pg_filenode.map files.
|
||||
//
|
||||
fn add_dbdir(
|
||||
&mut self,
|
||||
spcnode: u32,
|
||||
dbnode: u32,
|
||||
has_relmap_file: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let relmap_img = if has_relmap_file {
|
||||
let img = self.timeline.get_relmap_file(spcnode, dbnode, self.lsn)?;
|
||||
ensure!(img.len() == 512);
|
||||
Some(img)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
if spcnode == pg_constants::GLOBALTABLESPACE_OID {
|
||||
let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
|
||||
let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?;
|
||||
self.ar.append(&header, version_bytes)?;
|
||||
@@ -194,8 +194,32 @@ impl<'a> Basebackup<'a> {
|
||||
let header = new_tar_header("global/PG_VERSION", version_bytes.len() as u64)?;
|
||||
self.ar.append(&header, version_bytes)?;
|
||||
|
||||
String::from("global/pg_filenode.map") // filenode map for global tablespace
|
||||
if let Some(img) = relmap_img {
|
||||
// filenode map for global tablespace
|
||||
let header = new_tar_header("global/pg_filenode.map", img.len() as u64)?;
|
||||
self.ar.append(&header, &img[..])?;
|
||||
} else {
|
||||
warn!("global/pg_filenode.map is missing");
|
||||
}
|
||||
} else {
|
||||
// User defined tablespaces are not supported. However, as
|
||||
// a special case, if a tablespace/db directory is
|
||||
// completely empty, we can leave it out altogether. This
|
||||
// makes taking a base backup after the 'tablespace'
|
||||
// regression test pass, because the test drops the
|
||||
// created tablespaces after the tests.
|
||||
//
|
||||
// FIXME: this wouldn't be necessary, if we handled
|
||||
// XLOG_TBLSPC_DROP records. But we probably should just
|
||||
// throw an error on CREATE TABLESPACE in the first place.
|
||||
if !has_relmap_file
|
||||
&& self
|
||||
.timeline
|
||||
.list_rels(spcnode, dbnode, self.lsn)?
|
||||
.is_empty()
|
||||
{
|
||||
return Ok(());
|
||||
}
|
||||
// User defined tablespaces are not supported
|
||||
ensure!(spcnode == pg_constants::DEFAULTTABLESPACE_OID);
|
||||
|
||||
@@ -204,16 +228,17 @@ impl<'a> Basebackup<'a> {
|
||||
let header = new_tar_header_dir(&path)?;
|
||||
self.ar.append(&header, &mut io::empty())?;
|
||||
|
||||
let dst_path = format!("base/{}/PG_VERSION", dbnode);
|
||||
let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
|
||||
let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
|
||||
self.ar.append(&header, version_bytes)?;
|
||||
if let Some(img) = relmap_img {
|
||||
let dst_path = format!("base/{}/PG_VERSION", dbnode);
|
||||
let version_bytes = pg_constants::PG_MAJORVERSION.as_bytes();
|
||||
let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
|
||||
self.ar.append(&header, version_bytes)?;
|
||||
|
||||
format!("base/{}/pg_filenode.map", dbnode)
|
||||
let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
|
||||
let header = new_tar_header(&relmap_path, img.len() as u64)?;
|
||||
self.ar.append(&header, &img[..])?;
|
||||
}
|
||||
};
|
||||
ensure!(img.len() == 512);
|
||||
let header = new_tar_header(&path, img.len() as u64)?;
|
||||
self.ar.append(&header, &img[..])?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -221,9 +246,7 @@ impl<'a> Basebackup<'a> {
|
||||
// Extract twophase state files
|
||||
//
|
||||
fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_page_at_lsn(RelishTag::TwoPhase { xid }, 0, self.lsn)?;
|
||||
let img = self.timeline.get_twophase_file(xid, self.lsn)?;
|
||||
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(&img[..]);
|
||||
@@ -243,11 +266,11 @@ impl<'a> Basebackup<'a> {
|
||||
fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
|
||||
let checkpoint_bytes = self
|
||||
.timeline
|
||||
.get_page_at_lsn(RelishTag::Checkpoint, 0, self.lsn)
|
||||
.get_checkpoint(self.lsn)
|
||||
.context("failed to get checkpoint bytes")?;
|
||||
let pg_control_bytes = self
|
||||
.timeline
|
||||
.get_page_at_lsn(RelishTag::ControlFile, 0, self.lsn)
|
||||
.get_control_file(self.lsn)
|
||||
.context("failed get control bytes")?;
|
||||
let mut pg_control = ControlFileData::decode(&pg_control_bytes)?;
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
@@ -268,7 +291,7 @@ impl<'a> Basebackup<'a> {
|
||||
// add zenith.signal file
|
||||
let mut zenith_signal = String::new();
|
||||
if self.prev_record_lsn == Lsn(0) {
|
||||
if self.lsn == self.timeline.get_ancestor_lsn() {
|
||||
if self.lsn == self.timeline.tline.get_ancestor_lsn() {
|
||||
write!(zenith_signal, "PREV LSN: none")?;
|
||||
} else {
|
||||
write!(zenith_signal, "PREV LSN: invalid")?;
|
||||
|
||||
@@ -20,7 +20,7 @@ use pageserver::{
|
||||
config::{defaults::*, PageServerConf},
|
||||
http, page_cache, page_service,
|
||||
remote_storage::{self, SyncStartupData},
|
||||
repository::TimelineSyncStatusUpdate,
|
||||
repository::{Repository, TimelineSyncStatusUpdate},
|
||||
tenant_mgr, thread_mgr,
|
||||
thread_mgr::ThreadKind,
|
||||
timelines, virtual_file, LOG_FILE_NAME,
|
||||
|
||||
@@ -31,7 +31,8 @@ pub mod defaults {
|
||||
// would be more appropriate. But a low value forces the code to be exercised more,
|
||||
// which is good for now to trigger bugs.
|
||||
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
|
||||
pub const DEFAULT_CHECKPOINT_PERIOD: &str = "1 s";
|
||||
|
||||
pub const DEFAULT_COMPACTION_PERIOD: &str = "1 s";
|
||||
|
||||
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
pub const DEFAULT_GC_PERIOD: &str = "100 s";
|
||||
@@ -57,7 +58,7 @@ pub mod defaults {
|
||||
#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}'
|
||||
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#checkpoint_period = '{DEFAULT_CHECKPOINT_PERIOD}'
|
||||
#compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
|
||||
|
||||
#gc_period = '{DEFAULT_GC_PERIOD}'
|
||||
#gc_horizon = {DEFAULT_GC_HORIZON}
|
||||
@@ -91,7 +92,9 @@ pub struct PageServerConf {
|
||||
// This puts a backstop on how much WAL needs to be re-digested if the
|
||||
// page server crashes.
|
||||
pub checkpoint_distance: u64,
|
||||
pub checkpoint_period: Duration,
|
||||
|
||||
// How often to check if there's compaction work to be done.
|
||||
pub compaction_period: Duration,
|
||||
|
||||
pub gc_horizon: u64,
|
||||
pub gc_period: Duration,
|
||||
@@ -145,7 +148,8 @@ struct PageServerConfigBuilder {
|
||||
listen_http_addr: BuilderValue<String>,
|
||||
|
||||
checkpoint_distance: BuilderValue<u64>,
|
||||
checkpoint_period: BuilderValue<Duration>,
|
||||
|
||||
compaction_period: BuilderValue<Duration>,
|
||||
|
||||
gc_horizon: BuilderValue<u64>,
|
||||
gc_period: BuilderValue<Duration>,
|
||||
@@ -179,8 +183,8 @@ impl Default for PageServerConfigBuilder {
|
||||
listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()),
|
||||
listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()),
|
||||
checkpoint_distance: Set(DEFAULT_CHECKPOINT_DISTANCE),
|
||||
checkpoint_period: Set(humantime::parse_duration(DEFAULT_CHECKPOINT_PERIOD)
|
||||
.expect("cannot parse default checkpoint period")),
|
||||
compaction_period: Set(humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
|
||||
.expect("cannot parse default compaction period")),
|
||||
gc_horizon: Set(DEFAULT_GC_HORIZON),
|
||||
gc_period: Set(humantime::parse_duration(DEFAULT_GC_PERIOD)
|
||||
.expect("cannot parse default gc period")),
|
||||
@@ -216,8 +220,8 @@ impl PageServerConfigBuilder {
|
||||
self.checkpoint_distance = BuilderValue::Set(checkpoint_distance)
|
||||
}
|
||||
|
||||
pub fn checkpoint_period(&mut self, checkpoint_period: Duration) {
|
||||
self.checkpoint_period = BuilderValue::Set(checkpoint_period)
|
||||
pub fn compaction_period(&mut self, compaction_period: Duration) {
|
||||
self.compaction_period = BuilderValue::Set(compaction_period)
|
||||
}
|
||||
|
||||
pub fn gc_horizon(&mut self, gc_horizon: u64) {
|
||||
@@ -286,9 +290,9 @@ impl PageServerConfigBuilder {
|
||||
checkpoint_distance: self
|
||||
.checkpoint_distance
|
||||
.ok_or(anyhow::anyhow!("missing checkpoint_distance"))?,
|
||||
checkpoint_period: self
|
||||
.checkpoint_period
|
||||
.ok_or(anyhow::anyhow!("missing checkpoint_period"))?,
|
||||
compaction_period: self
|
||||
.compaction_period
|
||||
.ok_or(anyhow::anyhow!("missing compaction_period"))?,
|
||||
gc_horizon: self
|
||||
.gc_horizon
|
||||
.ok_or(anyhow::anyhow!("missing gc_horizon"))?,
|
||||
@@ -337,10 +341,10 @@ pub struct RemoteStorageConfig {
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum RemoteStorageKind {
|
||||
/// Storage based on local file system.
|
||||
/// Specify a root folder to place all stored relish data into.
|
||||
/// Specify a root folder to place all stored files into.
|
||||
LocalFs(PathBuf),
|
||||
/// AWS S3 based storage, storing all relishes into the root
|
||||
/// of the S3 bucket from the config.
|
||||
/// AWS S3 based storage, storing all files in the S3 bucket
|
||||
/// specified by the config
|
||||
AwsS3(S3Config),
|
||||
}
|
||||
|
||||
@@ -425,7 +429,7 @@ impl PageServerConf {
|
||||
"listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?),
|
||||
"listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?),
|
||||
"checkpoint_distance" => builder.checkpoint_distance(parse_toml_u64(key, item)?),
|
||||
"checkpoint_period" => builder.checkpoint_period(parse_toml_duration(key, item)?),
|
||||
"compaction_period" => builder.compaction_period(parse_toml_duration(key, item)?),
|
||||
"gc_horizon" => builder.gc_horizon(parse_toml_u64(key, item)?),
|
||||
"gc_period" => builder.gc_period(parse_toml_duration(key, item)?),
|
||||
"wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?),
|
||||
@@ -561,7 +565,7 @@ impl PageServerConf {
|
||||
PageServerConf {
|
||||
id: ZNodeId(0),
|
||||
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
|
||||
checkpoint_period: Duration::from_secs(10),
|
||||
compaction_period: Duration::from_secs(10),
|
||||
gc_horizon: defaults::DEFAULT_GC_HORIZON,
|
||||
gc_period: Duration::from_secs(10),
|
||||
wait_lsn_timeout: Duration::from_secs(60),
|
||||
@@ -631,7 +635,8 @@ listen_pg_addr = '127.0.0.1:64000'
|
||||
listen_http_addr = '127.0.0.1:9898'
|
||||
|
||||
checkpoint_distance = 111 # in bytes
|
||||
checkpoint_period = '111 s'
|
||||
|
||||
compaction_period = '111 s'
|
||||
|
||||
gc_period = '222 s'
|
||||
gc_horizon = 222
|
||||
@@ -668,7 +673,7 @@ id = 10
|
||||
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
|
||||
checkpoint_period: humantime::parse_duration(defaults::DEFAULT_CHECKPOINT_PERIOD)?,
|
||||
compaction_period: humantime::parse_duration(defaults::DEFAULT_COMPACTION_PERIOD)?,
|
||||
gc_horizon: defaults::DEFAULT_GC_HORIZON,
|
||||
gc_period: humantime::parse_duration(defaults::DEFAULT_GC_PERIOD)?,
|
||||
wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?,
|
||||
@@ -712,7 +717,7 @@ id = 10
|
||||
listen_pg_addr: "127.0.0.1:64000".to_string(),
|
||||
listen_http_addr: "127.0.0.1:9898".to_string(),
|
||||
checkpoint_distance: 111,
|
||||
checkpoint_period: Duration::from_secs(111),
|
||||
compaction_period: Duration::from_secs(111),
|
||||
gc_horizon: 222,
|
||||
gc_period: Duration::from_secs(222),
|
||||
wait_lsn_timeout: Duration::from_secs(111),
|
||||
|
||||
@@ -22,6 +22,7 @@ use super::models::{
|
||||
StatusResponse, TenantCreateRequest, TenantCreateResponse, TimelineCreateRequest,
|
||||
};
|
||||
use crate::remote_storage::{schedule_timeline_download, RemoteIndex};
|
||||
use crate::repository::Repository;
|
||||
use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo};
|
||||
use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId};
|
||||
|
||||
@@ -162,8 +163,11 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||
let local_timeline = {
|
||||
repo.get_timeline(timeline_id)
|
||||
.as_ref()
|
||||
.map(|timeline| {
|
||||
LocalTimelineInfo::from_repo_timeline(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
timeline,
|
||||
include_non_incremental_logical_size,
|
||||
)
|
||||
|
||||
@@ -11,14 +11,15 @@ use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use tracing::*;
|
||||
|
||||
use crate::relish::*;
|
||||
use crate::repository::*;
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::repository::Repository;
|
||||
use crate::walingest::WalIngest;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
use postgres_ffi::waldecoder::*;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::Oid;
|
||||
use postgres_ffi::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED};
|
||||
use postgres_ffi::{Oid, TransactionId};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
|
||||
@@ -27,42 +28,47 @@ use zenith_utils::lsn::Lsn;
|
||||
/// This is currently only used to import a cluster freshly created by initdb.
|
||||
/// The code that deals with the checkpoint would not work right if the
|
||||
/// cluster was not shut down cleanly.
|
||||
pub fn import_timeline_from_postgres_datadir(
|
||||
pub fn import_timeline_from_postgres_datadir<R: Repository>(
|
||||
path: &Path,
|
||||
writer: &dyn TimelineWriter,
|
||||
tline: &mut DatadirTimeline<R>,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
let mut pg_control: Option<ControlFileData> = None;
|
||||
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification.init_empty()?;
|
||||
|
||||
// Scan 'global'
|
||||
let mut relfiles: Vec<PathBuf> = Vec::new();
|
||||
for direntry in fs::read_dir(path.join("global"))? {
|
||||
let direntry = direntry?;
|
||||
match direntry.file_name().to_str() {
|
||||
None => continue,
|
||||
|
||||
Some("pg_control") => {
|
||||
pg_control = Some(import_control_file(writer, lsn, &direntry.path())?);
|
||||
pg_control = Some(import_control_file(&mut modification, &direntry.path())?);
|
||||
}
|
||||
Some("pg_filenode.map") => {
|
||||
import_relmap_file(
|
||||
&mut modification,
|
||||
pg_constants::GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
&direntry.path(),
|
||||
)?;
|
||||
}
|
||||
Some("pg_filenode.map") => import_nonrel_file(
|
||||
writer,
|
||||
lsn,
|
||||
RelishTag::FileNodeMap {
|
||||
spcnode: pg_constants::GLOBALTABLESPACE_OID,
|
||||
dbnode: 0,
|
||||
},
|
||||
&direntry.path(),
|
||||
)?,
|
||||
|
||||
// Load any relation files into the page server
|
||||
_ => import_relfile(
|
||||
&direntry.path(),
|
||||
writer,
|
||||
lsn,
|
||||
pg_constants::GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
)?,
|
||||
// Load any relation files into the page server (but only after the other files)
|
||||
_ => relfiles.push(direntry.path()),
|
||||
}
|
||||
}
|
||||
for relfile in relfiles {
|
||||
import_relfile(
|
||||
&mut modification,
|
||||
&relfile,
|
||||
pg_constants::GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
)?;
|
||||
}
|
||||
|
||||
// Scan 'base'. It contains database dirs, the database OID is the filename.
|
||||
// E.g. 'base/12345', where 12345 is the database OID.
|
||||
@@ -76,54 +82,56 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
|
||||
let dboid = direntry.file_name().to_string_lossy().parse::<u32>()?;
|
||||
|
||||
let mut relfiles: Vec<PathBuf> = Vec::new();
|
||||
for direntry in fs::read_dir(direntry.path())? {
|
||||
let direntry = direntry?;
|
||||
match direntry.file_name().to_str() {
|
||||
None => continue,
|
||||
|
||||
Some("PG_VERSION") => continue,
|
||||
Some("pg_filenode.map") => import_nonrel_file(
|
||||
writer,
|
||||
lsn,
|
||||
RelishTag::FileNodeMap {
|
||||
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dbnode: dboid,
|
||||
},
|
||||
Some("PG_VERSION") => {
|
||||
//modification.put_dbdir_creation(pg_constants::DEFAULTTABLESPACE_OID, dboid)?;
|
||||
}
|
||||
Some("pg_filenode.map") => import_relmap_file(
|
||||
&mut modification,
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
&direntry.path(),
|
||||
)?,
|
||||
|
||||
// Load any relation files into the page server
|
||||
_ => import_relfile(
|
||||
&direntry.path(),
|
||||
writer,
|
||||
lsn,
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
)?,
|
||||
_ => relfiles.push(direntry.path()),
|
||||
}
|
||||
}
|
||||
for relfile in relfiles {
|
||||
import_relfile(
|
||||
&mut modification,
|
||||
&relfile,
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_xact"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(writer, lsn, SlruKind::Clog, &entry.path())?;
|
||||
import_slru_file(&mut modification, SlruKind::Clog, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_multixact").join("members"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(writer, lsn, SlruKind::MultiXactMembers, &entry.path())?;
|
||||
import_slru_file(&mut modification, SlruKind::MultiXactMembers, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(writer, lsn, SlruKind::MultiXactOffsets, &entry.path())?;
|
||||
import_slru_file(&mut modification, SlruKind::MultiXactOffsets, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_twophase"))? {
|
||||
let entry = entry?;
|
||||
let xid = u32::from_str_radix(&entry.path().to_string_lossy(), 16)?;
|
||||
import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
|
||||
import_twophase_file(&mut modification, xid, &entry.path())?;
|
||||
}
|
||||
// TODO: Scan pg_tblspc
|
||||
|
||||
// We're done importing all the data files.
|
||||
writer.advance_last_record_lsn(lsn);
|
||||
modification.commit()?;
|
||||
|
||||
// We expect the Postgres server to be shut down cleanly.
|
||||
let pg_control = pg_control.context("pg_control file not found")?;
|
||||
@@ -141,7 +149,7 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
// *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'.
|
||||
import_wal(
|
||||
&path.join("pg_wal"),
|
||||
writer,
|
||||
tline,
|
||||
Lsn(pg_control.checkPointCopy.redo),
|
||||
lsn,
|
||||
)?;
|
||||
@@ -150,10 +158,9 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
}
|
||||
|
||||
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
|
||||
fn import_relfile(
|
||||
fn import_relfile<R: Repository>(
|
||||
modification: &mut DatadirModification<R>,
|
||||
path: &Path,
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
spcoid: Oid,
|
||||
dboid: Oid,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -169,26 +176,35 @@ fn import_relfile(
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
|
||||
let len = file.metadata().unwrap().len();
|
||||
ensure!(len % pg_constants::BLCKSZ as u64 == 0);
|
||||
let nblocks = len / pg_constants::BLCKSZ as u64;
|
||||
|
||||
if segno != 0 {
|
||||
todo!();
|
||||
}
|
||||
|
||||
let rel = RelTag {
|
||||
spcnode: spcoid,
|
||||
dbnode: dboid,
|
||||
relnode,
|
||||
forknum,
|
||||
};
|
||||
modification.put_rel_creation(rel, nblocks as u32)?;
|
||||
|
||||
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
|
||||
loop {
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
Ok(_) => {
|
||||
let rel = RelTag {
|
||||
spcnode: spcoid,
|
||||
dbnode: dboid,
|
||||
relnode,
|
||||
forknum,
|
||||
};
|
||||
let tag = RelishTag::Relation(rel);
|
||||
timeline.put_page_image(tag, blknum, lsn, Bytes::copy_from_slice(&buf))?;
|
||||
modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
Err(err) => match err.kind() {
|
||||
std::io::ErrorKind::UnexpectedEof => {
|
||||
// reached EOF. That's expected.
|
||||
// FIXME: maybe check that we read the full length of the file?
|
||||
ensure!(blknum == nblocks as u32, "unexpected EOF");
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
@@ -202,16 +218,28 @@ fn import_relfile(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Import a "non-blocky" file into the repository
|
||||
///
|
||||
/// This is used for small files like the control file, twophase files etc. that
|
||||
/// are just slurped into the repository as one blob.
|
||||
///
|
||||
fn import_nonrel_file(
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
tag: RelishTag,
|
||||
/// Import a relmapper (pg_filenode.map) file into the repository
|
||||
fn import_relmap_file<R: Repository>(
|
||||
modification: &mut DatadirModification<R>,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
let mut file = File::open(path)?;
|
||||
let mut buffer = Vec::new();
|
||||
// read the whole file
|
||||
file.read_to_end(&mut buffer)?;
|
||||
|
||||
trace!("importing relmap file {}", path.display());
|
||||
|
||||
modification.put_relmap_file(spcnode, dbnode, Bytes::copy_from_slice(&buffer[..]))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Import a twophase state file (pg_twophase/<xid>) into the repository
|
||||
fn import_twophase_file<R: Repository>(
|
||||
modification: &mut DatadirModification<R>,
|
||||
xid: TransactionId,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
let mut file = File::open(path)?;
|
||||
@@ -221,7 +249,7 @@ fn import_nonrel_file(
|
||||
|
||||
trace!("importing non-rel file {}", path.display());
|
||||
|
||||
timeline.put_page_image(tag, 0, lsn, Bytes::copy_from_slice(&buffer[..]))?;
|
||||
modification.put_twophase_file(xid, Bytes::copy_from_slice(&buffer[..]))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -230,9 +258,8 @@ fn import_nonrel_file(
|
||||
///
|
||||
/// The control file is imported as is, but we also extract the checkpoint record
|
||||
/// from it and store it separated.
|
||||
fn import_control_file(
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
fn import_control_file<R: Repository>(
|
||||
modification: &mut DatadirModification<R>,
|
||||
path: &Path,
|
||||
) -> Result<ControlFileData> {
|
||||
let mut file = File::open(path)?;
|
||||
@@ -243,17 +270,12 @@ fn import_control_file(
|
||||
trace!("importing control file {}", path.display());
|
||||
|
||||
// Import it as ControlFile
|
||||
timeline.put_page_image(
|
||||
RelishTag::ControlFile,
|
||||
0,
|
||||
lsn,
|
||||
Bytes::copy_from_slice(&buffer[..]),
|
||||
)?;
|
||||
modification.put_control_file(Bytes::copy_from_slice(&buffer[..]))?;
|
||||
|
||||
// Extract the checkpoint record and import it separately.
|
||||
let pg_control = ControlFileData::decode(&buffer)?;
|
||||
let checkpoint_bytes = pg_control.checkPointCopy.encode();
|
||||
timeline.put_page_image(RelishTag::Checkpoint, 0, lsn, checkpoint_bytes)?;
|
||||
modification.put_checkpoint(checkpoint_bytes)?;
|
||||
|
||||
Ok(pg_control)
|
||||
}
|
||||
@@ -261,28 +283,34 @@ fn import_control_file(
|
||||
///
|
||||
/// Import an SLRU segment file
|
||||
///
|
||||
fn import_slru_file(
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
fn import_slru_file<R: Repository>(
|
||||
modification: &mut DatadirModification<R>,
|
||||
slru: SlruKind,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
// Does it look like an SLRU file?
|
||||
trace!("importing slru file {}", path.display());
|
||||
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
let segno = u32::from_str_radix(&path.file_name().unwrap().to_string_lossy(), 16)?;
|
||||
|
||||
trace!("importing slru file {}", path.display());
|
||||
let len = file.metadata().unwrap().len();
|
||||
ensure!(len % pg_constants::BLCKSZ as u64 == 0); // we assume SLRU block size is the same as BLCKSZ
|
||||
let nblocks = len / pg_constants::BLCKSZ as u64;
|
||||
|
||||
ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as u64);
|
||||
|
||||
modification.put_slru_segment_creation(slru, segno, nblocks as u32)?;
|
||||
|
||||
let mut rpageno = 0;
|
||||
loop {
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
Ok(_) => {
|
||||
timeline.put_page_image(
|
||||
RelishTag::Slru { slru, segno },
|
||||
modification.put_slru_page_image(
|
||||
slru,
|
||||
segno,
|
||||
rpageno,
|
||||
lsn,
|
||||
Bytes::copy_from_slice(&buf),
|
||||
)?;
|
||||
}
|
||||
@@ -291,7 +319,7 @@ fn import_slru_file(
|
||||
Err(err) => match err.kind() {
|
||||
std::io::ErrorKind::UnexpectedEof => {
|
||||
// reached EOF. That's expected.
|
||||
// FIXME: maybe check that we read the full length of the file?
|
||||
ensure!(rpageno == nblocks as u32, "unexpected EOF");
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
@@ -300,8 +328,6 @@ fn import_slru_file(
|
||||
},
|
||||
};
|
||||
rpageno += 1;
|
||||
|
||||
// TODO: Check that the file isn't unexpectedly large, not larger than SLRU_PAGES_PER_SEGMENT pages
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -309,9 +335,9 @@ fn import_slru_file(
|
||||
|
||||
/// Scan PostgreSQL WAL files in given directory and load all records between
|
||||
/// 'startpoint' and 'endpoint' into the repository.
|
||||
fn import_wal(
|
||||
fn import_wal<R: Repository>(
|
||||
walpath: &Path,
|
||||
writer: &dyn TimelineWriter,
|
||||
tline: &mut DatadirTimeline<R>,
|
||||
startpoint: Lsn,
|
||||
endpoint: Lsn,
|
||||
) -> Result<()> {
|
||||
@@ -321,7 +347,7 @@ fn import_wal(
|
||||
let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = startpoint;
|
||||
|
||||
let mut walingest = WalIngest::new(writer.deref(), startpoint)?;
|
||||
let mut walingest = WalIngest::new(tline, startpoint)?;
|
||||
|
||||
while last_lsn <= endpoint {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
@@ -354,7 +380,7 @@ fn import_wal(
|
||||
let mut nrecords = 0;
|
||||
while last_lsn <= endpoint {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
walingest.ingest_record(writer, recdata, lsn)?;
|
||||
walingest.ingest_record(tline, recdata, lsn)?;
|
||||
last_lsn = lsn;
|
||||
|
||||
nrecords += 1;
|
||||
|
||||
134
pageserver/src/keyspace.rs
Normal file
134
pageserver/src/keyspace.rs
Normal file
@@ -0,0 +1,134 @@
|
||||
use crate::repository::{key_range_size, singleton_range, Key};
|
||||
use postgres_ffi::pg_constants;
|
||||
use std::ops::Range;
|
||||
|
||||
// Target file size, when creating image and delta layers
|
||||
pub const TARGET_FILE_SIZE_BYTES: u64 = 128 * 1024 * 1024; // 128 MB
|
||||
|
||||
///
|
||||
/// Represents a set of Keys, in a compact form.
|
||||
///
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct KeySpace {
|
||||
/// Contiguous ranges of keys that belong to the key space. In key order,
|
||||
/// and with no overlap.
|
||||
pub ranges: Vec<Range<Key>>,
|
||||
}
|
||||
|
||||
impl KeySpace {
|
||||
///
|
||||
/// Partition a key space into roughly chunks of roughly 'target_size' bytes
|
||||
/// in each patition.
|
||||
///
|
||||
pub fn partition(&self, target_size: u64) -> KeyPartitioning {
|
||||
// Assume that each value is 8k in size.
|
||||
let target_nblocks = (target_size / pg_constants::BLCKSZ as u64) as usize;
|
||||
|
||||
let mut parts = Vec::new();
|
||||
let mut current_part = Vec::new();
|
||||
let mut current_part_size: usize = 0;
|
||||
for range in &self.ranges {
|
||||
// If appending the next contiguous range in the keyspace to the current
|
||||
// partition would cause it to be too large, start a new partition.
|
||||
let this_size = key_range_size(range) as usize;
|
||||
if current_part_size + this_size > target_nblocks && !current_part.is_empty() {
|
||||
parts.push(KeySpace {
|
||||
ranges: current_part,
|
||||
});
|
||||
current_part = Vec::new();
|
||||
current_part_size = 0;
|
||||
}
|
||||
|
||||
// If the next range is larger than 'target_size', split it into
|
||||
// 'target_size' chunks.
|
||||
let mut remain_size = this_size;
|
||||
let mut start = range.start;
|
||||
while remain_size > target_nblocks {
|
||||
let next = start.add(target_nblocks as u32);
|
||||
parts.push(KeySpace {
|
||||
ranges: vec![start..next],
|
||||
});
|
||||
start = next;
|
||||
remain_size -= target_nblocks
|
||||
}
|
||||
current_part.push(start..range.end);
|
||||
current_part_size += remain_size;
|
||||
}
|
||||
|
||||
// add last partition that wasn't full yet.
|
||||
if !current_part.is_empty() {
|
||||
parts.push(KeySpace {
|
||||
ranges: current_part,
|
||||
});
|
||||
}
|
||||
|
||||
KeyPartitioning { parts }
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Represents a partitioning of the key space.
|
||||
///
|
||||
/// The only kind of partitioning we do is to partition the key space into
|
||||
/// partitions that are roughly equal in physical size (see KeySpace::partition).
|
||||
/// But this data structure could represent any partitioning.
|
||||
///
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct KeyPartitioning {
|
||||
pub parts: Vec<KeySpace>,
|
||||
}
|
||||
|
||||
impl KeyPartitioning {
|
||||
pub fn new() -> Self {
|
||||
KeyPartitioning { parts: Vec::new() }
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// A helper object, to collect a set of keys and key ranges into a KeySpace
|
||||
/// object. This takes care of merging adjacent keys and key ranges into
|
||||
/// contiguous ranges.
|
||||
///
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct KeySpaceAccum {
|
||||
accum: Option<Range<Key>>,
|
||||
|
||||
ranges: Vec<Range<Key>>,
|
||||
}
|
||||
|
||||
impl KeySpaceAccum {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
accum: None,
|
||||
ranges: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_key(&mut self, key: Key) {
|
||||
self.add_range(singleton_range(key))
|
||||
}
|
||||
|
||||
pub fn add_range(&mut self, range: Range<Key>) {
|
||||
match self.accum.as_mut() {
|
||||
Some(accum) => {
|
||||
if range.start == accum.end {
|
||||
accum.end = range.end;
|
||||
} else {
|
||||
assert!(range.start > accum.end);
|
||||
self.ranges.push(accum.clone());
|
||||
*accum = range;
|
||||
}
|
||||
}
|
||||
None => self.accum = Some(range),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn to_keyspace(mut self) -> KeySpace {
|
||||
if let Some(accum) = self.accum.take() {
|
||||
self.ranges.push(accum);
|
||||
}
|
||||
KeySpace {
|
||||
ranges: self.ranges,
|
||||
}
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,40 +1,42 @@
|
||||
# Overview
|
||||
|
||||
The on-disk format is based on immutable files. The page server receives a
|
||||
stream of incoming WAL, parses the WAL records to determine which pages they
|
||||
apply to, and accumulates the incoming changes in memory. Every now and then,
|
||||
the accumulated changes are written out to new immutable files. This process is
|
||||
called checkpointing. Old versions of on-disk files that are not needed by any
|
||||
timeline are removed by GC process.
|
||||
|
||||
The main responsibility of the Page Server is to process the incoming WAL, and
|
||||
reprocess it into a format that allows reasonably quick access to any page
|
||||
version.
|
||||
version. The page server slices the incoming WAL per relation and page, and
|
||||
packages the sliced WAL into suitably-sized "layer files". The layer files
|
||||
contain all the history of the database, back to some reasonable retention
|
||||
period. This system replaces the base backups and the WAL archive used in a
|
||||
traditional PostgreSQL installation. The layer files are immutable, they are not
|
||||
modified in-place after creation. New layer files are created for new incoming
|
||||
WAL, and old layer files are removed when they are no longer needed.
|
||||
|
||||
The on-disk format is based on immutable files. The page server receives a
|
||||
stream of incoming WAL, parses the WAL records to determine which pages they
|
||||
apply to, and accumulates the incoming changes in memory. Whenever enough WAL
|
||||
has been accumulated in memory, it is written out to a new immutable file. That
|
||||
process accumulates "L0 delta files" on disk. When enough L0 files have been
|
||||
accumulated, they are merged and re-partitioned into L1 files, and old files
|
||||
that are no longer needed are removed by Garbage Collection (GC).
|
||||
|
||||
The incoming WAL contains updates to arbitrary pages in the system. The
|
||||
distribution depends on the workload: the updates could be totally random, or
|
||||
there could be a long stream of updates to a single relation when data is bulk
|
||||
loaded, for example, or something in between. The page server slices the
|
||||
incoming WAL per relation and page, and packages the sliced WAL into
|
||||
suitably-sized "layer files". The layer files contain all the history of the
|
||||
database, back to some reasonable retention period. This system replaces the
|
||||
base backups and the WAL archive used in a traditional PostgreSQL
|
||||
installation. The layer files are immutable, they are not modified in-place
|
||||
after creation. New layer files are created for new incoming WAL, and old layer
|
||||
files are removed when they are no longer needed. We could also replace layer
|
||||
files with new files that contain the same information, merging small files for
|
||||
example, but that hasn't been implemented yet.
|
||||
loaded, for example, or something in between.
|
||||
|
||||
Cloud Storage Page Server Safekeeper
|
||||
L1 L0 Memory WAL
|
||||
|
||||
Cloud Storage Page Server Safekeeper
|
||||
Local disk Memory WAL
|
||||
|
||||
|AAAA| |AAAA|AAAA| |AA
|
||||
|BBBB| |BBBB|BBBB| |
|
||||
|CCCC|CCCC| <---- |CCCC|CCCC|CCCC| <--- |CC <---- ADEBAABED
|
||||
|DDDD|DDDD| |DDDD|DDDD| |DDD
|
||||
|EEEE| |EEEE|EEEE|EEEE| |E
|
||||
|
||||
+----+ +----+----+
|
||||
|AAAA| |AAAA|AAAA| +---+-----+ |
|
||||
+----+ +----+----+ | | | |AA
|
||||
|BBBB| |BBBB|BBBB| |BB | AA | |BB
|
||||
+----+----+ +----+----+ |C | BB | |CC
|
||||
|CCCC|CCCC| <---- |CCCC|CCCC| <--- |D | CC | <--- |DDD <---- ADEBAABED
|
||||
+----+----+ +----+----+ | | DDD | |E
|
||||
|DDDD|DDDD| |DDDD|DDDD| |E | | |
|
||||
+----+----+ +----+----+ | | |
|
||||
|EEEE| |EEEE|EEEE| +---+-----+
|
||||
+----+ +----+----+
|
||||
|
||||
In this illustration, WAL is received as a stream from the Safekeeper, from the
|
||||
right. It is immediately captured by the page server and stored quickly in
|
||||
@@ -42,39 +44,29 @@ memory. The page server memory can be thought of as a quick "reorder buffer",
|
||||
used to hold the incoming WAL and reorder it so that we keep the WAL records for
|
||||
the same page and relation close to each other.
|
||||
|
||||
From the page server memory, whenever enough WAL has been accumulated for one
|
||||
relation segment, it is moved to local disk, as a new layer file, and the memory
|
||||
is released.
|
||||
From the page server memory, whenever enough WAL has been accumulated, it is flushed
|
||||
to disk into a new L0 layer file, and the memory is released.
|
||||
|
||||
When enough L0 files have been accumulated, they are merged together rand sliced
|
||||
per key-space, producing a new set of files where each file contains a more
|
||||
narrow key range, but larger LSN range.
|
||||
|
||||
From the local disk, the layers are further copied to Cloud Storage, for
|
||||
long-term archival. After a layer has been copied to Cloud Storage, it can be
|
||||
removed from local disk, although we currently keep everything locally for fast
|
||||
access. If a layer is needed that isn't found locally, it is fetched from Cloud
|
||||
Storage and stored in local disk.
|
||||
|
||||
# Terms used in layered repository
|
||||
|
||||
- Relish - one PostgreSQL relation or similarly treated file.
|
||||
- Segment - one slice of a Relish that is stored in a LayeredTimeline.
|
||||
- Layer - specific version of a relish Segment in a range of LSNs.
|
||||
Storage and stored in local disk. L0 and L1 files are both uploaded to Cloud
|
||||
Storage.
|
||||
|
||||
# Layer map
|
||||
|
||||
The LayerMap tracks what layers exist for all the relishes in a timeline.
|
||||
|
||||
LayerMap consists of two data structures:
|
||||
- segs - All the layers keyed by segment tag
|
||||
- open_layers - data structure that hold all open layers ordered by oldest_pending_lsn for quick access during checkpointing. oldest_pending_lsn is the LSN of the oldest page version stored in this layer.
|
||||
|
||||
All operations that update InMemory Layers should update both structures to keep them up-to-date.
|
||||
|
||||
- LayeredTimeline - implements Timeline interface.
|
||||
|
||||
All methods of LayeredTimeline are aware of its ancestors and return data taking them into account.
|
||||
TODO: Are there any exceptions to this?
|
||||
For example, timeline.list_rels(lsn) will return all segments that are visible in this timeline at the LSN,
|
||||
including ones that were not modified in this timeline and thus don't have a layer in the timeline's LayerMap.
|
||||
The LayerMap tracks what layers exist in a timeline.
|
||||
|
||||
Currently, the layer map is just a resizeable array (Vec). On a GetPage@LSN or
|
||||
other read request, the layer map scans through the array to find the right layer
|
||||
that contains the data for the requested page. The read-code in LayeredTimeline
|
||||
is aware of the ancestor, and returns data from the ancestor timeline if it's
|
||||
not found on the current timeline.
|
||||
|
||||
# Different kinds of layers
|
||||
|
||||
@@ -92,11 +84,11 @@ To avoid OOM errors, InMemory layers can be spilled to disk into ephemeral file.
|
||||
TODO: Clarify the difference between Closed, Historic and Frozen.
|
||||
|
||||
There are two kinds of OnDisk layers:
|
||||
- ImageLayer represents an image or a snapshot of a 10 MB relish segment, at one particular LSN.
|
||||
- DeltaLayer represents a collection of WAL records or page images in a range of LSNs, for one
|
||||
relish segment.
|
||||
|
||||
Dropped segments are always represented on disk by DeltaLayer.
|
||||
- ImageLayer represents a snapshot of all the keys in a particular range, at one
|
||||
particular LSN. Any keys that are not present in the ImageLayer are known not
|
||||
to exist at that LSN.
|
||||
- DeltaLayer represents a collection of WAL records or page images in a range of
|
||||
LSNs, for a range of keys.
|
||||
|
||||
# Layer life cycle
|
||||
|
||||
@@ -109,71 +101,71 @@ layer or a delta layer, it is a valid end bound. An image layer represents
|
||||
snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
|
||||
|
||||
Every layer starts its life as an Open In-Memory layer. When the page server
|
||||
receives the first WAL record for a segment, it creates a new In-Memory layer
|
||||
for it, and puts it to the layer map. Later, the layer is old enough, its
|
||||
contents are written to disk, as On-Disk layers. This process is called
|
||||
"evicting" a layer.
|
||||
receives the first WAL record for a timeline, it creates a new In-Memory layer
|
||||
for it, and puts it to the layer map. Later, when the layer becomes full, its
|
||||
contents are written to disk, as an on-disk layers.
|
||||
|
||||
Layer eviction is a two-step process: First, the layer is marked as closed, so
|
||||
that it no longer accepts new WAL records, and the layer map is updated
|
||||
accordingly. If a new WAL record for that segment arrives after this step, a new
|
||||
Open layer is created to hold it. After this first step, the layer is a Closed
|
||||
Flushing a layer is a two-step process: First, the layer is marked as closed, so
|
||||
that it no longer accepts new WAL records, and a new in-memory layer is created
|
||||
to hold any WAL after that point. After this first step, the layer is a Closed
|
||||
InMemory state. This first step is called "freezing" the layer.
|
||||
|
||||
In the second step, new Delta and Image layers are created, containing all the
|
||||
data in the Frozen InMemory layer. When the new layers are ready, the original
|
||||
frozen layer is replaced with the new layers in the layer map, and the original
|
||||
frozen layer is dropped, releasing the memory.
|
||||
In the second step, a new Delta layers is created, containing all the data from
|
||||
the Frozen InMemory layer. When it has been created and flushed to disk, the
|
||||
original frozen layer is replaced with the new layers in the layer map, and the
|
||||
original frozen layer is dropped, releasing the memory.
|
||||
|
||||
# Layer files (On-disk layers)
|
||||
|
||||
The files are called "layer files". Each layer file corresponds
|
||||
to one RELISH_SEG_SIZE slice of a PostgreSQL relation fork or
|
||||
non-rel file in a range of LSNs. The layer files
|
||||
for each timeline are stored in the timeline's subdirectory under
|
||||
The files are called "layer files". Each layer file covers a range of keys, and
|
||||
a range of LSNs (or a single LSN, in case of image layers). You can think of it
|
||||
as a rectangle in the two-dimensional key-LSN space. The layer files for each
|
||||
timeline are stored in the timeline's subdirectory under
|
||||
.zenith/tenants/<tenantid>/timelines.
|
||||
|
||||
There are two kind of layer file: base images, and deltas. A base
|
||||
image file contains a layer of a segment as it was at one LSN,
|
||||
whereas a delta file contains modifications to a segment - mostly in
|
||||
the form of WAL records - in a range of LSN
|
||||
There are two kind of layer files: images, and delta layers. An image file
|
||||
contains a snapshot of all keys at a particular LSN, whereas a delta file
|
||||
contains modifications to a segment - mostly in the form of WAL records - in a
|
||||
range of LSN.
|
||||
|
||||
base image file:
|
||||
image file:
|
||||
|
||||
rel_<spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<start LSN>
|
||||
000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
|
||||
start key end key LSN
|
||||
|
||||
The first parts define the key range that the layer covers. See
|
||||
pgdatadir_mapping.rs for how the key space is used. The last part is the LSN.
|
||||
|
||||
delta file:
|
||||
|
||||
rel_<spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<start LSN>_<end LSN>
|
||||
Delta files are named similarly, but they cover a range of LSNs:
|
||||
|
||||
For example:
|
||||
000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
|
||||
start key end key start LSN end LSN
|
||||
|
||||
rel_1663_13990_2609_0_10_000000000169C348
|
||||
rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
|
||||
A delta file contains all the key-values in the key-range that were updated in
|
||||
the LSN range. If a key has not been modified, there is no trace of it in the
|
||||
delta layer.
|
||||
|
||||
In addition to the relations, with "rel_*" prefix, we use the same
|
||||
format for storing various smaller files from the PostgreSQL data
|
||||
directory. They will use different suffixes and the naming scheme up
|
||||
to the LSNs vary. The Zenith source code uses the term "relish" to
|
||||
mean "a relation, or other file that's treated like a relation in the
|
||||
storage" For example, a base image of a CLOG segment would be named
|
||||
like this:
|
||||
|
||||
pg_xact_0000_0_00000000198B06B0
|
||||
A delta layer file can cover a part of the overall key space, as in the previous
|
||||
example, or the whole key range like this:
|
||||
|
||||
There is no difference in how the relation and non-relation files are
|
||||
managed, except that the first part of file names is different.
|
||||
Internally, the relations and non-relation files that are managed in
|
||||
the versioned store are together called "relishes".
|
||||
000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000578C6B29-0000000057A50051
|
||||
|
||||
If a file has been dropped, the last layer file for it is created
|
||||
with the _DROPPED suffix, e.g.
|
||||
|
||||
rel_1663_13990_2609_0_10_000000000169C348_0000000001702000_DROPPED
|
||||
A file that covers the whole key range is called a L0 file (Level 0), while a
|
||||
file that covers only part of the key range is called a L1 file. The "level" of
|
||||
a file is not explicitly stored anywhere, you can only distinguish them by
|
||||
looking at the key range that a file covers. The read-path doesn't need to
|
||||
treat L0 and L1 files any differently.
|
||||
|
||||
|
||||
## Notation used in this document
|
||||
|
||||
FIXME: This is somewhat obsolete, the layer files cover a key-range rather than
|
||||
a particular relation nowadays. However, the description on how you find a page
|
||||
version, and how branching and GC works is still valid.
|
||||
|
||||
The full path of a delta file looks like this:
|
||||
|
||||
.zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
//!
|
||||
//! A DeltaLayer represents a collection of WAL records or page images in a range of
|
||||
//! LSNs, for one segment. It is stored on a file on disk.
|
||||
//! LSNs, and in a range of Keys. It is stored on a file on disk.
|
||||
//!
|
||||
//! Usually a delta layer only contains differences - in the form of WAL records against
|
||||
//! a base LSN. However, if a segment is newly created, by creating a new relation or
|
||||
@@ -11,84 +10,74 @@
|
||||
//! can happen when you create a new branch in the middle of a delta layer, and the WAL
|
||||
//! records on the new branch are put in a new delta layer.
|
||||
//!
|
||||
//! When a delta file needs to be accessed, we slurp the metadata and segsize chapters
|
||||
//! When a delta file needs to be accessed, we slurp the 'index' metadata
|
||||
//! into memory, into the DeltaLayerInner struct. See load() and unload() functions.
|
||||
//! To access a page/WAL record, we search `page_version_metas` for the block # and LSN.
|
||||
//! The byte ranges in the metadata can be used to find the page/WAL record in
|
||||
//! PAGE_VERSIONS_CHAPTER.
|
||||
//! To access a particular value, we search `index` for the given key.
|
||||
//! The byte offset in the index can be used to find the value in
|
||||
//! VALUES_CHAPTER.
|
||||
//!
|
||||
//! On disk, the delta files are stored in timelines/<timelineid> directory.
|
||||
//! Currently, there are no subdirectories, and each delta file is named like this:
|
||||
//!
|
||||
//! <spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<start LSN>_<end LSN>
|
||||
//! <key start>-<key end>__<start LSN>-<end LSN
|
||||
//!
|
||||
//! For example:
|
||||
//!
|
||||
//! 1663_13990_2609_0_5_000000000169C348_000000000169C349
|
||||
//! 000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
|
||||
//!
|
||||
//! If a relation is dropped, we add a '_DROPPED' to the end of the filename to indicate that.
|
||||
//! So the above example would become:
|
||||
//!
|
||||
//! 1663_13990_2609_0_5_000000000169C348_000000000169C349_DROPPED
|
||||
//!
|
||||
//! The end LSN indicates when it was dropped in that case, we don't store it in the
|
||||
//! file contents in any way.
|
||||
//!
|
||||
//! A detlta file is constructed using the 'bookfile' crate. Each file consists of two
|
||||
//! parts: the page versions and the segment sizes. They are stored as separate chapters.
|
||||
//! A delta file is constructed using the 'bookfile' crate. Each file consists of three
|
||||
//! parts: the 'index', the values, and a short summary header. They are stored as
|
||||
//! separate chapters.
|
||||
//!
|
||||
use crate::config::PageServerConf;
|
||||
use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentBlk, SegmentTag,
|
||||
RELISH_SEG_SIZE,
|
||||
BlobRef, Layer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::walrecord;
|
||||
use crate::DELTA_FILE_MAGIC;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
use std::fs;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::ops::Bound::Included;
|
||||
use std::io::BufWriter;
|
||||
use std::io::Write;
|
||||
use std::ops::Range;
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard, TryLockError};
|
||||
|
||||
use bookfile::{Book, BookWriter, BoundedReader, ChapterWriter};
|
||||
use bookfile::{Book, BookWriter, ChapterWriter};
|
||||
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
// Magic constant to identify a Zenith delta file
|
||||
pub const DELTA_FILE_MAGIC: u32 = 0x5A616E01;
|
||||
/// Mapping from (key, lsn) -> page/WAL record
|
||||
/// byte ranges in VALUES_CHAPTER
|
||||
static INDEX_CHAPTER: u64 = 1;
|
||||
|
||||
/// Mapping from (block #, lsn) -> page/WAL record
|
||||
/// byte ranges in PAGE_VERSIONS_CHAPTER
|
||||
static PAGE_VERSION_METAS_CHAPTER: u64 = 1;
|
||||
/// Page/WAL bytes - cannot be interpreted
|
||||
/// without PAGE_VERSION_METAS_CHAPTER
|
||||
static PAGE_VERSIONS_CHAPTER: u64 = 2;
|
||||
static SEG_SIZES_CHAPTER: u64 = 3;
|
||||
/// without the page versions from the INDEX_CHAPTER
|
||||
static VALUES_CHAPTER: u64 = 2;
|
||||
|
||||
/// Contains the [`Summary`] struct
|
||||
static SUMMARY_CHAPTER: u64 = 4;
|
||||
static SUMMARY_CHAPTER: u64 = 3;
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
||||
struct Summary {
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
seg: SegmentTag,
|
||||
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
|
||||
dropped: bool,
|
||||
key_range: Range<Key>,
|
||||
lsn_range: Range<Lsn>,
|
||||
}
|
||||
|
||||
impl From<&DeltaLayer> for Summary {
|
||||
@@ -96,33 +85,17 @@ impl From<&DeltaLayer> for Summary {
|
||||
Self {
|
||||
tenantid: layer.tenantid,
|
||||
timelineid: layer.timelineid,
|
||||
seg: layer.seg,
|
||||
|
||||
start_lsn: layer.start_lsn,
|
||||
end_lsn: layer.end_lsn,
|
||||
|
||||
dropped: layer.dropped,
|
||||
key_range: layer.key_range.clone(),
|
||||
lsn_range: layer.lsn_range.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct BlobRange {
|
||||
offset: u64,
|
||||
size: usize,
|
||||
}
|
||||
|
||||
fn read_blob<F: FileExt>(reader: &BoundedReader<&'_ F>, range: &BlobRange) -> Result<Vec<u8>> {
|
||||
let mut buf = vec![0u8; range.size];
|
||||
reader.read_exact_at(&mut buf, range.offset)?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
///
|
||||
/// DeltaLayer is the in-memory data structure associated with an
|
||||
/// on-disk delta file. We keep a DeltaLayer in memory for each
|
||||
/// file, in the LayerMap. If a layer is in "loaded" state, we have a
|
||||
/// copy of the file in memory, in 'inner'. Otherwise the struct is
|
||||
/// copy of the index in memory, in 'inner'. Otherwise the struct is
|
||||
/// just a placeholder for a file that exists on disk, and it needs to
|
||||
/// be loaded before using it in queries.
|
||||
///
|
||||
@@ -131,47 +104,24 @@ pub struct DeltaLayer {
|
||||
|
||||
pub tenantid: ZTenantId,
|
||||
pub timelineid: ZTimelineId,
|
||||
pub seg: SegmentTag,
|
||||
|
||||
//
|
||||
// This entry contains all the changes from 'start_lsn' to 'end_lsn'. The
|
||||
// start is inclusive, and end is exclusive.
|
||||
//
|
||||
pub start_lsn: Lsn,
|
||||
pub end_lsn: Lsn,
|
||||
|
||||
dropped: bool,
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn_range: Range<Lsn>,
|
||||
|
||||
inner: RwLock<DeltaLayerInner>,
|
||||
}
|
||||
|
||||
pub struct DeltaLayerInner {
|
||||
/// If false, the 'page_version_metas' and 'seg_sizes' have not been
|
||||
/// loaded into memory yet.
|
||||
/// If false, the 'index' has not been loaded into memory yet.
|
||||
loaded: bool,
|
||||
|
||||
///
|
||||
/// All versions of all pages in the layer are kept here.
|
||||
/// Indexed by block number and LSN. The value is an offset into the
|
||||
/// chapter where the page version is stored.
|
||||
///
|
||||
index: HashMap<Key, VecMap<Lsn, BlobRef>>,
|
||||
|
||||
book: Option<Book<VirtualFile>>,
|
||||
|
||||
/// All versions of all pages in the file are are kept here.
|
||||
/// Indexed by block number and LSN.
|
||||
page_version_metas: VecMap<(SegmentBlk, Lsn), BlobRange>,
|
||||
|
||||
/// `seg_sizes` tracks the size of the segment at different points in time.
|
||||
seg_sizes: VecMap<Lsn, SegmentBlk>,
|
||||
}
|
||||
|
||||
impl DeltaLayerInner {
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<SegmentBlk> {
|
||||
// Scan the VecMap backwards, starting from the given entry.
|
||||
let slice = self
|
||||
.seg_sizes
|
||||
.slice_range((Included(&Lsn(0)), Included(&lsn)));
|
||||
if let Some((_entry_lsn, entry)) = slice.last() {
|
||||
Ok(*entry)
|
||||
} else {
|
||||
bail!("could not find seg size in delta layer")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Layer for DeltaLayer {
|
||||
@@ -183,132 +133,93 @@ impl Layer for DeltaLayer {
|
||||
self.timelineid
|
||||
}
|
||||
|
||||
fn get_seg_tag(&self) -> SegmentTag {
|
||||
self.seg
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
|
||||
fn is_dropped(&self) -> bool {
|
||||
self.dropped
|
||||
}
|
||||
|
||||
fn get_start_lsn(&self) -> Lsn {
|
||||
self.start_lsn
|
||||
}
|
||||
|
||||
fn get_end_lsn(&self) -> Lsn {
|
||||
self.end_lsn
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
self.lsn_range.clone()
|
||||
}
|
||||
|
||||
fn filename(&self) -> PathBuf {
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
}
|
||||
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_reconstruct_data(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
blknum: SegmentBlk,
|
||||
lsn: Lsn,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> anyhow::Result<PageReconstructResult> {
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
let mut need_image = true;
|
||||
|
||||
ensure!((0..RELISH_SEG_SIZE).contains(&blknum));
|
||||
|
||||
match &reconstruct_data.page_img {
|
||||
Some((cached_lsn, _)) if &self.end_lsn <= cached_lsn => {
|
||||
return Ok(PageReconstructResult::Complete)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
ensure!(self.key_range.contains(&key));
|
||||
|
||||
{
|
||||
// Open the file and lock the metadata in memory
|
||||
let inner = self.load()?;
|
||||
let page_version_reader = inner
|
||||
let values_reader = inner
|
||||
.book
|
||||
.as_ref()
|
||||
.expect("should be loaded in load call above")
|
||||
.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
.chapter_reader(VALUES_CHAPTER)?;
|
||||
|
||||
// Scan the metadata VecMap backwards, starting from the given entry.
|
||||
let minkey = (blknum, Lsn(0));
|
||||
let maxkey = (blknum, lsn);
|
||||
let iter = inner
|
||||
.page_version_metas
|
||||
.slice_range((Included(&minkey), Included(&maxkey)))
|
||||
.iter()
|
||||
.rev();
|
||||
for ((_blknum, pv_lsn), blob_range) in iter {
|
||||
match &reconstruct_data.page_img {
|
||||
Some((cached_lsn, _)) if pv_lsn <= cached_lsn => {
|
||||
return Ok(PageReconstructResult::Complete)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;
|
||||
|
||||
match pv {
|
||||
PageVersion::Page(img) => {
|
||||
// Found a page image, return it
|
||||
reconstruct_data.page_img = Some((*pv_lsn, img));
|
||||
need_image = false;
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
let mut size = 0usize;
|
||||
let mut first_pos = 0u64;
|
||||
for (_entry_lsn, blob_ref) in slice.iter().rev() {
|
||||
size += blob_ref.size();
|
||||
first_pos = blob_ref.pos();
|
||||
if blob_ref.will_init() {
|
||||
break;
|
||||
}
|
||||
PageVersion::Wal(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_data.records.push((*pv_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
if size != 0 {
|
||||
let mut buf = vec![0u8; size];
|
||||
values_reader.read_exact_at(&mut buf, first_pos)?;
|
||||
for (entry_lsn, blob_ref) in slice.iter().rev() {
|
||||
let offs = (blob_ref.pos() - first_pos) as usize;
|
||||
let val = Value::des(&buf[offs..offs + blob_ref.size()])?;
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((*entry_lsn, img));
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((*entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we didn't find any records for this, check if the request is beyond EOF
|
||||
if need_image
|
||||
&& reconstruct_data.records.is_empty()
|
||||
&& self.seg.rel.is_blocky()
|
||||
&& blknum >= inner.get_seg_size(lsn)?
|
||||
{
|
||||
return Ok(PageReconstructResult::Missing(self.start_lsn));
|
||||
}
|
||||
|
||||
// release metadata lock and close the file
|
||||
}
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(PageReconstructResult::Complete)
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
|
||||
/// Get size of the relation at given LSN
|
||||
fn get_seg_size(&self, lsn: Lsn) -> anyhow::Result<SegmentBlk> {
|
||||
ensure!(lsn >= self.start_lsn);
|
||||
ensure!(
|
||||
self.seg.rel.is_blocky(),
|
||||
"get_seg_size() called on a non-blocky rel"
|
||||
);
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = anyhow::Result<(Key, Lsn, Value)>> + '_> {
|
||||
let inner = self.load().unwrap();
|
||||
|
||||
let inner = self.load()?;
|
||||
inner.get_seg_size(lsn)
|
||||
}
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool> {
|
||||
// Is the requested LSN after the rel was dropped?
|
||||
if self.dropped && lsn >= self.end_lsn {
|
||||
return Ok(false);
|
||||
match DeltaValueIter::new(inner) {
|
||||
Ok(iter) => Box::new(iter),
|
||||
Err(err) => Box::new(std::iter::once(Err(err))),
|
||||
}
|
||||
|
||||
// Otherwise, it exists.
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
///
|
||||
@@ -316,13 +227,22 @@ impl Layer for DeltaLayer {
|
||||
/// it will need to be loaded back.
|
||||
///
|
||||
fn unload(&self) -> Result<()> {
|
||||
// FIXME: In debug mode, loading and unloading the index slows
|
||||
// things down so much that you get timeout errors. At least
|
||||
// with the test_parallel_copy test. So as an even more ad hoc
|
||||
// stopgap fix for that, only unload every on average 10
|
||||
// checkpoint cycles.
|
||||
use rand::RngCore;
|
||||
if rand::thread_rng().next_u32() > (u32::MAX / 10) {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut inner = match self.inner.try_write() {
|
||||
Ok(inner) => inner,
|
||||
Err(TryLockError::WouldBlock) => return Ok(()),
|
||||
Err(TryLockError::Poisoned(_)) => panic!("DeltaLayer lock was poisoned"),
|
||||
};
|
||||
inner.page_version_metas = VecMap::default();
|
||||
inner.seg_sizes = VecMap::default();
|
||||
inner.index = HashMap::default();
|
||||
inner.loaded = false;
|
||||
|
||||
// Note: we keep the Book open. Is that a good idea? The virtual file
|
||||
@@ -349,45 +269,52 @@ impl Layer for DeltaLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self) -> Result<()> {
|
||||
println!(
|
||||
"----- delta layer for ten {} tli {} seg {} {}-{} ----",
|
||||
self.tenantid, self.timelineid, self.seg, self.start_lsn, self.end_lsn
|
||||
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
|
||||
self.tenantid,
|
||||
self.timelineid,
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
self.lsn_range.start,
|
||||
self.lsn_range.end
|
||||
);
|
||||
|
||||
println!("--- seg sizes ---");
|
||||
let inner = self.load()?;
|
||||
for (k, v) in inner.seg_sizes.as_slice() {
|
||||
println!(" {}: {}", k, v);
|
||||
}
|
||||
println!("--- page versions ---");
|
||||
|
||||
let path = self.path();
|
||||
let file = std::fs::File::open(&path)?;
|
||||
let book = Book::new(file)?;
|
||||
let chapter = book.chapter_reader(VALUES_CHAPTER)?;
|
||||
|
||||
let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
|
||||
let mut desc = String::new();
|
||||
let mut values: Vec<(&Key, &VecMap<Lsn, BlobRef>)> = inner.index.iter().collect();
|
||||
values.sort_by_key(|k| k.0);
|
||||
|
||||
let buf = read_blob(&chapter, blob_range)?;
|
||||
let pv = PageVersion::des(&buf)?;
|
||||
for (key, versions) in values {
|
||||
for (lsn, blob_ref) in versions.as_slice() {
|
||||
let mut desc = String::new();
|
||||
let mut buf = vec![0u8; blob_ref.size()];
|
||||
chapter.read_exact_at(&mut buf, blob_ref.pos())?;
|
||||
let val = Value::des(&buf);
|
||||
|
||||
match pv {
|
||||
PageVersion::Page(img) => {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
}
|
||||
PageVersion::Wal(rec) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
blob_range.size,
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)?;
|
||||
match val {
|
||||
Ok(Value::Image(img)) => {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
}
|
||||
Ok(Value::WalRecord(rec)) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)?;
|
||||
}
|
||||
Err(err) => {
|
||||
write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
|
||||
}
|
||||
}
|
||||
println!(" key {} at {}: {}", key, lsn, desc);
|
||||
}
|
||||
|
||||
println!(" blk {} at {}: {}", blk, lsn, desc);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -475,18 +402,13 @@ impl DeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
let chapter = book.read_chapter(PAGE_VERSION_METAS_CHAPTER)?;
|
||||
let page_version_metas = VecMap::des(&chapter)?;
|
||||
|
||||
let chapter = book.read_chapter(SEG_SIZES_CHAPTER)?;
|
||||
let seg_sizes = VecMap::des(&chapter)?;
|
||||
let chapter = book.read_chapter(INDEX_CHAPTER)?;
|
||||
let index = HashMap::des(&chapter)?;
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
inner.page_version_metas = page_version_metas;
|
||||
inner.seg_sizes = seg_sizes;
|
||||
inner.index = index;
|
||||
inner.loaded = true;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -501,15 +423,12 @@ impl DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg: filename.seg,
|
||||
start_lsn: filename.start_lsn,
|
||||
end_lsn: filename.end_lsn,
|
||||
dropped: filename.dropped,
|
||||
key_range: filename.key_range.clone(),
|
||||
lsn_range: filename.lsn_range.clone(),
|
||||
inner: RwLock::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
book: None,
|
||||
page_version_metas: VecMap::default(),
|
||||
seg_sizes: VecMap::default(),
|
||||
index: HashMap::default(),
|
||||
}),
|
||||
}
|
||||
}
|
||||
@@ -519,7 +438,7 @@ impl DeltaLayer {
|
||||
/// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
|
||||
pub fn new_for_path<F>(path: &Path, book: &Book<F>) -> Result<Self>
|
||||
where
|
||||
F: std::os::unix::prelude::FileExt,
|
||||
F: FileExt,
|
||||
{
|
||||
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
|
||||
let summary = Summary::des(&chapter)?;
|
||||
@@ -528,25 +447,20 @@ impl DeltaLayer {
|
||||
path_or_conf: PathOrConf::Path(path.to_path_buf()),
|
||||
timelineid: summary.timelineid,
|
||||
tenantid: summary.tenantid,
|
||||
seg: summary.seg,
|
||||
start_lsn: summary.start_lsn,
|
||||
end_lsn: summary.end_lsn,
|
||||
dropped: summary.dropped,
|
||||
key_range: summary.key_range,
|
||||
lsn_range: summary.lsn_range,
|
||||
inner: RwLock::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
book: None,
|
||||
page_version_metas: VecMap::default(),
|
||||
seg_sizes: VecMap::default(),
|
||||
index: HashMap::default(),
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
fn layer_name(&self) -> DeltaFileName {
|
||||
DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
key_range: self.key_range.clone(),
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -567,24 +481,24 @@ impl DeltaLayer {
|
||||
///
|
||||
/// 1. Create the DeltaLayerWriter by calling DeltaLayerWriter::new(...)
|
||||
///
|
||||
/// 2. Write the contents by calling `put_page_version` for every page
|
||||
/// 2. Write the contents by calling `put_value` for every page
|
||||
/// version to store in the layer.
|
||||
///
|
||||
/// 3. Call `finish`.
|
||||
///
|
||||
pub struct DeltaLayerWriter {
|
||||
conf: &'static PageServerConf,
|
||||
path: PathBuf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
seg: SegmentTag,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
dropped: bool,
|
||||
|
||||
page_version_writer: ChapterWriter<BufWriter<VirtualFile>>,
|
||||
pv_offset: u64,
|
||||
key_start: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
|
||||
page_version_metas: VecMap<(SegmentBlk, Lsn), BlobRange>,
|
||||
index: HashMap<Key, VecMap<Lsn, BlobRef>>,
|
||||
|
||||
values_writer: ChapterWriter<BufWriter<VirtualFile>>,
|
||||
end_offset: u64,
|
||||
}
|
||||
|
||||
impl DeltaLayerWriter {
|
||||
@@ -595,94 +509,86 @@ impl DeltaLayerWriter {
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
seg: SegmentTag,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
dropped: bool,
|
||||
key_start: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
) -> Result<DeltaLayerWriter> {
|
||||
// Create the file
|
||||
// Create the file initially with a temporary filename. We don't know
|
||||
// the end key yet, so we cannot form the final filename yet. We will
|
||||
// rename it when we're done.
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let path = DeltaLayer::path_for(
|
||||
&PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
&DeltaFileName {
|
||||
seg,
|
||||
start_lsn,
|
||||
end_lsn,
|
||||
dropped,
|
||||
},
|
||||
);
|
||||
let path = conf.timeline_path(&timelineid, &tenantid).join(format!(
|
||||
"{}-XXX__{:016X}-{:016X}.temp",
|
||||
key_start,
|
||||
u64::from(lsn_range.start),
|
||||
u64::from(lsn_range.end)
|
||||
));
|
||||
let file = VirtualFile::create(&path)?;
|
||||
let buf_writer = BufWriter::new(file);
|
||||
let book = BookWriter::new(buf_writer, DELTA_FILE_MAGIC)?;
|
||||
|
||||
// Open the page-versions chapter for writing. The calls to
|
||||
// `put_page_version` will use this to write the contents.
|
||||
let page_version_writer = book.new_chapter(PAGE_VERSIONS_CHAPTER);
|
||||
// `put_value` will use this to write the contents.
|
||||
let values_writer = book.new_chapter(VALUES_CHAPTER);
|
||||
|
||||
Ok(DeltaLayerWriter {
|
||||
conf,
|
||||
path,
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg,
|
||||
start_lsn,
|
||||
end_lsn,
|
||||
dropped,
|
||||
page_version_writer,
|
||||
page_version_metas: VecMap::default(),
|
||||
pv_offset: 0,
|
||||
key_start,
|
||||
lsn_range,
|
||||
index: HashMap::new(),
|
||||
values_writer,
|
||||
end_offset: 0,
|
||||
})
|
||||
}
|
||||
|
||||
///
|
||||
/// Append a page version to the file.
|
||||
/// Append a key-value pair to the file.
|
||||
///
|
||||
/// 'buf' is a serialized PageVersion.
|
||||
/// The page versions must be appended in blknum, lsn order.
|
||||
/// The values must be appended in key, lsn order.
|
||||
///
|
||||
pub fn put_page_version(&mut self, blknum: SegmentBlk, lsn: Lsn, buf: &[u8]) -> Result<()> {
|
||||
pub fn put_value(&mut self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
|
||||
//info!("DELTA: key {} at {} on {}", key, lsn, self.path.display());
|
||||
assert!(self.lsn_range.start <= lsn);
|
||||
// Remember the offset and size metadata. The metadata is written
|
||||
// to a separate chapter, in `finish`.
|
||||
let blob_range = BlobRange {
|
||||
offset: self.pv_offset,
|
||||
size: buf.len(),
|
||||
};
|
||||
self.page_version_metas
|
||||
.append((blknum, lsn), blob_range)
|
||||
.unwrap();
|
||||
|
||||
// write the page version
|
||||
self.page_version_writer.write_all(buf)?;
|
||||
self.pv_offset += buf.len() as u64;
|
||||
let off = self.end_offset;
|
||||
let buf = Value::ser(&val)?;
|
||||
let len = buf.len();
|
||||
self.values_writer.write_all(&buf)?;
|
||||
self.end_offset += len as u64;
|
||||
let vec_map = self.index.entry(key).or_default();
|
||||
let blob_ref = BlobRef::new(off, len, val.will_init());
|
||||
let old = vec_map.append_or_update_last(lsn, blob_ref).unwrap().0;
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
bail!(
|
||||
"Value for {} at {} already exists in delta layer being built",
|
||||
key,
|
||||
lsn
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn size(&self) -> u64 {
|
||||
self.end_offset
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish writing the delta layer.
|
||||
///
|
||||
/// 'seg_sizes' is a list of size changes to store with the actual data.
|
||||
///
|
||||
pub fn finish(self, seg_sizes: VecMap<Lsn, SegmentBlk>) -> anyhow::Result<DeltaLayer> {
|
||||
// Close the page-versions chapter
|
||||
let book = self.page_version_writer.close()?;
|
||||
pub fn finish(self, key_end: Key) -> anyhow::Result<DeltaLayer> {
|
||||
// Close the values chapter
|
||||
let book = self.values_writer.close()?;
|
||||
|
||||
// Write out page versions metadata
|
||||
let mut chapter = book.new_chapter(PAGE_VERSION_METAS_CHAPTER);
|
||||
let buf = VecMap::ser(&self.page_version_metas)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
if self.seg.rel.is_blocky() {
|
||||
ensure!(!seg_sizes.is_empty());
|
||||
}
|
||||
|
||||
// and seg_sizes to separate chapter
|
||||
let mut chapter = book.new_chapter(SEG_SIZES_CHAPTER);
|
||||
let buf = VecMap::ser(&seg_sizes)?;
|
||||
// Write out the index
|
||||
let mut chapter = book.new_chapter(INDEX_CHAPTER);
|
||||
let buf = HashMap::ser(&self.index)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
@@ -690,12 +596,8 @@ impl DeltaLayerWriter {
|
||||
let summary = Summary {
|
||||
tenantid: self.tenantid,
|
||||
timelineid: self.timelineid,
|
||||
seg: self.seg,
|
||||
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
|
||||
dropped: self.dropped,
|
||||
key_range: self.key_start..key_end,
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
};
|
||||
Summary::ser_into(&summary, &mut chapter)?;
|
||||
let book = chapter.close()?;
|
||||
@@ -710,20 +612,111 @@ impl DeltaLayerWriter {
|
||||
path_or_conf: PathOrConf::Conf(self.conf),
|
||||
tenantid: self.tenantid,
|
||||
timelineid: self.timelineid,
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
key_range: self.key_start..key_end,
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
inner: RwLock::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
index: HashMap::new(),
|
||||
book: None,
|
||||
page_version_metas: VecMap::default(),
|
||||
seg_sizes: VecMap::default(),
|
||||
}),
|
||||
};
|
||||
|
||||
trace!("created delta layer {}", &layer.path().display());
|
||||
// Rename the file to its final name
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let final_path = DeltaLayer::path_for(
|
||||
&PathOrConf::Conf(self.conf),
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&DeltaFileName {
|
||||
key_range: self.key_start..key_end,
|
||||
lsn_range: self.lsn_range,
|
||||
},
|
||||
);
|
||||
std::fs::rename(self.path, &final_path)?;
|
||||
|
||||
trace!("created delta layer {}", final_path.display());
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
|
||||
pub fn abort(self) {
|
||||
match self.values_writer.close() {
|
||||
Ok(book) => {
|
||||
if let Err(err) = book.close() {
|
||||
error!("error while closing delta layer file: {}", err);
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
error!("error while closing chapter writer: {}", err);
|
||||
}
|
||||
}
|
||||
if let Err(err) = std::fs::remove_file(self.path) {
|
||||
error!("error removing unfinished delta layer file: {}", err);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Iterator over all key-value pairse stored in a delta layer
|
||||
///
|
||||
/// FIXME: This creates a Vector to hold the offsets of all key value pairs.
|
||||
/// That takes up quite a lot of memory. Should do this in a more streaming
|
||||
/// fashion.
|
||||
///
|
||||
struct DeltaValueIter {
|
||||
all_offsets: Vec<(Key, Lsn, BlobRef)>,
|
||||
next_idx: usize,
|
||||
data: Vec<u8>,
|
||||
}
|
||||
|
||||
impl Iterator for DeltaValueIter {
|
||||
type Item = Result<(Key, Lsn, Value)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.next_res().transpose()
|
||||
}
|
||||
}
|
||||
|
||||
impl DeltaValueIter {
|
||||
fn new(inner: RwLockReadGuard<DeltaLayerInner>) -> Result<Self> {
|
||||
let mut index: Vec<(&Key, &VecMap<Lsn, BlobRef>)> = inner.index.iter().collect();
|
||||
index.sort_by_key(|x| x.0);
|
||||
|
||||
let mut all_offsets: Vec<(Key, Lsn, BlobRef)> = Vec::new();
|
||||
for (key, vec_map) in index.iter() {
|
||||
for (lsn, blob_ref) in vec_map.as_slice().iter() {
|
||||
all_offsets.push((**key, *lsn, *blob_ref));
|
||||
}
|
||||
}
|
||||
|
||||
let values_reader = inner
|
||||
.book
|
||||
.as_ref()
|
||||
.expect("should be loaded in load call above")
|
||||
.chapter_reader(VALUES_CHAPTER)?;
|
||||
let file_size = values_reader.len() as usize;
|
||||
let mut layer = DeltaValueIter {
|
||||
all_offsets,
|
||||
next_idx: 0,
|
||||
data: vec![0u8; file_size],
|
||||
};
|
||||
values_reader.read_exact_at(&mut layer.data, 0)?;
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
|
||||
fn next_res(&mut self) -> Result<Option<(Key, Lsn, Value)>> {
|
||||
if self.next_idx < self.all_offsets.len() {
|
||||
let (key, lsn, blob_ref) = self.all_offsets[self.next_idx];
|
||||
let offs = blob_ref.pos() as usize;
|
||||
let size = blob_ref.size();
|
||||
let val = Value::des(&self.data[offs..offs + size])?;
|
||||
self.next_idx += 1;
|
||||
Ok(Some((key, lsn, val)))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,29 +2,52 @@
|
||||
//! Helper functions for dealing with filenames of the image and delta layer files.
|
||||
//!
|
||||
use crate::config::PageServerConf;
|
||||
use crate::layered_repository::storage_layer::SegmentTag;
|
||||
use crate::relish::*;
|
||||
use crate::repository::Key;
|
||||
use std::cmp::Ordering;
|
||||
use std::fmt;
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
// Note: LayeredTimeline::load_layer_map() relies on this sort order
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct DeltaFileName {
|
||||
pub seg: SegmentTag,
|
||||
pub start_lsn: Lsn,
|
||||
pub end_lsn: Lsn,
|
||||
pub dropped: bool,
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn_range: Range<Lsn>,
|
||||
}
|
||||
|
||||
impl PartialOrd for DeltaFileName {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for DeltaFileName {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
let mut cmp;
|
||||
|
||||
cmp = self.key_range.start.cmp(&other.key_range.start);
|
||||
if cmp != Ordering::Equal {
|
||||
return cmp;
|
||||
}
|
||||
cmp = self.key_range.end.cmp(&other.key_range.end);
|
||||
if cmp != Ordering::Equal {
|
||||
return cmp;
|
||||
}
|
||||
cmp = self.lsn_range.start.cmp(&other.lsn_range.start);
|
||||
if cmp != Ordering::Equal {
|
||||
return cmp;
|
||||
}
|
||||
cmp = self.lsn_range.end.cmp(&other.lsn_range.end);
|
||||
|
||||
cmp
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents the filename of a DeltaLayer
|
||||
///
|
||||
/// <spcnode>_<dbnode>_<relnode>_<forknum>_<seg>_<start LSN>_<end LSN>
|
||||
///
|
||||
/// or if it was dropped:
|
||||
///
|
||||
/// <spcnode>_<dbnode>_<relnode>_<forknum>_<seg>_<start LSN>_<end LSN>_DROPPED
|
||||
/// <key start>-<key end>__<LSN start>-<LSN end>
|
||||
///
|
||||
impl DeltaFileName {
|
||||
///
|
||||
@@ -32,234 +55,123 @@ impl DeltaFileName {
|
||||
/// match the expected pattern.
|
||||
///
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
let rel;
|
||||
let mut parts;
|
||||
if let Some(rest) = fname.strip_prefix("rel_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Relation(RelTag {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
relnode: parts.next()?.parse::<u32>().ok()?,
|
||||
forknum: parts.next()?.parse::<u8>().ok()?,
|
||||
});
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_xact_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::FileNodeMap {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_twophase_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::TwoPhase {
|
||||
xid: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Checkpoint;
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_control_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::ControlFile;
|
||||
} else {
|
||||
let mut parts = fname.split("__");
|
||||
let mut key_parts = parts.next()?.split('-');
|
||||
let mut lsn_parts = parts.next()?.split('-');
|
||||
|
||||
let key_start_str = key_parts.next()?;
|
||||
let key_end_str = key_parts.next()?;
|
||||
let lsn_start_str = lsn_parts.next()?;
|
||||
let lsn_end_str = lsn_parts.next()?;
|
||||
if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let segno = parts.next()?.parse::<u32>().ok()?;
|
||||
let key_start = Key::from_hex(key_start_str).ok()?;
|
||||
let key_end = Key::from_hex(key_end_str).ok()?;
|
||||
|
||||
let seg = SegmentTag { rel, segno };
|
||||
let start_lsn = Lsn::from_hex(lsn_start_str).ok()?;
|
||||
let end_lsn = Lsn::from_hex(lsn_end_str).ok()?;
|
||||
|
||||
let start_lsn = Lsn::from_hex(parts.next()?).ok()?;
|
||||
let end_lsn = Lsn::from_hex(parts.next()?).ok()?;
|
||||
|
||||
let mut dropped = false;
|
||||
if let Some(suffix) = parts.next() {
|
||||
if suffix == "DROPPED" {
|
||||
dropped = true;
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
if parts.next().is_some() {
|
||||
if start_lsn >= end_lsn {
|
||||
return None;
|
||||
// or panic?
|
||||
}
|
||||
|
||||
if key_start >= key_end {
|
||||
return None;
|
||||
// or panic?
|
||||
}
|
||||
|
||||
Some(DeltaFileName {
|
||||
seg,
|
||||
start_lsn,
|
||||
end_lsn,
|
||||
dropped,
|
||||
key_range: key_start..key_end,
|
||||
lsn_range: start_lsn..end_lsn,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for DeltaFileName {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let basename = match self.seg.rel {
|
||||
RelishTag::Relation(reltag) => format!(
|
||||
"rel_{}_{}_{}_{}",
|
||||
reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
|
||||
),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
} => format!("pg_xact_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno,
|
||||
} => format!("pg_multixact_members_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno,
|
||||
} => format!("pg_multixact_offsets_{:04X}", segno),
|
||||
RelishTag::FileNodeMap { spcnode, dbnode } => {
|
||||
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
|
||||
}
|
||||
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
|
||||
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
|
||||
RelishTag::ControlFile => "pg_control".to_string(),
|
||||
};
|
||||
|
||||
write!(
|
||||
f,
|
||||
"{}_{}_{:016X}_{:016X}{}",
|
||||
basename,
|
||||
self.seg.segno,
|
||||
u64::from(self.start_lsn),
|
||||
u64::from(self.end_lsn),
|
||||
if self.dropped { "_DROPPED" } else { "" }
|
||||
"{}-{}__{:016X}-{:016X}",
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
u64::from(self.lsn_range.start),
|
||||
u64::from(self.lsn_range.end),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct ImageFileName {
|
||||
pub seg: SegmentTag,
|
||||
pub key_range: Range<Key>,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
impl PartialOrd for ImageFileName {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for ImageFileName {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
let mut cmp;
|
||||
|
||||
cmp = self.key_range.start.cmp(&other.key_range.start);
|
||||
if cmp != Ordering::Equal {
|
||||
return cmp;
|
||||
}
|
||||
cmp = self.key_range.end.cmp(&other.key_range.end);
|
||||
if cmp != Ordering::Equal {
|
||||
return cmp;
|
||||
}
|
||||
cmp = self.lsn.cmp(&other.lsn);
|
||||
|
||||
cmp
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Represents the filename of an ImageLayer
|
||||
///
|
||||
/// <spcnode>_<dbnode>_<relnode>_<forknum>_<seg>_<LSN>
|
||||
///
|
||||
/// <key start>-<key end>__<LSN>
|
||||
impl ImageFileName {
|
||||
///
|
||||
/// Parse a string as an image file name. Returns None if the filename does not
|
||||
/// match the expected pattern.
|
||||
///
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
let rel;
|
||||
let mut parts;
|
||||
if let Some(rest) = fname.strip_prefix("rel_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Relation(RelTag {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
relnode: parts.next()?.parse::<u32>().ok()?,
|
||||
forknum: parts.next()?.parse::<u8>().ok()?,
|
||||
});
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_xact_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_multixact_members_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_multixact_offsets_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno: u32::from_str_radix(parts.next()?, 16).ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_filenodemap_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::FileNodeMap {
|
||||
spcnode: parts.next()?.parse::<u32>().ok()?,
|
||||
dbnode: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_twophase_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::TwoPhase {
|
||||
xid: parts.next()?.parse::<u32>().ok()?,
|
||||
};
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_control_checkpoint_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::Checkpoint;
|
||||
} else if let Some(rest) = fname.strip_prefix("pg_control_") {
|
||||
parts = rest.split('_');
|
||||
rel = RelishTag::ControlFile;
|
||||
} else {
|
||||
let mut parts = fname.split("__");
|
||||
let mut key_parts = parts.next()?.split('-');
|
||||
|
||||
let key_start_str = key_parts.next()?;
|
||||
let key_end_str = key_parts.next()?;
|
||||
let lsn_str = parts.next()?;
|
||||
if parts.next().is_some() || key_parts.next().is_some() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let segno = parts.next()?.parse::<u32>().ok()?;
|
||||
let key_start = Key::from_hex(key_start_str).ok()?;
|
||||
let key_end = Key::from_hex(key_end_str).ok()?;
|
||||
|
||||
let seg = SegmentTag { rel, segno };
|
||||
let lsn = Lsn::from_hex(lsn_str).ok()?;
|
||||
|
||||
let lsn = Lsn::from_hex(parts.next()?).ok()?;
|
||||
|
||||
if parts.next().is_some() {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(ImageFileName { seg, lsn })
|
||||
Some(ImageFileName {
|
||||
key_range: key_start..key_end,
|
||||
lsn,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for ImageFileName {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let basename = match self.seg.rel {
|
||||
RelishTag::Relation(reltag) => format!(
|
||||
"rel_{}_{}_{}_{}",
|
||||
reltag.spcnode, reltag.dbnode, reltag.relnode, reltag.forknum
|
||||
),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
} => format!("pg_xact_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno,
|
||||
} => format!("pg_multixact_members_{:04X}", segno),
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno,
|
||||
} => format!("pg_multixact_offsets_{:04X}", segno),
|
||||
RelishTag::FileNodeMap { spcnode, dbnode } => {
|
||||
format!("pg_filenodemap_{}_{}", spcnode, dbnode)
|
||||
}
|
||||
RelishTag::TwoPhase { xid } => format!("pg_twophase_{}", xid),
|
||||
RelishTag::Checkpoint => "pg_control_checkpoint".to_string(),
|
||||
RelishTag::ControlFile => "pg_control".to_string(),
|
||||
};
|
||||
|
||||
write!(
|
||||
f,
|
||||
"{}_{}_{:016X}",
|
||||
basename,
|
||||
self.seg.segno,
|
||||
"{}-{}__{:016X}",
|
||||
self.key_range.start,
|
||||
self.key_range.end,
|
||||
u64::from(self.lsn),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -1,142 +0,0 @@
|
||||
//!
|
||||
//! Global registry of open layers.
|
||||
//!
|
||||
//! Whenever a new in-memory layer is created to hold incoming WAL, it is registered
|
||||
//! in [`GLOBAL_LAYER_MAP`], so that we can keep track of the total number of
|
||||
//! in-memory layers in the system, and know when we need to evict some to release
|
||||
//! memory.
|
||||
//!
|
||||
//! Each layer is assigned a unique ID when it's registered in the global registry.
|
||||
//! The ID can be used to relocate the layer later, without having to hold locks.
|
||||
//!
|
||||
|
||||
use std::sync::atomic::{AtomicU8, Ordering};
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use super::inmemory_layer::InMemoryLayer;
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
const MAX_USAGE_COUNT: u8 = 5;
|
||||
|
||||
lazy_static! {
|
||||
pub static ref GLOBAL_LAYER_MAP: RwLock<InMemoryLayers> =
|
||||
RwLock::new(InMemoryLayers::default());
|
||||
}
|
||||
|
||||
// TODO these types can probably be smaller
|
||||
#[derive(PartialEq, Eq, Clone, Copy)]
|
||||
pub struct LayerId {
|
||||
index: usize,
|
||||
tag: u64, // to avoid ABA problem
|
||||
}
|
||||
|
||||
enum SlotData {
|
||||
Occupied(Arc<InMemoryLayer>),
|
||||
/// Vacant slots form a linked list, the value is the index
|
||||
/// of the next vacant slot in the list.
|
||||
Vacant(Option<usize>),
|
||||
}
|
||||
|
||||
struct Slot {
|
||||
tag: u64,
|
||||
data: SlotData,
|
||||
usage_count: AtomicU8, // for clock algorithm
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct InMemoryLayers {
|
||||
slots: Vec<Slot>,
|
||||
num_occupied: usize,
|
||||
|
||||
// Head of free-slot list.
|
||||
next_empty_slot_idx: Option<usize>,
|
||||
}
|
||||
|
||||
impl InMemoryLayers {
|
||||
pub fn insert(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
|
||||
let slot_idx = match self.next_empty_slot_idx {
|
||||
Some(slot_idx) => slot_idx,
|
||||
None => {
|
||||
let idx = self.slots.len();
|
||||
self.slots.push(Slot {
|
||||
tag: 0,
|
||||
data: SlotData::Vacant(None),
|
||||
usage_count: AtomicU8::new(0),
|
||||
});
|
||||
idx
|
||||
}
|
||||
};
|
||||
let slots_len = self.slots.len();
|
||||
|
||||
let slot = &mut self.slots[slot_idx];
|
||||
|
||||
match slot.data {
|
||||
SlotData::Occupied(_) => {
|
||||
panic!("an occupied slot was in the free list");
|
||||
}
|
||||
SlotData::Vacant(next_empty_slot_idx) => {
|
||||
self.next_empty_slot_idx = next_empty_slot_idx;
|
||||
}
|
||||
}
|
||||
|
||||
slot.data = SlotData::Occupied(layer);
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
self.num_occupied += 1;
|
||||
assert!(self.num_occupied <= slots_len);
|
||||
|
||||
LayerId {
|
||||
index: slot_idx,
|
||||
tag: slot.tag,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, layer_id: &LayerId) -> Option<Arc<InMemoryLayer>> {
|
||||
let slot = self.slots.get(layer_id.index)?; // TODO should out of bounds indexes just panic?
|
||||
if slot.tag != layer_id.tag {
|
||||
return None;
|
||||
}
|
||||
|
||||
if let SlotData::Occupied(layer) = &slot.data {
|
||||
let _ = slot.usage_count.fetch_update(
|
||||
Ordering::Relaxed,
|
||||
Ordering::Relaxed,
|
||||
|old_usage_count| {
|
||||
if old_usage_count < MAX_USAGE_COUNT {
|
||||
Some(old_usage_count + 1)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
},
|
||||
);
|
||||
Some(Arc::clone(layer))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
// TODO this won't be a public API in the future
|
||||
pub fn remove(&mut self, layer_id: &LayerId) {
|
||||
let slot = &mut self.slots[layer_id.index];
|
||||
|
||||
if slot.tag != layer_id.tag {
|
||||
return;
|
||||
}
|
||||
|
||||
match &slot.data {
|
||||
SlotData::Occupied(_layer) => {
|
||||
// TODO evict the layer
|
||||
}
|
||||
SlotData::Vacant(_) => unimplemented!(),
|
||||
}
|
||||
|
||||
slot.data = SlotData::Vacant(self.next_empty_slot_idx);
|
||||
self.next_empty_slot_idx = Some(layer_id.index);
|
||||
|
||||
assert!(self.num_occupied > 0);
|
||||
self.num_occupied -= 1;
|
||||
|
||||
slot.tag = slot.tag.wrapping_add(1);
|
||||
}
|
||||
}
|
||||
@@ -1,55 +1,54 @@
|
||||
//! An ImageLayer represents an image or a snapshot of a segment at one particular LSN.
|
||||
//! It is stored in a file on disk.
|
||||
//! An ImageLayer represents an image or a snapshot of a key-range at
|
||||
//! one particular LSN. It contains an image of all key-value pairs
|
||||
//! in its key-range. Any key that falls into the image layer's range
|
||||
//! but does not exist in the layer, does not exist.
|
||||
//!
|
||||
//! On disk, the image files are stored in timelines/<timelineid> directory.
|
||||
//! Currently, there are no subdirectories, and each image layer file is named like this:
|
||||
//! An image layer is stored in a file on disk. The file is stored in
|
||||
//! timelines/<timelineid> directory. Currently, there are no
|
||||
//! subdirectories, and each image layer file is named like this:
|
||||
//!
|
||||
//! Note that segno is
|
||||
//! <spcnode>_<dbnode>_<relnode>_<forknum>_<segno>_<LSN>
|
||||
//! <key start>-<key end>__<LSN>
|
||||
//!
|
||||
//! For example:
|
||||
//!
|
||||
//! 1663_13990_2609_0_5_000000000169C348
|
||||
//! 000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
|
||||
//!
|
||||
//! An image file is constructed using the 'bookfile' crate.
|
||||
//!
|
||||
//! Only metadata is loaded into memory by the load function.
|
||||
//! When images are needed, they are read directly from disk.
|
||||
//!
|
||||
//! For blocky relishes, the images are stored in BLOCKY_IMAGES_CHAPTER.
|
||||
//! All the images are required to be BLOCK_SIZE, which allows for random access.
|
||||
//!
|
||||
//! For non-blocky relishes, the image can be found in NONBLOCKY_IMAGE_CHAPTER.
|
||||
//!
|
||||
use crate::config::PageServerConf;
|
||||
use crate::layered_repository::filename::{ImageFileName, PathOrConf};
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, SegmentBlk, SegmentTag,
|
||||
BlobRef, Layer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::layered_repository::RELISH_SEG_SIZE;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::IMAGE_FILE_MAGIC;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::convert::TryInto;
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::ops::Range;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{RwLock, RwLockReadGuard};
|
||||
use std::sync::{RwLock, RwLockReadGuard, TryLockError};
|
||||
|
||||
use bookfile::{Book, BookWriter, ChapterWriter};
|
||||
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
// Magic constant to identify a Zenith segment image file
|
||||
pub const IMAGE_FILE_MAGIC: u32 = 0x5A616E01 + 1;
|
||||
/// Mapping from (key, lsn) -> page/WAL record
|
||||
/// byte ranges in VALUES_CHAPTER
|
||||
static INDEX_CHAPTER: u64 = 1;
|
||||
|
||||
/// Contains each block in block # order
|
||||
const BLOCKY_IMAGES_CHAPTER: u64 = 1;
|
||||
const NONBLOCKY_IMAGE_CHAPTER: u64 = 2;
|
||||
const VALUES_CHAPTER: u64 = 2;
|
||||
|
||||
/// Contains the [`Summary`] struct
|
||||
const SUMMARY_CHAPTER: u64 = 3;
|
||||
@@ -58,7 +57,7 @@ const SUMMARY_CHAPTER: u64 = 3;
|
||||
struct Summary {
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
seg: SegmentTag,
|
||||
key_range: Range<Key>,
|
||||
|
||||
lsn: Lsn,
|
||||
}
|
||||
@@ -68,19 +67,17 @@ impl From<&ImageLayer> for Summary {
|
||||
Self {
|
||||
tenantid: layer.tenantid,
|
||||
timelineid: layer.timelineid,
|
||||
seg: layer.seg,
|
||||
key_range: layer.key_range.clone(),
|
||||
|
||||
lsn: layer.lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const BLOCK_SIZE: usize = 8192;
|
||||
|
||||
///
|
||||
/// ImageLayer is the in-memory data structure associated with an on-disk image
|
||||
/// file. We keep an ImageLayer in memory for each file, in the LayerMap. If a
|
||||
/// layer is in "loaded" state, we have a copy of the file in memory, in 'inner'.
|
||||
/// layer is in "loaded" state, we have a copy of the index in memory, in 'inner'.
|
||||
/// Otherwise the struct is just a placeholder for a file that exists on disk,
|
||||
/// and it needs to be loaded before using it in queries.
|
||||
///
|
||||
@@ -88,7 +85,7 @@ pub struct ImageLayer {
|
||||
path_or_conf: PathOrConf,
|
||||
pub tenantid: ZTenantId,
|
||||
pub timelineid: ZTimelineId,
|
||||
pub seg: SegmentTag,
|
||||
pub key_range: Range<Key>,
|
||||
|
||||
// This entry contains an image of all pages as of this LSN
|
||||
pub lsn: Lsn,
|
||||
@@ -96,18 +93,16 @@ pub struct ImageLayer {
|
||||
inner: RwLock<ImageLayerInner>,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
enum ImageType {
|
||||
Blocky { num_blocks: SegmentBlk },
|
||||
NonBlocky,
|
||||
}
|
||||
|
||||
pub struct ImageLayerInner {
|
||||
/// If None, the 'image_type' has not been loaded into memory yet.
|
||||
/// If false, the 'index' has not been loaded into memory yet.
|
||||
loaded: bool,
|
||||
|
||||
/// The underlying (virtual) file handle. None if the layer hasn't been loaded
|
||||
/// yet.
|
||||
book: Option<Book<VirtualFile>>,
|
||||
|
||||
/// Derived from filename and bookfile chapter metadata
|
||||
image_type: ImageType,
|
||||
/// offset of each value
|
||||
index: HashMap<Key, BlobRef>,
|
||||
}
|
||||
|
||||
impl Layer for ImageLayer {
|
||||
@@ -123,98 +118,82 @@ impl Layer for ImageLayer {
|
||||
self.timelineid
|
||||
}
|
||||
|
||||
fn get_seg_tag(&self) -> SegmentTag {
|
||||
self.seg
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
self.key_range.clone()
|
||||
}
|
||||
|
||||
fn is_dropped(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
fn get_start_lsn(&self) -> Lsn {
|
||||
self.lsn
|
||||
}
|
||||
|
||||
fn get_end_lsn(&self) -> Lsn {
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
// End-bound is exclusive
|
||||
self.lsn + 1
|
||||
self.lsn..(self.lsn + 1)
|
||||
}
|
||||
|
||||
/// Look up given page in the file
|
||||
fn get_page_reconstruct_data(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
blknum: SegmentBlk,
|
||||
lsn: Lsn,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> anyhow::Result<PageReconstructResult> {
|
||||
ensure!((0..RELISH_SEG_SIZE).contains(&blknum));
|
||||
ensure!(lsn >= self.lsn);
|
||||
|
||||
match reconstruct_data.page_img {
|
||||
Some((cached_lsn, _)) if self.lsn <= cached_lsn => {
|
||||
return Ok(PageReconstructResult::Complete)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
assert!(self.key_range.contains(&key));
|
||||
assert!(lsn_range.end >= self.lsn);
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
let buf = match &inner.image_type {
|
||||
ImageType::Blocky { num_blocks } => {
|
||||
// Check if the request is beyond EOF
|
||||
if blknum >= *num_blocks {
|
||||
return Ok(PageReconstructResult::Missing(lsn));
|
||||
}
|
||||
if let Some(blob_ref) = inner.index.get(&key) {
|
||||
let chapter = inner
|
||||
.book
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.chapter_reader(VALUES_CHAPTER)?;
|
||||
|
||||
let mut buf = vec![0u8; BLOCK_SIZE];
|
||||
let offset = BLOCK_SIZE as u64 * blknum as u64;
|
||||
|
||||
let chapter = inner
|
||||
.book
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
|
||||
|
||||
chapter.read_exact_at(&mut buf, offset).with_context(|| {
|
||||
let mut blob = vec![0; blob_ref.size()];
|
||||
chapter
|
||||
.read_exact_at(&mut blob, blob_ref.pos())
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"failed to read page from data file {} at offset {}",
|
||||
"failed to read {} bytes from data file {} at offset {}",
|
||||
blob_ref.size(),
|
||||
self.filename().display(),
|
||||
offset
|
||||
blob_ref.pos()
|
||||
)
|
||||
})?;
|
||||
let value = Bytes::from(blob);
|
||||
|
||||
buf
|
||||
}
|
||||
ImageType::NonBlocky => {
|
||||
ensure!(blknum == 0);
|
||||
inner
|
||||
.book
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?
|
||||
.into_vec()
|
||||
}
|
||||
};
|
||||
|
||||
reconstruct_data.page_img = Some((self.lsn, Bytes::from(buf)));
|
||||
Ok(PageReconstructResult::Complete)
|
||||
}
|
||||
|
||||
/// Get size of the segment
|
||||
fn get_seg_size(&self, _lsn: Lsn) -> Result<SegmentBlk> {
|
||||
let inner = self.load()?;
|
||||
match inner.image_type {
|
||||
ImageType::Blocky { num_blocks } => Ok(num_blocks),
|
||||
ImageType::NonBlocky => Err(anyhow!("get_seg_size called for non-blocky segment")),
|
||||
reconstruct_state.img = Some((self.lsn, value));
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
} else {
|
||||
Ok(ValueReconstructResult::Missing)
|
||||
}
|
||||
}
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
fn get_seg_exists(&self, _lsn: Lsn) -> Result<bool> {
|
||||
Ok(true)
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
|
||||
todo!();
|
||||
}
|
||||
|
||||
fn unload(&self) -> Result<()> {
|
||||
// Unload the index.
|
||||
//
|
||||
// TODO: we should access the index directly from pages on the disk,
|
||||
// using the buffer cache. This load/unload mechanism is really ad hoc.
|
||||
|
||||
// FIXME: In debug mode, loading and unloading the index slows
|
||||
// things down so much that you get timeout errors. At least
|
||||
// with the test_parallel_copy test. So as an even more ad hoc
|
||||
// stopgap fix for that, only unload every on average 10
|
||||
// checkpoint cycles.
|
||||
use rand::RngCore;
|
||||
if rand::thread_rng().next_u32() > (u32::MAX / 10) {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut inner = match self.inner.try_write() {
|
||||
Ok(inner) => inner,
|
||||
Err(TryLockError::WouldBlock) => return Ok(()),
|
||||
Err(TryLockError::Poisoned(_)) => panic!("ImageLayer lock was poisoned"),
|
||||
};
|
||||
inner.index = HashMap::default();
|
||||
inner.loaded = false;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -235,22 +214,22 @@ impl Layer for ImageLayer {
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self) -> Result<()> {
|
||||
println!(
|
||||
"----- image layer for ten {} tli {} seg {} at {} ----",
|
||||
self.tenantid, self.timelineid, self.seg, self.lsn
|
||||
"----- image layer for ten {} tli {} key {}-{} at {} ----",
|
||||
self.tenantid, self.timelineid, self.key_range.start, self.key_range.end, self.lsn
|
||||
);
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
match inner.image_type {
|
||||
ImageType::Blocky { num_blocks } => println!("({}) blocks ", num_blocks),
|
||||
ImageType::NonBlocky => {
|
||||
let chapter = inner
|
||||
.book
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?;
|
||||
println!("non-blocky ({} bytes)", chapter.len());
|
||||
}
|
||||
let mut index_vec: Vec<(&Key, &BlobRef)> = inner.index.iter().collect();
|
||||
index_vec.sort_by_key(|x| x.1.pos());
|
||||
|
||||
for (key, blob_ref) in index_vec {
|
||||
println!(
|
||||
"key: {} size {} offset {}",
|
||||
key,
|
||||
blob_ref.size(),
|
||||
blob_ref.pos()
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -280,7 +259,7 @@ impl ImageLayer {
|
||||
loop {
|
||||
// Quick exit if already loaded
|
||||
let inner = self.inner.read().unwrap();
|
||||
if inner.book.is_some() {
|
||||
if inner.loaded {
|
||||
return Ok(inner);
|
||||
}
|
||||
|
||||
@@ -306,14 +285,16 @@ impl ImageLayer {
|
||||
|
||||
fn load_inner(&self, inner: &mut ImageLayerInner) -> Result<()> {
|
||||
let path = self.path();
|
||||
let file = VirtualFile::open(&path)
|
||||
.with_context(|| format!("Failed to open virtual file '{}'", path.display()))?;
|
||||
let book = Book::new(file).with_context(|| {
|
||||
format!(
|
||||
"Failed to open virtual file '{}' as a bookfile",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// Open the file if it's not open already.
|
||||
if inner.book.is_none() {
|
||||
let file = VirtualFile::open(&path)
|
||||
.with_context(|| format!("Failed to open file '{}'", path.display()))?;
|
||||
inner.book = Some(Book::new(file).with_context(|| {
|
||||
format!("Failed to open file '{}' as a bookfile", path.display())
|
||||
})?);
|
||||
}
|
||||
let book = inner.book.as_ref().unwrap();
|
||||
|
||||
match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => {
|
||||
@@ -340,23 +321,13 @@ impl ImageLayer {
|
||||
}
|
||||
}
|
||||
|
||||
let image_type = if self.seg.rel.is_blocky() {
|
||||
let chapter = book.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
|
||||
let images_len = chapter.len();
|
||||
ensure!(images_len % BLOCK_SIZE as u64 == 0);
|
||||
let num_blocks: SegmentBlk = (images_len / BLOCK_SIZE as u64).try_into()?;
|
||||
ImageType::Blocky { num_blocks }
|
||||
} else {
|
||||
let _chapter = book.chapter_reader(NONBLOCKY_IMAGE_CHAPTER)?;
|
||||
ImageType::NonBlocky
|
||||
};
|
||||
let chapter = book.read_chapter(INDEX_CHAPTER)?;
|
||||
let index = HashMap::des(&chapter)?;
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
info!("loaded from {}", &path.display());
|
||||
|
||||
*inner = ImageLayerInner {
|
||||
book: Some(book),
|
||||
image_type,
|
||||
};
|
||||
inner.index = index;
|
||||
inner.loaded = true;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -372,11 +343,12 @@ impl ImageLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg: filename.seg,
|
||||
key_range: filename.key_range.clone(),
|
||||
lsn: filename.lsn,
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
book: None,
|
||||
image_type: ImageType::Blocky { num_blocks: 0 },
|
||||
index: HashMap::new(),
|
||||
loaded: false,
|
||||
}),
|
||||
}
|
||||
}
|
||||
@@ -395,18 +367,19 @@ impl ImageLayer {
|
||||
path_or_conf: PathOrConf::Path(path.to_path_buf()),
|
||||
timelineid: summary.timelineid,
|
||||
tenantid: summary.tenantid,
|
||||
seg: summary.seg,
|
||||
key_range: summary.key_range,
|
||||
lsn: summary.lsn,
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
book: None,
|
||||
image_type: ImageType::Blocky { num_blocks: 0 },
|
||||
index: HashMap::new(),
|
||||
loaded: false,
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
fn layer_name(&self) -> ImageFileName {
|
||||
ImageFileName {
|
||||
seg: self.seg,
|
||||
key_range: self.key_range.clone(),
|
||||
lsn: self.lsn,
|
||||
}
|
||||
}
|
||||
@@ -435,15 +408,18 @@ impl ImageLayer {
|
||||
///
|
||||
pub struct ImageLayerWriter {
|
||||
conf: &'static PageServerConf,
|
||||
path: PathBuf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
seg: SegmentTag,
|
||||
key_range: Range<Key>,
|
||||
lsn: Lsn,
|
||||
|
||||
num_blocks: SegmentBlk,
|
||||
values_writer: Option<ChapterWriter<BufWriter<VirtualFile>>>,
|
||||
end_offset: u64,
|
||||
|
||||
page_image_writer: ChapterWriter<BufWriter<VirtualFile>>,
|
||||
num_blocks_written: SegmentBlk,
|
||||
index: HashMap<Key, BlobRef>,
|
||||
|
||||
finished: bool,
|
||||
}
|
||||
|
||||
impl ImageLayerWriter {
|
||||
@@ -451,9 +427,8 @@ impl ImageLayerWriter {
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
seg: SegmentTag,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
num_blocks: SegmentBlk,
|
||||
) -> anyhow::Result<ImageLayerWriter> {
|
||||
// Create the file
|
||||
//
|
||||
@@ -463,70 +438,75 @@ impl ImageLayerWriter {
|
||||
&PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
&ImageFileName { seg, lsn },
|
||||
&ImageFileName {
|
||||
key_range: key_range.clone(),
|
||||
lsn,
|
||||
},
|
||||
);
|
||||
info!("new image layer {}", path.display());
|
||||
let file = VirtualFile::create(&path)?;
|
||||
let buf_writer = BufWriter::new(file);
|
||||
let book = BookWriter::new(buf_writer, IMAGE_FILE_MAGIC)?;
|
||||
|
||||
// Open the page-images chapter for writing. The calls to
|
||||
// `put_page_image` will use this to write the contents.
|
||||
let chapter = if seg.rel.is_blocky() {
|
||||
book.new_chapter(BLOCKY_IMAGES_CHAPTER)
|
||||
} else {
|
||||
ensure!(num_blocks == 1);
|
||||
book.new_chapter(NONBLOCKY_IMAGE_CHAPTER)
|
||||
};
|
||||
// `put_image` will use this to write the contents.
|
||||
let chapter = book.new_chapter(VALUES_CHAPTER);
|
||||
|
||||
let writer = ImageLayerWriter {
|
||||
conf,
|
||||
path,
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg,
|
||||
key_range: key_range.clone(),
|
||||
lsn,
|
||||
num_blocks,
|
||||
page_image_writer: chapter,
|
||||
num_blocks_written: 0,
|
||||
values_writer: Some(chapter),
|
||||
index: HashMap::new(),
|
||||
end_offset: 0,
|
||||
finished: false,
|
||||
};
|
||||
|
||||
Ok(writer)
|
||||
}
|
||||
|
||||
///
|
||||
/// Write next page image to the file.
|
||||
/// Write next value to the file.
|
||||
///
|
||||
/// The page versions must be appended in blknum order.
|
||||
///
|
||||
pub fn put_page_image(&mut self, block_bytes: &[u8]) -> anyhow::Result<()> {
|
||||
ensure!(self.num_blocks_written < self.num_blocks);
|
||||
if self.seg.rel.is_blocky() {
|
||||
ensure!(block_bytes.len() == BLOCK_SIZE);
|
||||
pub fn put_image(&mut self, key: Key, img: &[u8]) -> Result<()> {
|
||||
ensure!(self.key_range.contains(&key));
|
||||
let off = self.end_offset;
|
||||
|
||||
if let Some(writer) = &mut self.values_writer {
|
||||
let len = img.len();
|
||||
writer.write_all(img)?;
|
||||
self.end_offset += len as u64;
|
||||
|
||||
let old = self.index.insert(key, BlobRef::new(off, len, true));
|
||||
assert!(old.is_none());
|
||||
} else {
|
||||
panic!()
|
||||
}
|
||||
self.page_image_writer.write_all(block_bytes)?;
|
||||
self.num_blocks_written += 1;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finish(self) -> anyhow::Result<ImageLayer> {
|
||||
// Check that the `put_page_image' was called for every block.
|
||||
ensure!(self.num_blocks_written == self.num_blocks);
|
||||
pub fn finish(&mut self) -> anyhow::Result<ImageLayer> {
|
||||
// Close the values chapter
|
||||
let book = self.values_writer.take().unwrap().close()?;
|
||||
|
||||
// Close the page-images chapter
|
||||
let book = self.page_image_writer.close()?;
|
||||
// Write out the index
|
||||
let mut chapter = book.new_chapter(INDEX_CHAPTER);
|
||||
let buf = HashMap::ser(&self.index)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
// Write out the summary chapter
|
||||
let image_type = if self.seg.rel.is_blocky() {
|
||||
ImageType::Blocky {
|
||||
num_blocks: self.num_blocks,
|
||||
}
|
||||
} else {
|
||||
ImageType::NonBlocky
|
||||
};
|
||||
let mut chapter = book.new_chapter(SUMMARY_CHAPTER);
|
||||
let summary = Summary {
|
||||
tenantid: self.tenantid,
|
||||
timelineid: self.timelineid,
|
||||
seg: self.seg,
|
||||
key_range: self.key_range.clone(),
|
||||
lsn: self.lsn,
|
||||
};
|
||||
Summary::ser_into(&summary, &mut chapter)?;
|
||||
@@ -542,15 +522,31 @@ impl ImageLayerWriter {
|
||||
path_or_conf: PathOrConf::Conf(self.conf),
|
||||
timelineid: self.timelineid,
|
||||
tenantid: self.tenantid,
|
||||
seg: self.seg,
|
||||
key_range: self.key_range.clone(),
|
||||
lsn: self.lsn,
|
||||
inner: RwLock::new(ImageLayerInner {
|
||||
book: None,
|
||||
image_type,
|
||||
loaded: false,
|
||||
index: HashMap::new(),
|
||||
}),
|
||||
};
|
||||
trace!("created image layer {}", layer.path().display());
|
||||
|
||||
self.finished = true;
|
||||
|
||||
Ok(layer)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ImageLayerWriter {
|
||||
fn drop(&mut self) {
|
||||
if let Some(page_image_writer) = self.values_writer.take() {
|
||||
if let Ok(book) = page_image_writer.close() {
|
||||
let _ = book.close();
|
||||
}
|
||||
}
|
||||
if !self.finished {
|
||||
let _ = fs::remove_file(&self.path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,30 +1,29 @@
|
||||
//! An in-memory layer stores recently received PageVersions.
|
||||
//! The page versions are held in a BTreeMap. To avoid OOM errors, the map size is limited
|
||||
//! and layers can be spilled to disk into ephemeral files.
|
||||
//! An in-memory layer stores recently received key-value pairs.
|
||||
//!
|
||||
//! And there's another BTreeMap to track the size of the relation.
|
||||
//! The "in-memory" part of the name is a bit misleading: the actual page versions are
|
||||
//! held in an ephemeral file, not in memory. The metadata for each page version, i.e.
|
||||
//! its position in the file, is kept in memory, though.
|
||||
//!
|
||||
use crate::config::PageServerConf;
|
||||
use crate::layered_repository::delta_layer::{DeltaLayer, DeltaLayerWriter};
|
||||
use crate::layered_repository::ephemeral_file::EphemeralFile;
|
||||
use crate::layered_repository::filename::DeltaFileName;
|
||||
use crate::layered_repository::image_layer::{ImageLayer, ImageLayerWriter};
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentBlk, SegmentTag,
|
||||
RELISH_SEG_SIZE,
|
||||
BlobRef, Layer, ValueReconstructResult, ValueReconstructState,
|
||||
};
|
||||
use crate::layered_repository::LayeredTimeline;
|
||||
use crate::layered_repository::ZERO_PAGE;
|
||||
use crate::repository::ZenithWalRecord;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::walrecord;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use std::collections::HashMap;
|
||||
use std::io::Seek;
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
use std::io::Write;
|
||||
use std::ops::Range;
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::sync::RwLock;
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
@@ -33,7 +32,6 @@ pub struct InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
seg: SegmentTag,
|
||||
|
||||
///
|
||||
/// This layer contains all the changes from 'start_lsn'. The
|
||||
@@ -41,27 +39,9 @@ pub struct InMemoryLayer {
|
||||
///
|
||||
start_lsn: Lsn,
|
||||
|
||||
///
|
||||
/// LSN of the oldest page version stored in this layer.
|
||||
///
|
||||
/// This is different from 'start_lsn' in that we enforce that the 'start_lsn'
|
||||
/// of a layer always matches the 'end_lsn' of its predecessor, even if there
|
||||
/// are no page versions until at a later LSN. That way you can detect any
|
||||
/// missing layer files more easily. 'oldest_lsn' is the first page version
|
||||
/// actually stored in this layer. In the range between 'start_lsn' and
|
||||
/// 'oldest_lsn', there are no changes to the segment.
|
||||
/// 'oldest_lsn' is used to adjust 'disk_consistent_lsn' and that is why it should
|
||||
/// point to the beginning of WAL record. This is the other difference with 'start_lsn'
|
||||
/// which points to end of WAL record. This is why 'oldest_lsn' can be smaller than 'start_lsn'.
|
||||
///
|
||||
oldest_lsn: Lsn,
|
||||
|
||||
/// The above fields never change. The parts that do change are in 'inner',
|
||||
/// and protected by mutex.
|
||||
inner: RwLock<InMemoryLayerInner>,
|
||||
|
||||
/// Predecessor layer might be needed?
|
||||
incremental: bool,
|
||||
}
|
||||
|
||||
pub struct InMemoryLayerInner {
|
||||
@@ -69,98 +49,25 @@ pub struct InMemoryLayerInner {
|
||||
/// Writes are only allowed when this is None
|
||||
end_lsn: Option<Lsn>,
|
||||
|
||||
/// If this relation was dropped, remember when that happened.
|
||||
/// The drop LSN is recorded in [`end_lsn`].
|
||||
dropped: bool,
|
||||
///
|
||||
/// All versions of all pages in the layer are kept here. Indexed
|
||||
/// by block number and LSN. The value is an offset into the
|
||||
/// ephemeral file where the page version is stored.
|
||||
///
|
||||
index: HashMap<Key, VecMap<Lsn, BlobRef>>,
|
||||
|
||||
/// The PageVersion structs are stored in a serialized format in this file.
|
||||
/// Each serialized PageVersion is preceded by a 'u32' length field.
|
||||
/// 'page_versions' map stores offsets into this file.
|
||||
/// The values are stored in a serialized format in this file.
|
||||
/// Each serialized Value is preceded by a 'u32' length field.
|
||||
/// PerSeg::page_versions map stores offsets into this file.
|
||||
file: EphemeralFile,
|
||||
|
||||
/// Metadata about all versions of all pages in the layer is kept
|
||||
/// here. Indexed by block number and LSN. The value is an offset
|
||||
/// into the ephemeral file where the page version is stored.
|
||||
page_versions: HashMap<SegmentBlk, VecMap<Lsn, u64>>,
|
||||
|
||||
///
|
||||
/// `seg_sizes` tracks the size of the segment at different points in time.
|
||||
///
|
||||
/// For a blocky rel, there is always one entry, at the layer's start_lsn,
|
||||
/// so that determining the size never depends on the predecessor layer. For
|
||||
/// a non-blocky rel, 'seg_sizes' is not used and is always empty.
|
||||
///
|
||||
seg_sizes: VecMap<Lsn, SegmentBlk>,
|
||||
|
||||
///
|
||||
/// LSN of the newest page version stored in this layer.
|
||||
///
|
||||
/// The difference between 'end_lsn' and 'latest_lsn' is the same as between
|
||||
/// 'start_lsn' and 'oldest_lsn'. See comments in 'oldest_lsn'.
|
||||
///
|
||||
latest_lsn: Lsn,
|
||||
end_offset: u64,
|
||||
}
|
||||
|
||||
impl InMemoryLayerInner {
|
||||
fn assert_writeable(&self) {
|
||||
assert!(self.end_lsn.is_none());
|
||||
}
|
||||
|
||||
fn get_seg_size(&self, lsn: Lsn) -> SegmentBlk {
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let slice = self.seg_sizes.slice_range(..=lsn);
|
||||
|
||||
// We make sure there is always at least one entry
|
||||
if let Some((_entry_lsn, entry)) = slice.last() {
|
||||
*entry
|
||||
} else {
|
||||
panic!("could not find seg size in in-memory layer");
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Read a page version from the ephemeral file.
|
||||
///
|
||||
fn read_pv(&self, off: u64) -> Result<PageVersion> {
|
||||
let mut buf = Vec::new();
|
||||
self.read_pv_bytes(off, &mut buf)?;
|
||||
Ok(PageVersion::des(&buf)?)
|
||||
}
|
||||
|
||||
///
|
||||
/// Read a page version from the ephemeral file, as raw bytes, at
|
||||
/// the given offset. The bytes are read into 'buf', which is
|
||||
/// expanded if necessary. Returns the size of the page version.
|
||||
///
|
||||
fn read_pv_bytes(&self, off: u64, buf: &mut Vec<u8>) -> Result<usize> {
|
||||
// read length
|
||||
let mut lenbuf = [0u8; 4];
|
||||
self.file.read_exact_at(&mut lenbuf, off)?;
|
||||
let len = u32::from_ne_bytes(lenbuf) as usize;
|
||||
|
||||
if buf.len() < len {
|
||||
buf.resize(len, 0);
|
||||
}
|
||||
self.file.read_exact_at(&mut buf[0..len], off + 4)?;
|
||||
Ok(len)
|
||||
}
|
||||
|
||||
fn write_pv(&mut self, pv: &PageVersion) -> Result<u64> {
|
||||
// remember starting position
|
||||
let pos = self.file.stream_position()?;
|
||||
|
||||
// make room for the 'length' field by writing zeros as a placeholder.
|
||||
self.file.seek(std::io::SeekFrom::Start(pos + 4))?;
|
||||
|
||||
pv.ser_into(&mut self.file)?;
|
||||
|
||||
// write the 'length' field.
|
||||
let len = self.file.stream_position()? - pos - 4;
|
||||
let lenbuf = u32::to_ne_bytes(len as u32);
|
||||
self.file.write_all_at(&lenbuf, pos)?;
|
||||
|
||||
Ok(pos)
|
||||
}
|
||||
}
|
||||
|
||||
impl Layer for InMemoryLayer {
|
||||
@@ -170,21 +77,12 @@ impl Layer for InMemoryLayer {
|
||||
fn filename(&self) -> PathBuf {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_lsn = if let Some(drop_lsn) = inner.end_lsn {
|
||||
drop_lsn
|
||||
} else {
|
||||
Lsn(u64::MAX)
|
||||
};
|
||||
let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
|
||||
|
||||
let delta_filename = DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn,
|
||||
dropped: inner.dropped,
|
||||
}
|
||||
.to_string();
|
||||
|
||||
PathBuf::from(format!("inmem-{}", delta_filename))
|
||||
PathBuf::from(format!(
|
||||
"inmem-{:016X}-{:016X}",
|
||||
self.start_lsn.0, end_lsn.0
|
||||
))
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
@@ -195,132 +93,78 @@ impl Layer for InMemoryLayer {
|
||||
self.timelineid
|
||||
}
|
||||
|
||||
fn get_seg_tag(&self) -> SegmentTag {
|
||||
self.seg
|
||||
fn get_key_range(&self) -> Range<Key> {
|
||||
Key::MIN..Key::MAX
|
||||
}
|
||||
|
||||
fn get_start_lsn(&self) -> Lsn {
|
||||
self.start_lsn
|
||||
}
|
||||
|
||||
fn get_end_lsn(&self) -> Lsn {
|
||||
fn get_lsn_range(&self) -> Range<Lsn> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
if let Some(end_lsn) = inner.end_lsn {
|
||||
let end_lsn = if let Some(end_lsn) = inner.end_lsn {
|
||||
end_lsn
|
||||
} else {
|
||||
Lsn(u64::MAX)
|
||||
}
|
||||
};
|
||||
self.start_lsn..end_lsn
|
||||
}
|
||||
|
||||
fn is_dropped(&self) -> bool {
|
||||
let inner = self.inner.read().unwrap();
|
||||
inner.dropped
|
||||
}
|
||||
|
||||
/// Look up given page in the cache.
|
||||
fn get_page_reconstruct_data(
|
||||
/// Look up given value in the layer.
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
blknum: SegmentBlk,
|
||||
lsn: Lsn,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> anyhow::Result<PageReconstructResult> {
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start <= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
ensure!((0..RELISH_SEG_SIZE).contains(&blknum));
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
{
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
if let Some(vec_map) = inner.page_versions.get(&blknum) {
|
||||
let slice = vec_map.slice_range(..=lsn);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
match &reconstruct_data.page_img {
|
||||
Some((cached_lsn, _)) if entry_lsn <= cached_lsn => {
|
||||
return Ok(PageReconstructResult::Complete)
|
||||
}
|
||||
_ => {}
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
for (entry_lsn, blob_ref) in slice.iter().rev() {
|
||||
match &reconstruct_state.img {
|
||||
Some((cached_lsn, _)) if entry_lsn <= cached_lsn => {
|
||||
return Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let pv = inner.read_pv(*pos)?;
|
||||
match pv {
|
||||
PageVersion::Page(img) => {
|
||||
reconstruct_data.page_img = Some((*entry_lsn, img));
|
||||
let mut buf = vec![0u8; blob_ref.size()];
|
||||
inner.file.read_exact_at(&mut buf, blob_ref.pos())?;
|
||||
let value = Value::des(&buf)?;
|
||||
match value {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((*entry_lsn, img));
|
||||
return Ok(ValueReconstructResult::Complete);
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let will_init = rec.will_init();
|
||||
reconstruct_state.records.push((*entry_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
PageVersion::Wal(rec) => {
|
||||
reconstruct_data.records.push((*entry_lsn, rec.clone()));
|
||||
if rec.will_init() {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we didn't find any records for this, check if the request is beyond EOF
|
||||
if need_image
|
||||
&& reconstruct_data.records.is_empty()
|
||||
&& self.seg.rel.is_blocky()
|
||||
&& blknum >= self.get_seg_size(lsn)?
|
||||
{
|
||||
return Ok(PageReconstructResult::Missing(self.start_lsn));
|
||||
}
|
||||
|
||||
// release lock on 'inner'
|
||||
}
|
||||
|
||||
// release lock on 'inner'
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know
|
||||
// caller know.
|
||||
if need_image {
|
||||
if self.incremental {
|
||||
Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
|
||||
} else {
|
||||
Ok(PageReconstructResult::Missing(self.start_lsn))
|
||||
}
|
||||
Ok(ValueReconstructResult::Continue)
|
||||
} else {
|
||||
Ok(PageReconstructResult::Complete)
|
||||
Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
}
|
||||
|
||||
/// Get size of the relation at given LSN
|
||||
fn get_seg_size(&self, lsn: Lsn) -> anyhow::Result<SegmentBlk> {
|
||||
ensure!(lsn >= self.start_lsn);
|
||||
ensure!(
|
||||
self.seg.rel.is_blocky(),
|
||||
"get_seg_size() called on a non-blocky rel"
|
||||
);
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
Ok(inner.get_seg_size(lsn))
|
||||
}
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
fn get_seg_exists(&self, lsn: Lsn) -> anyhow::Result<bool> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
// If the segment created after requested LSN,
|
||||
// it doesn't exist in the layer. But we shouldn't
|
||||
// have requested it in the first place.
|
||||
ensure!(lsn >= self.start_lsn);
|
||||
|
||||
// Is the requested LSN after the segment was dropped?
|
||||
if inner.dropped {
|
||||
if let Some(end_lsn) = inner.end_lsn {
|
||||
if lsn >= end_lsn {
|
||||
return Ok(false);
|
||||
}
|
||||
} else {
|
||||
bail!("dropped in-memory layer with no end LSN");
|
||||
}
|
||||
}
|
||||
|
||||
// Otherwise, it exists
|
||||
Ok(true)
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>>> {
|
||||
todo!();
|
||||
}
|
||||
|
||||
/// Cannot unload anything in an in-memory layer, since there's no backing
|
||||
@@ -337,7 +181,8 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.incremental
|
||||
// in-memory layer is always considered incremental.
|
||||
true
|
||||
}
|
||||
|
||||
fn is_in_memory(&self) -> bool {
|
||||
@@ -355,29 +200,36 @@ impl Layer for InMemoryLayer {
|
||||
.unwrap_or_default();
|
||||
|
||||
println!(
|
||||
"----- in-memory layer for tli {} seg {} {}-{} {} ----",
|
||||
self.timelineid, self.seg, self.start_lsn, end_str, inner.dropped,
|
||||
"----- in-memory layer for tli {} LSNs {}-{} ----",
|
||||
self.timelineid, self.start_lsn, end_str,
|
||||
);
|
||||
|
||||
for (k, v) in inner.seg_sizes.as_slice() {
|
||||
println!("seg_sizes {}: {}", k, v);
|
||||
}
|
||||
|
||||
// List the blocks in order
|
||||
let mut page_versions: Vec<(&SegmentBlk, &VecMap<Lsn, u64>)> =
|
||||
inner.page_versions.iter().collect();
|
||||
page_versions.sort_by_key(|k| k.0);
|
||||
|
||||
for (blknum, versions) in page_versions {
|
||||
for (lsn, off) in versions.as_slice() {
|
||||
let pv = inner.read_pv(*off);
|
||||
let pv_description = match pv {
|
||||
Ok(PageVersion::Page(_img)) => "page",
|
||||
Ok(PageVersion::Wal(_rec)) => "wal",
|
||||
Err(_err) => "INVALID",
|
||||
};
|
||||
|
||||
println!("blk {} at {}: {}\n", blknum, lsn, pv_description);
|
||||
let mut buf = Vec::new();
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
for (lsn, blob_ref) in vec_map.as_slice() {
|
||||
let mut desc = String::new();
|
||||
buf.resize(blob_ref.size(), 0);
|
||||
inner.file.read_exact_at(&mut buf, blob_ref.pos())?;
|
||||
let val = Value::des(&buf);
|
||||
match val {
|
||||
Ok(Value::Image(img)) => {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
}
|
||||
Ok(Value::WalRecord(rec)) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)?;
|
||||
}
|
||||
Err(err) => {
|
||||
write!(&mut desc, " DESERIALIZATION ERROR: {}", err)?;
|
||||
}
|
||||
}
|
||||
println!(" key {} at {}: {}", key, lsn, desc);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -385,23 +237,7 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// A result of an inmemory layer data being written to disk.
|
||||
pub struct LayersOnDisk {
|
||||
pub delta_layers: Vec<DeltaLayer>,
|
||||
pub image_layers: Vec<ImageLayer>,
|
||||
}
|
||||
|
||||
impl InMemoryLayer {
|
||||
/// Return the oldest page version that's stored in this layer
|
||||
pub fn get_oldest_lsn(&self) -> Lsn {
|
||||
self.oldest_lsn
|
||||
}
|
||||
|
||||
pub fn get_latest_lsn(&self) -> Lsn {
|
||||
let inner = self.inner.read().unwrap();
|
||||
inner.latest_lsn
|
||||
}
|
||||
|
||||
///
|
||||
/// Create a new, empty, in-memory layer
|
||||
///
|
||||
@@ -409,291 +245,83 @@ impl InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
seg: SegmentTag,
|
||||
start_lsn: Lsn,
|
||||
oldest_lsn: Lsn,
|
||||
) -> Result<InMemoryLayer> {
|
||||
trace!(
|
||||
"initializing new empty InMemoryLayer for writing {} on timeline {} at {}",
|
||||
seg,
|
||||
"initializing new empty InMemoryLayer for writing on timeline {} at {}",
|
||||
timelineid,
|
||||
start_lsn
|
||||
);
|
||||
|
||||
// The segment is initially empty, so initialize 'seg_sizes' with 0.
|
||||
let mut seg_sizes = VecMap::default();
|
||||
if seg.rel.is_blocky() {
|
||||
seg_sizes.append(start_lsn, 0).unwrap();
|
||||
}
|
||||
|
||||
let file = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
conf,
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg,
|
||||
start_lsn,
|
||||
oldest_lsn,
|
||||
incremental: false,
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
end_lsn: None,
|
||||
dropped: false,
|
||||
index: HashMap::new(),
|
||||
file,
|
||||
page_versions: HashMap::new(),
|
||||
seg_sizes,
|
||||
latest_lsn: oldest_lsn,
|
||||
end_offset: 0,
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
// Write operations
|
||||
|
||||
/// Remember new page version, as a WAL record over previous version
|
||||
pub fn put_wal_record(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
blknum: SegmentBlk,
|
||||
rec: ZenithWalRecord,
|
||||
) -> Result<u32> {
|
||||
self.put_page_version(blknum, lsn, PageVersion::Wal(rec))
|
||||
}
|
||||
|
||||
/// Remember new page version, as a full page image
|
||||
pub fn put_page_image(&self, blknum: SegmentBlk, lsn: Lsn, img: Bytes) -> Result<u32> {
|
||||
self.put_page_version(blknum, lsn, PageVersion::Page(img))
|
||||
}
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub fn put_page_version(
|
||||
&self,
|
||||
blknum: SegmentBlk,
|
||||
lsn: Lsn,
|
||||
pv: PageVersion,
|
||||
) -> anyhow::Result<u32> {
|
||||
ensure!((0..RELISH_SEG_SIZE).contains(&blknum));
|
||||
|
||||
trace!(
|
||||
"put_page_version blk {} of {} at {}/{}",
|
||||
blknum,
|
||||
self.seg.rel,
|
||||
self.timelineid,
|
||||
lsn
|
||||
);
|
||||
pub fn put_value(&self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
|
||||
trace!("put_value key {} at {}/{}", key, self.timelineid, lsn);
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
inner.assert_writeable();
|
||||
ensure!(lsn >= inner.latest_lsn);
|
||||
inner.latest_lsn = lsn;
|
||||
|
||||
// Write the page version to the file, and remember its offset in 'page_versions'
|
||||
{
|
||||
let off = inner.write_pv(&pv)?;
|
||||
let vec_map = inner.page_versions.entry(blknum).or_default();
|
||||
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
warn!(
|
||||
"Page version of rel {} blk {} at {} already exists",
|
||||
self.seg.rel, blknum, lsn
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Also update the relation size, if this extended the relation.
|
||||
if self.seg.rel.is_blocky() {
|
||||
let newsize = blknum + 1;
|
||||
|
||||
// use inner get_seg_size, since calling self.get_seg_size will try to acquire the lock,
|
||||
// which we've just acquired above
|
||||
let oldsize = inner.get_seg_size(lsn);
|
||||
if newsize > oldsize {
|
||||
trace!(
|
||||
"enlarging segment {} from {} to {} blocks at {}",
|
||||
self.seg,
|
||||
oldsize,
|
||||
newsize,
|
||||
lsn
|
||||
);
|
||||
|
||||
// If we are extending the relation by more than one page, initialize the "gap"
|
||||
// with zeros
|
||||
//
|
||||
// XXX: What if the caller initializes the gap with subsequent call with same LSN?
|
||||
// I don't think that can happen currently, but that is highly dependent on how
|
||||
// PostgreSQL writes its WAL records and there's no guarantee of it. If it does
|
||||
// happen, we would hit the "page version already exists" warning above on the
|
||||
// subsequent call to initialize the gap page.
|
||||
for gapblknum in oldsize..blknum {
|
||||
let zeropv = PageVersion::Page(ZERO_PAGE.clone());
|
||||
trace!(
|
||||
"filling gap blk {} with zeros for write of {}",
|
||||
gapblknum,
|
||||
blknum
|
||||
);
|
||||
|
||||
// Write the page version to the file, and remember its offset in
|
||||
// 'page_versions'
|
||||
{
|
||||
let off = inner.write_pv(&zeropv)?;
|
||||
let vec_map = inner.page_versions.entry(gapblknum).or_default();
|
||||
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
|
||||
if old.is_some() {
|
||||
warn!(
|
||||
"Page version of seg {} blk {} at {} already exists",
|
||||
self.seg, gapblknum, lsn
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inner.seg_sizes.append_or_update_last(lsn, newsize).unwrap();
|
||||
return Ok(newsize - oldsize);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(0)
|
||||
}
|
||||
|
||||
/// Remember that the relation was truncated at given LSN
|
||||
pub fn put_truncation(&self, lsn: Lsn, new_size: SegmentBlk) {
|
||||
assert!(
|
||||
self.seg.rel.is_blocky(),
|
||||
"put_truncation() called on a non-blocky rel"
|
||||
);
|
||||
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
inner.assert_writeable();
|
||||
|
||||
// check that this we truncate to a smaller size than segment was before the truncation
|
||||
let old_size = inner.get_seg_size(lsn);
|
||||
assert!(new_size < old_size);
|
||||
|
||||
let (old, _delta_size) = inner
|
||||
.seg_sizes
|
||||
.append_or_update_last(lsn, new_size)
|
||||
.unwrap();
|
||||
let off = inner.end_offset;
|
||||
let buf = Value::ser(&val)?;
|
||||
let len = buf.len();
|
||||
inner.file.write_all(&buf)?;
|
||||
inner.end_offset += len as u64;
|
||||
|
||||
let vec_map = inner.index.entry(key).or_default();
|
||||
let blob_ref = BlobRef::new(off, len, val.will_init());
|
||||
let old = vec_map.append_or_update_last(lsn, blob_ref).unwrap().0;
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
warn!("Inserting truncation, but had an entry for the LSN already");
|
||||
}
|
||||
}
|
||||
|
||||
/// Remember that the segment was dropped at given LSN
|
||||
pub fn drop_segment(&self, lsn: Lsn) {
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
assert!(inner.end_lsn.is_none());
|
||||
assert!(!inner.dropped);
|
||||
inner.dropped = true;
|
||||
assert!(self.start_lsn < lsn);
|
||||
inner.end_lsn = Some(lsn);
|
||||
|
||||
trace!("dropped segment {} at {}", self.seg, lsn);
|
||||
}
|
||||
|
||||
///
|
||||
/// Initialize a new InMemoryLayer for, by copying the state at the given
|
||||
/// point in time from given existing layer.
|
||||
///
|
||||
pub fn create_successor_layer(
|
||||
conf: &'static PageServerConf,
|
||||
src: Arc<dyn Layer>,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
start_lsn: Lsn,
|
||||
oldest_lsn: Lsn,
|
||||
) -> Result<InMemoryLayer> {
|
||||
let seg = src.get_seg_tag();
|
||||
|
||||
assert!(oldest_lsn.is_aligned());
|
||||
|
||||
trace!(
|
||||
"initializing new InMemoryLayer for writing {} on timeline {} at {}",
|
||||
seg,
|
||||
timelineid,
|
||||
start_lsn,
|
||||
);
|
||||
|
||||
// Copy the segment size at the start LSN from the predecessor layer.
|
||||
let mut seg_sizes = VecMap::default();
|
||||
if seg.rel.is_blocky() {
|
||||
let size = src.get_seg_size(start_lsn)?;
|
||||
seg_sizes.append(start_lsn, size).unwrap();
|
||||
warn!("Key {} at {} already exists", key, lsn);
|
||||
}
|
||||
|
||||
let file = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
conf,
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg,
|
||||
start_lsn,
|
||||
oldest_lsn,
|
||||
incremental: true,
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
end_lsn: None,
|
||||
dropped: false,
|
||||
file,
|
||||
page_versions: HashMap::new(),
|
||||
seg_sizes,
|
||||
latest_lsn: oldest_lsn,
|
||||
}),
|
||||
})
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn is_writeable(&self) -> bool {
|
||||
let inner = self.inner.read().unwrap();
|
||||
inner.end_lsn.is_none()
|
||||
pub fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
|
||||
// TODO: Currently, we just leak the storage for any deleted keys
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Make the layer non-writeable. Only call once.
|
||||
/// Records the end_lsn for non-dropped layers.
|
||||
/// `end_lsn` is inclusive
|
||||
/// `end_lsn` is exclusive
|
||||
pub fn freeze(&self, end_lsn: Lsn) {
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
if inner.end_lsn.is_some() {
|
||||
assert!(inner.dropped);
|
||||
} else {
|
||||
assert!(!inner.dropped);
|
||||
assert!(self.start_lsn < end_lsn + 1);
|
||||
inner.end_lsn = Some(Lsn(end_lsn.0 + 1));
|
||||
assert!(self.start_lsn < end_lsn);
|
||||
inner.end_lsn = Some(end_lsn);
|
||||
|
||||
if let Some((lsn, _)) = inner.seg_sizes.as_slice().last() {
|
||||
assert!(lsn <= &end_lsn, "{:?} {:?}", lsn, end_lsn);
|
||||
}
|
||||
|
||||
for (_blk, vec_map) in inner.page_versions.iter() {
|
||||
for (lsn, _pos) in vec_map.as_slice() {
|
||||
assert!(*lsn <= end_lsn);
|
||||
}
|
||||
for vec_map in inner.index.values() {
|
||||
for (lsn, _pos) in vec_map.as_slice() {
|
||||
assert!(*lsn < end_lsn);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Write the this frozen in-memory layer to disk.
|
||||
/// Write this frozen in-memory layer to disk.
|
||||
///
|
||||
/// Returns new layers that replace this one.
|
||||
/// If not dropped and reconstruct_pages is true, returns a new image layer containing the page versions
|
||||
/// at the `end_lsn`. Can also return a DeltaLayer that includes all the
|
||||
/// WAL records between start and end LSN. (The delta layer is not needed
|
||||
/// when a new relish is created with a single LSN, so that the start and
|
||||
/// end LSN are the same.)
|
||||
pub fn write_to_disk(
|
||||
&self,
|
||||
timeline: &LayeredTimeline,
|
||||
reconstruct_pages: bool,
|
||||
) -> Result<LayersOnDisk> {
|
||||
trace!(
|
||||
"write_to_disk {} get_end_lsn is {}",
|
||||
self.filename().display(),
|
||||
self.get_end_lsn()
|
||||
);
|
||||
|
||||
/// Returns a new delta layer with all the same data as this in-memory layer
|
||||
pub fn write_to_disk(&self) -> Result<DeltaLayer> {
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
// layer is not writeable anymore, no one should be trying to acquire the
|
||||
// write lock on it, so we shouldn't block anyone. There's one exception
|
||||
@@ -705,105 +333,32 @@ impl InMemoryLayer {
|
||||
// rare though, so we just accept the potential latency hit for now.
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
// Since `end_lsn` is exclusive, subtract 1 to calculate the last LSN
|
||||
// that is included.
|
||||
let end_lsn_exclusive = inner.end_lsn.unwrap();
|
||||
let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1);
|
||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
Key::MIN,
|
||||
self.start_lsn..inner.end_lsn.unwrap(),
|
||||
)?;
|
||||
|
||||
// Figure out if we should create a delta layer, image layer, or both.
|
||||
let image_lsn: Option<Lsn>;
|
||||
let delta_end_lsn: Option<Lsn>;
|
||||
if self.is_dropped() || !reconstruct_pages {
|
||||
// The segment was dropped. Create just a delta layer containing all the
|
||||
// changes up to and including the drop.
|
||||
delta_end_lsn = Some(end_lsn_exclusive);
|
||||
image_lsn = None;
|
||||
} else if self.start_lsn == end_lsn_inclusive {
|
||||
// The layer contains exactly one LSN. It's enough to write an image
|
||||
// layer at that LSN.
|
||||
delta_end_lsn = None;
|
||||
image_lsn = Some(end_lsn_inclusive);
|
||||
} else {
|
||||
// Create a delta layer with all the changes up to the end LSN,
|
||||
// and an image layer at the end LSN.
|
||||
//
|
||||
// Note that we the delta layer does *not* include the page versions
|
||||
// at the end LSN. They are included in the image layer, and there's
|
||||
// no need to store them twice.
|
||||
delta_end_lsn = Some(end_lsn_inclusive);
|
||||
image_lsn = Some(end_lsn_inclusive);
|
||||
}
|
||||
|
||||
let mut delta_layers = Vec::new();
|
||||
let mut image_layers = Vec::new();
|
||||
|
||||
if let Some(delta_end_lsn) = delta_end_lsn {
|
||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
delta_end_lsn,
|
||||
self.is_dropped(),
|
||||
)?;
|
||||
|
||||
// Write all page versions, in block + LSN order
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
|
||||
let pv_iter = inner.page_versions.iter();
|
||||
let mut pages: Vec<(&SegmentBlk, &VecMap<Lsn, u64>)> = pv_iter.collect();
|
||||
pages.sort_by_key(|(blknum, _vec_map)| *blknum);
|
||||
for (blknum, vec_map) in pages {
|
||||
for (lsn, pos) in vec_map.as_slice() {
|
||||
if *lsn < delta_end_lsn {
|
||||
let len = inner.read_pv_bytes(*pos, &mut buf)?;
|
||||
delta_layer_writer.put_page_version(*blknum, *lsn, &buf[..len])?;
|
||||
}
|
||||
let mut do_steps = || -> Result<()> {
|
||||
for (key, vec_map) in inner.index.iter() {
|
||||
// Write all page versions
|
||||
for (lsn, blob_ref) in vec_map.as_slice() {
|
||||
let mut buf = vec![0u8; blob_ref.size()];
|
||||
inner.file.read_exact_at(&mut buf, blob_ref.pos())?;
|
||||
let val = Value::des(&buf)?;
|
||||
delta_layer_writer.put_value(*key, *lsn, val)?;
|
||||
}
|
||||
}
|
||||
|
||||
// Create seg_sizes
|
||||
let seg_sizes = if delta_end_lsn == end_lsn_exclusive {
|
||||
inner.seg_sizes.clone()
|
||||
} else {
|
||||
inner.seg_sizes.split_at(&end_lsn_exclusive).0
|
||||
};
|
||||
|
||||
let delta_layer = delta_layer_writer.finish(seg_sizes)?;
|
||||
delta_layers.push(delta_layer);
|
||||
Ok(())
|
||||
};
|
||||
if let Err(err) = do_steps() {
|
||||
delta_layer_writer.abort();
|
||||
return Err(err);
|
||||
}
|
||||
|
||||
drop(inner);
|
||||
|
||||
// Write a new base image layer at the cutoff point
|
||||
if let Some(image_lsn) = image_lsn {
|
||||
let size = if self.seg.rel.is_blocky() {
|
||||
self.get_seg_size(image_lsn)?
|
||||
} else {
|
||||
1
|
||||
};
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
self.seg,
|
||||
image_lsn,
|
||||
size,
|
||||
)?;
|
||||
|
||||
for blknum in 0..size {
|
||||
let img = timeline.materialize_page(self.seg, blknum, image_lsn, &*self)?;
|
||||
|
||||
image_layer_writer.put_page_image(&img)?;
|
||||
}
|
||||
let image_layer = image_layer_writer.finish()?;
|
||||
image_layers.push(image_layer);
|
||||
}
|
||||
|
||||
Ok(LayersOnDisk {
|
||||
delta_layers,
|
||||
image_layers,
|
||||
})
|
||||
let delta_layer = delta_layer_writer.finish(Key::MAX)?;
|
||||
Ok(delta_layer)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,468 +0,0 @@
|
||||
///
|
||||
/// IntervalTree is data structure for holding intervals. It is generic
|
||||
/// to make unit testing possible, but the only real user of it is the layer map,
|
||||
///
|
||||
/// It's inspired by the "segment tree" or a "statistic tree" as described in
|
||||
/// https://en.wikipedia.org/wiki/Segment_tree. However, we use a B-tree to hold
|
||||
/// the points instead of a binary tree. This is called an "interval tree" instead
|
||||
/// of "segment tree" because the term "segment" is already using Zenith to mean
|
||||
/// something else. To add to the confusion, there is another data structure known
|
||||
/// as "interval tree" out there (see https://en.wikipedia.org/wiki/Interval_tree),
|
||||
/// for storing intervals, but this isn't that.
|
||||
///
|
||||
/// The basic idea is to have a B-tree of "interesting Points". At each Point,
|
||||
/// there is a list of intervals that contain the point. The Points are formed
|
||||
/// from the start bounds of each interval; there is a Point for each distinct
|
||||
/// start bound.
|
||||
///
|
||||
/// Operations:
|
||||
///
|
||||
/// To find intervals that contain a given point, you search the b-tree to find
|
||||
/// the nearest Point <= search key. Then you just return the list of intervals.
|
||||
///
|
||||
/// To insert an interval, find the Point with start key equal to the inserted item.
|
||||
/// If the Point doesn't exist yet, create it, by copying all the items from the
|
||||
/// previous Point that cover the new Point. Then walk right, inserting the new
|
||||
/// interval to all the Points that are contained by the new interval (including the
|
||||
/// newly created Point).
|
||||
///
|
||||
/// To remove an interval, you scan the tree for all the Points that are contained by
|
||||
/// the removed interval, and remove it from the list in each Point.
|
||||
///
|
||||
/// Requirements and assumptions:
|
||||
///
|
||||
/// - Can store overlapping items
|
||||
/// - But there are not many overlapping items
|
||||
/// - The interval bounds don't change after it is added to the tree
|
||||
/// - Intervals are uniquely identified by pointer equality. You must not be insert the
|
||||
/// same interval object twice, and `remove` uses pointer equality to remove the right
|
||||
/// interval. It is OK to have two intervals with the same bounds, however.
|
||||
///
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt::Debug;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub struct IntervalTree<I: ?Sized>
|
||||
where
|
||||
I: IntervalItem,
|
||||
{
|
||||
points: BTreeMap<I::Key, Point<I>>,
|
||||
}
|
||||
|
||||
struct Point<I: ?Sized> {
|
||||
/// All intervals that contain this point, in no particular order.
|
||||
///
|
||||
/// We assume that there aren't a lot of overlappingg intervals, so that this vector
|
||||
/// never grows very large. If that assumption doesn't hold, we could keep this ordered
|
||||
/// by the end bound, to speed up `search`. But as long as there are only a few elements,
|
||||
/// a linear search is OK.
|
||||
elements: Vec<Arc<I>>,
|
||||
}
|
||||
|
||||
/// Abstraction for an interval that can be stored in the tree
|
||||
///
|
||||
/// The start bound is inclusive and the end bound is exclusive. End must be greater
|
||||
/// than start.
|
||||
pub trait IntervalItem {
|
||||
type Key: Ord + Copy + Debug + Sized;
|
||||
|
||||
fn start_key(&self) -> Self::Key;
|
||||
fn end_key(&self) -> Self::Key;
|
||||
|
||||
fn bounds(&self) -> Range<Self::Key> {
|
||||
self.start_key()..self.end_key()
|
||||
}
|
||||
}
|
||||
|
||||
impl<I: ?Sized> IntervalTree<I>
|
||||
where
|
||||
I: IntervalItem,
|
||||
{
|
||||
/// Return an element that contains 'key', or precedes it.
|
||||
///
|
||||
/// If there are multiple candidates, returns the one with the highest 'end' key.
|
||||
pub fn search(&self, key: I::Key) -> Option<Arc<I>> {
|
||||
// Find the greatest point that precedes or is equal to the search key. If there is
|
||||
// none, returns None.
|
||||
let (_, p) = self.points.range(..=key).next_back()?;
|
||||
|
||||
// Find the element with the highest end key at this point
|
||||
let highest_item = p
|
||||
.elements
|
||||
.iter()
|
||||
.reduce(|a, b| {
|
||||
// starting with Rust 1.53, could use `std::cmp::min_by_key` here
|
||||
if a.end_key() > b.end_key() {
|
||||
a
|
||||
} else {
|
||||
b
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
Some(Arc::clone(highest_item))
|
||||
}
|
||||
|
||||
/// Iterate over all items with start bound >= 'key'
|
||||
pub fn iter_newer(&self, key: I::Key) -> IntervalIter<I> {
|
||||
IntervalIter {
|
||||
point_iter: self.points.range(key..),
|
||||
elem_iter: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterate over all items
|
||||
pub fn iter(&self) -> IntervalIter<I> {
|
||||
IntervalIter {
|
||||
point_iter: self.points.range(..),
|
||||
elem_iter: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, item: Arc<I>) {
|
||||
let start_key = item.start_key();
|
||||
let end_key = item.end_key();
|
||||
assert!(start_key < end_key);
|
||||
let bounds = start_key..end_key;
|
||||
|
||||
// Find the starting point and walk forward from there
|
||||
let mut found_start_point = false;
|
||||
let iter = self.points.range_mut(bounds);
|
||||
for (point_key, point) in iter {
|
||||
if *point_key == start_key {
|
||||
found_start_point = true;
|
||||
// It is an error to insert the same item to the tree twice.
|
||||
assert!(
|
||||
!point.elements.iter().any(|x| Arc::ptr_eq(x, &item)),
|
||||
"interval is already in the tree"
|
||||
);
|
||||
}
|
||||
point.elements.push(Arc::clone(&item));
|
||||
}
|
||||
if !found_start_point {
|
||||
// Create a new Point for the starting point
|
||||
|
||||
// Look at the previous point, and copy over elements that overlap with this
|
||||
// new point
|
||||
let mut new_elements: Vec<Arc<I>> = Vec::new();
|
||||
if let Some((_, prev_point)) = self.points.range(..start_key).next_back() {
|
||||
let overlapping_prev_elements = prev_point
|
||||
.elements
|
||||
.iter()
|
||||
.filter(|x| x.bounds().contains(&start_key))
|
||||
.cloned();
|
||||
|
||||
new_elements.extend(overlapping_prev_elements);
|
||||
}
|
||||
new_elements.push(item);
|
||||
|
||||
let new_point = Point {
|
||||
elements: new_elements,
|
||||
};
|
||||
self.points.insert(start_key, new_point);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remove(&mut self, item: &Arc<I>) {
|
||||
// range search points
|
||||
let start_key = item.start_key();
|
||||
let end_key = item.end_key();
|
||||
let bounds = start_key..end_key;
|
||||
|
||||
let mut points_to_remove: Vec<I::Key> = Vec::new();
|
||||
let mut found_start_point = false;
|
||||
for (point_key, point) in self.points.range_mut(bounds) {
|
||||
if *point_key == start_key {
|
||||
found_start_point = true;
|
||||
}
|
||||
let len_before = point.elements.len();
|
||||
point.elements.retain(|other| !Arc::ptr_eq(other, item));
|
||||
let len_after = point.elements.len();
|
||||
assert_eq!(len_after + 1, len_before);
|
||||
if len_after == 0 {
|
||||
points_to_remove.push(*point_key);
|
||||
}
|
||||
}
|
||||
assert!(found_start_point);
|
||||
|
||||
for k in points_to_remove {
|
||||
self.points.remove(&k).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct IntervalIter<'a, I: ?Sized>
|
||||
where
|
||||
I: IntervalItem,
|
||||
{
|
||||
point_iter: std::collections::btree_map::Range<'a, I::Key, Point<I>>,
|
||||
elem_iter: Option<(I::Key, std::slice::Iter<'a, Arc<I>>)>,
|
||||
}
|
||||
|
||||
impl<'a, I> Iterator for IntervalIter<'a, I>
|
||||
where
|
||||
I: IntervalItem + ?Sized,
|
||||
{
|
||||
type Item = Arc<I>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
// Iterate over all elements in all the points in 'point_iter'. To avoid
|
||||
// returning the same element twice, we only return each element at its
|
||||
// starting point.
|
||||
loop {
|
||||
// Return next remaining element from the current point
|
||||
if let Some((point_key, elem_iter)) = &mut self.elem_iter {
|
||||
for elem in elem_iter {
|
||||
if elem.start_key() == *point_key {
|
||||
return Some(Arc::clone(elem));
|
||||
}
|
||||
}
|
||||
}
|
||||
// No more elements at this point. Move to next point.
|
||||
if let Some((point_key, point)) = self.point_iter.next() {
|
||||
self.elem_iter = Some((*point_key, point.elements.iter()));
|
||||
continue;
|
||||
} else {
|
||||
// No more points, all done
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<I: ?Sized> Default for IntervalTree<I>
|
||||
where
|
||||
I: IntervalItem,
|
||||
{
|
||||
fn default() -> Self {
|
||||
IntervalTree {
|
||||
points: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::fmt;
|
||||
|
||||
#[derive(Debug)]
|
||||
struct MockItem {
|
||||
start_key: u32,
|
||||
end_key: u32,
|
||||
val: String,
|
||||
}
|
||||
impl IntervalItem for MockItem {
|
||||
type Key = u32;
|
||||
|
||||
fn start_key(&self) -> u32 {
|
||||
self.start_key
|
||||
}
|
||||
fn end_key(&self) -> u32 {
|
||||
self.end_key
|
||||
}
|
||||
}
|
||||
impl MockItem {
|
||||
fn new(start_key: u32, end_key: u32) -> Self {
|
||||
MockItem {
|
||||
start_key,
|
||||
end_key,
|
||||
val: format!("{}-{}", start_key, end_key),
|
||||
}
|
||||
}
|
||||
fn new_str(start_key: u32, end_key: u32, val: &str) -> Self {
|
||||
MockItem {
|
||||
start_key,
|
||||
end_key,
|
||||
val: format!("{}-{}: {}", start_key, end_key, val),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl fmt::Display for MockItem {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "{}", self.val)
|
||||
}
|
||||
}
|
||||
#[rustfmt::skip]
|
||||
fn assert_search(
|
||||
tree: &IntervalTree<MockItem>,
|
||||
key: u32,
|
||||
expected: &[&str],
|
||||
) -> Option<Arc<MockItem>> {
|
||||
if let Some(v) = tree.search(key) {
|
||||
let vstr = v.to_string();
|
||||
|
||||
assert!(!expected.is_empty(), "search with {} returned {}, expected None", key, v);
|
||||
assert!(
|
||||
expected.contains(&vstr.as_str()),
|
||||
"search with {} returned {}, expected one of: {:?}",
|
||||
key, v, expected,
|
||||
);
|
||||
|
||||
Some(v)
|
||||
} else {
|
||||
assert!(
|
||||
expected.is_empty(),
|
||||
"search with {} returned None, expected one of {:?}",
|
||||
key, expected
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn assert_contents(tree: &IntervalTree<MockItem>, expected: &[&str]) {
|
||||
let mut contents: Vec<String> = tree.iter().map(|e| e.to_string()).collect();
|
||||
contents.sort();
|
||||
assert_eq!(contents, expected);
|
||||
}
|
||||
|
||||
fn dump_tree(tree: &IntervalTree<MockItem>) {
|
||||
for (point_key, point) in tree.points.iter() {
|
||||
print!("{}:", point_key);
|
||||
for e in point.elements.iter() {
|
||||
print!(" {}", e);
|
||||
}
|
||||
println!();
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_interval_tree_simple() {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Simple, non-overlapping ranges.
|
||||
tree.insert(Arc::new(MockItem::new(10, 11)));
|
||||
tree.insert(Arc::new(MockItem::new(11, 12)));
|
||||
tree.insert(Arc::new(MockItem::new(12, 13)));
|
||||
tree.insert(Arc::new(MockItem::new(18, 19)));
|
||||
tree.insert(Arc::new(MockItem::new(17, 18)));
|
||||
tree.insert(Arc::new(MockItem::new(15, 16)));
|
||||
|
||||
assert_search(&tree, 9, &[]);
|
||||
assert_search(&tree, 10, &["10-11"]);
|
||||
assert_search(&tree, 11, &["11-12"]);
|
||||
assert_search(&tree, 12, &["12-13"]);
|
||||
assert_search(&tree, 13, &["12-13"]);
|
||||
assert_search(&tree, 14, &["12-13"]);
|
||||
assert_search(&tree, 15, &["15-16"]);
|
||||
assert_search(&tree, 16, &["15-16"]);
|
||||
assert_search(&tree, 17, &["17-18"]);
|
||||
assert_search(&tree, 18, &["18-19"]);
|
||||
assert_search(&tree, 19, &["18-19"]);
|
||||
assert_search(&tree, 20, &["18-19"]);
|
||||
|
||||
// remove a few entries and search around them again
|
||||
tree.remove(&assert_search(&tree, 10, &["10-11"]).unwrap()); // first entry
|
||||
tree.remove(&assert_search(&tree, 12, &["12-13"]).unwrap()); // entry in the middle
|
||||
tree.remove(&assert_search(&tree, 18, &["18-19"]).unwrap()); // last entry
|
||||
assert_search(&tree, 9, &[]);
|
||||
assert_search(&tree, 10, &[]);
|
||||
assert_search(&tree, 11, &["11-12"]);
|
||||
assert_search(&tree, 12, &["11-12"]);
|
||||
assert_search(&tree, 14, &["11-12"]);
|
||||
assert_search(&tree, 15, &["15-16"]);
|
||||
assert_search(&tree, 17, &["17-18"]);
|
||||
assert_search(&tree, 18, &["17-18"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_interval_tree_overlap() {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Overlapping items
|
||||
tree.insert(Arc::new(MockItem::new(22, 24)));
|
||||
tree.insert(Arc::new(MockItem::new(23, 25)));
|
||||
let x24_26 = Arc::new(MockItem::new(24, 26));
|
||||
tree.insert(Arc::clone(&x24_26));
|
||||
let x26_28 = Arc::new(MockItem::new(26, 28));
|
||||
tree.insert(Arc::clone(&x26_28));
|
||||
tree.insert(Arc::new(MockItem::new(25, 27)));
|
||||
|
||||
assert_search(&tree, 22, &["22-24"]);
|
||||
assert_search(&tree, 23, &["22-24", "23-25"]);
|
||||
assert_search(&tree, 24, &["23-25", "24-26"]);
|
||||
assert_search(&tree, 25, &["24-26", "25-27"]);
|
||||
assert_search(&tree, 26, &["25-27", "26-28"]);
|
||||
assert_search(&tree, 27, &["26-28"]);
|
||||
assert_search(&tree, 28, &["26-28"]);
|
||||
assert_search(&tree, 29, &["26-28"]);
|
||||
|
||||
tree.remove(&x24_26);
|
||||
tree.remove(&x26_28);
|
||||
assert_search(&tree, 23, &["22-24", "23-25"]);
|
||||
assert_search(&tree, 24, &["23-25"]);
|
||||
assert_search(&tree, 25, &["25-27"]);
|
||||
assert_search(&tree, 26, &["25-27"]);
|
||||
assert_search(&tree, 27, &["25-27"]);
|
||||
assert_search(&tree, 28, &["25-27"]);
|
||||
assert_search(&tree, 29, &["25-27"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_interval_tree_nested() {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Items containing other items
|
||||
tree.insert(Arc::new(MockItem::new(31, 39)));
|
||||
tree.insert(Arc::new(MockItem::new(32, 34)));
|
||||
tree.insert(Arc::new(MockItem::new(33, 35)));
|
||||
tree.insert(Arc::new(MockItem::new(30, 40)));
|
||||
|
||||
assert_search(&tree, 30, &["30-40"]);
|
||||
assert_search(&tree, 31, &["30-40", "31-39"]);
|
||||
assert_search(&tree, 32, &["30-40", "32-34", "31-39"]);
|
||||
assert_search(&tree, 33, &["30-40", "32-34", "33-35", "31-39"]);
|
||||
assert_search(&tree, 34, &["30-40", "33-35", "31-39"]);
|
||||
assert_search(&tree, 35, &["30-40", "31-39"]);
|
||||
assert_search(&tree, 36, &["30-40", "31-39"]);
|
||||
assert_search(&tree, 37, &["30-40", "31-39"]);
|
||||
assert_search(&tree, 38, &["30-40", "31-39"]);
|
||||
assert_search(&tree, 39, &["30-40"]);
|
||||
assert_search(&tree, 40, &["30-40"]);
|
||||
assert_search(&tree, 41, &["30-40"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_interval_tree_duplicates() {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Duplicate keys
|
||||
let item_a = Arc::new(MockItem::new_str(55, 56, "a"));
|
||||
tree.insert(Arc::clone(&item_a));
|
||||
let item_b = Arc::new(MockItem::new_str(55, 56, "b"));
|
||||
tree.insert(Arc::clone(&item_b));
|
||||
let item_c = Arc::new(MockItem::new_str(55, 56, "c"));
|
||||
tree.insert(Arc::clone(&item_c));
|
||||
let item_d = Arc::new(MockItem::new_str(54, 56, "d"));
|
||||
tree.insert(Arc::clone(&item_d));
|
||||
let item_e = Arc::new(MockItem::new_str(55, 57, "e"));
|
||||
tree.insert(Arc::clone(&item_e));
|
||||
|
||||
dump_tree(&tree);
|
||||
|
||||
assert_search(
|
||||
&tree,
|
||||
55,
|
||||
&["55-56: a", "55-56: b", "55-56: c", "54-56: d", "55-57: e"],
|
||||
);
|
||||
tree.remove(&item_b);
|
||||
dump_tree(&tree);
|
||||
|
||||
assert_contents(&tree, &["54-56: d", "55-56: a", "55-56: c", "55-57: e"]);
|
||||
|
||||
tree.remove(&item_d);
|
||||
dump_tree(&tree);
|
||||
assert_contents(&tree, &["55-56: a", "55-56: c", "55-57: e"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_interval_tree_insert_twice() {
|
||||
let mut tree: IntervalTree<MockItem> = IntervalTree::default();
|
||||
|
||||
// Inserting the same item twice is not cool
|
||||
let item = Arc::new(MockItem::new(1, 2));
|
||||
tree.insert(Arc::clone(&item));
|
||||
tree.insert(Arc::clone(&item)); // fails assertion
|
||||
}
|
||||
}
|
||||
@@ -1,32 +1,29 @@
|
||||
//!
|
||||
//! The layer map tracks what layers exist for all the relishes in a timeline.
|
||||
//! The layer map tracks what layers exist in a timeline.
|
||||
//!
|
||||
//! When the timeline is first accessed, the server lists of all layer files
|
||||
//! in the timelines/<timelineid> directory, and populates this map with
|
||||
//! ImageLayer and DeltaLayer structs corresponding to each file. When new WAL
|
||||
//! is received, we create InMemoryLayers to hold the incoming records. Now and
|
||||
//! then, in the checkpoint() function, the in-memory layers are frozen, forming
|
||||
//! new image and delta layers and corresponding files are written to disk.
|
||||
//! ImageLayer and DeltaLayer structs corresponding to each file. When the first
|
||||
//! new WAL record is received, we create an InMemoryLayer to hold the incoming
|
||||
//! records. Now and then, in the checkpoint() function, the in-memory layer is
|
||||
//! are frozen, and it is split up into new image and delta layers and the
|
||||
//! corresponding files are written to disk.
|
||||
//!
|
||||
|
||||
use crate::layered_repository::interval_tree::{IntervalItem, IntervalIter, IntervalTree};
|
||||
use crate::layered_repository::storage_layer::{Layer, SegmentTag};
|
||||
use crate::layered_repository::storage_layer::Layer;
|
||||
use crate::layered_repository::storage_layer::{range_eq, range_overlaps};
|
||||
use crate::layered_repository::InMemoryLayer;
|
||||
use crate::relish::*;
|
||||
use crate::repository::Key;
|
||||
use anyhow::Result;
|
||||
use lazy_static::lazy_static;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
use std::collections::VecDeque;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use tracing::*;
|
||||
use zenith_metrics::{register_int_gauge, IntGauge};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::global_layer_map::{LayerId, GLOBAL_LAYER_MAP};
|
||||
|
||||
lazy_static! {
|
||||
static ref NUM_INMEMORY_LAYERS: IntGauge =
|
||||
register_int_gauge!("pageserver_inmemory_layers", "Number of layers in memory")
|
||||
.expect("failed to define a metric");
|
||||
static ref NUM_ONDISK_LAYERS: IntGauge =
|
||||
register_int_gauge!("pageserver_ondisk_layers", "Number of layers on-disk")
|
||||
.expect("failed to define a metric");
|
||||
@@ -37,98 +34,147 @@ lazy_static! {
|
||||
///
|
||||
#[derive(Default)]
|
||||
pub struct LayerMap {
|
||||
/// All the layers keyed by segment tag
|
||||
segs: HashMap<SegmentTag, SegEntry>,
|
||||
//
|
||||
// 'open_layer' holds the current InMemoryLayer that is accepting new
|
||||
// records. If it is None, 'next_open_layer_at' will be set instead, indicating
|
||||
// where the start LSN of the next InMemoryLayer that is to be created.
|
||||
//
|
||||
pub open_layer: Option<Arc<InMemoryLayer>>,
|
||||
pub next_open_layer_at: Option<Lsn>,
|
||||
|
||||
/// All in-memory layers, ordered by 'oldest_lsn' and generation
|
||||
/// of each layer. This allows easy access to the in-memory layer that
|
||||
/// contains the oldest WAL record.
|
||||
open_layers: BinaryHeap<OpenLayerEntry>,
|
||||
///
|
||||
/// The frozen layer, if any, contains WAL older than the current 'open_layer'
|
||||
/// or 'next_open_layer_at', but newer than any historic layer. The frozen
|
||||
/// layer is during checkpointing, when an InMemoryLayer is being written out
|
||||
/// to disk.
|
||||
///
|
||||
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
|
||||
|
||||
/// Generation number, used to distinguish newly inserted entries in the
|
||||
/// binary heap from older entries during checkpoint.
|
||||
current_generation: u64,
|
||||
/// All the historic layers are kept here
|
||||
|
||||
/// TODO: This is a placeholder implementation of a data structure
|
||||
/// to hold information about all the layer files on disk and in
|
||||
/// S3. Currently, it's just a vector and all operations perform a
|
||||
/// linear scan over it. That obviously becomes slow as the
|
||||
/// number of layers grows. I'm imagining that an R-tree or some
|
||||
/// other 2D data structure would be the long-term solution here.
|
||||
historic_layers: Vec<Arc<dyn Layer>>,
|
||||
}
|
||||
|
||||
/// Return value of LayerMap::search
|
||||
pub struct SearchResult {
|
||||
pub layer: Arc<dyn Layer>,
|
||||
pub lsn_floor: Lsn,
|
||||
}
|
||||
|
||||
impl LayerMap {
|
||||
///
|
||||
/// Look up a layer using the given segment tag and LSN. This differs from a
|
||||
/// plain key-value lookup in that if there is any layer that covers the
|
||||
/// given LSN, or precedes the given LSN, it is returned. In other words,
|
||||
/// you don't need to know the exact start LSN of the layer.
|
||||
/// Find the latest layer that covers the given 'key', with lsn <
|
||||
/// 'end_lsn'.
|
||||
///
|
||||
pub fn get(&self, tag: &SegmentTag, lsn: Lsn) -> Option<Arc<dyn Layer>> {
|
||||
let segentry = self.segs.get(tag)?;
|
||||
|
||||
segentry.get(lsn)
|
||||
}
|
||||
|
||||
/// Returns the layer, if any, and an 'lsn_floor' value that
|
||||
/// indicates which portion of the layer the caller should
|
||||
/// check. 'lsn_floor' is normally the start-LSN of the layer, but
|
||||
/// can be greater if there is an overlapping layer that might
|
||||
/// contain the version, even if it's missing from the returned
|
||||
/// layer.
|
||||
///
|
||||
/// Get the open layer for given segment for writing. Or None if no open
|
||||
/// layer exists.
|
||||
///
|
||||
pub fn get_open(&self, tag: &SegmentTag) -> Option<Arc<InMemoryLayer>> {
|
||||
let segentry = self.segs.get(tag)?;
|
||||
pub fn search(&self, key: Key, end_lsn: Lsn) -> Result<Option<SearchResult>> {
|
||||
// linear search
|
||||
// Find the latest image layer that covers the given key
|
||||
let mut latest_img: Option<Arc<dyn Layer>> = None;
|
||||
let mut latest_img_lsn: Option<Lsn> = None;
|
||||
for l in self.historic_layers.iter() {
|
||||
if l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
if !l.get_key_range().contains(&key) {
|
||||
continue;
|
||||
}
|
||||
let img_lsn = l.get_lsn_range().start;
|
||||
|
||||
segentry
|
||||
.open_layer_id
|
||||
.and_then(|layer_id| GLOBAL_LAYER_MAP.read().unwrap().get(&layer_id))
|
||||
}
|
||||
if img_lsn >= end_lsn {
|
||||
// too new
|
||||
continue;
|
||||
}
|
||||
if Lsn(img_lsn.0 + 1) == end_lsn {
|
||||
// found exact match
|
||||
return Ok(Some(SearchResult {
|
||||
layer: Arc::clone(l),
|
||||
lsn_floor: img_lsn,
|
||||
}));
|
||||
}
|
||||
if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) {
|
||||
latest_img = Some(Arc::clone(l));
|
||||
latest_img_lsn = Some(img_lsn);
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Insert an open in-memory layer
|
||||
///
|
||||
pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) {
|
||||
let segentry = self.segs.entry(layer.get_seg_tag()).or_default();
|
||||
|
||||
let layer_id = segentry.update_open(Arc::clone(&layer));
|
||||
|
||||
let oldest_lsn = layer.get_oldest_lsn();
|
||||
|
||||
// After a crash and restart, 'oldest_lsn' of the oldest in-memory
|
||||
// layer becomes the WAL streaming starting point, so it better not point
|
||||
// in the middle of a WAL record.
|
||||
assert!(oldest_lsn.is_aligned());
|
||||
|
||||
// Also add it to the binary heap
|
||||
let open_layer_entry = OpenLayerEntry {
|
||||
oldest_lsn: layer.get_oldest_lsn(),
|
||||
layer_id,
|
||||
generation: self.current_generation,
|
||||
};
|
||||
self.open_layers.push(open_layer_entry);
|
||||
|
||||
NUM_INMEMORY_LAYERS.inc();
|
||||
}
|
||||
|
||||
/// Remove an open in-memory layer
|
||||
pub fn remove_open(&mut self, layer_id: LayerId) {
|
||||
// Note: we don't try to remove the entry from the binary heap.
|
||||
// It will be removed lazily by peek_oldest_open() when it's made it to
|
||||
// the top of the heap.
|
||||
|
||||
let layer_opt = {
|
||||
let mut global_map = GLOBAL_LAYER_MAP.write().unwrap();
|
||||
let layer_opt = global_map.get(&layer_id);
|
||||
global_map.remove(&layer_id);
|
||||
// TODO it's bad that a ref can still exist after being evicted from cache
|
||||
layer_opt
|
||||
};
|
||||
|
||||
if let Some(layer) = layer_opt {
|
||||
let mut segentry = self.segs.get_mut(&layer.get_seg_tag()).unwrap();
|
||||
|
||||
if segentry.open_layer_id == Some(layer_id) {
|
||||
// Also remove it from the SegEntry of this segment
|
||||
segentry.open_layer_id = None;
|
||||
} else {
|
||||
// We could have already updated segentry.open for
|
||||
// dropped (non-writeable) layer. This is fine.
|
||||
assert!(!layer.is_writeable());
|
||||
assert!(layer.is_dropped());
|
||||
// Search the delta layers
|
||||
let mut latest_delta: Option<Arc<dyn Layer>> = None;
|
||||
for l in self.historic_layers.iter() {
|
||||
if !l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
if !l.get_key_range().contains(&key) {
|
||||
continue;
|
||||
}
|
||||
|
||||
NUM_INMEMORY_LAYERS.dec();
|
||||
if l.get_lsn_range().start >= end_lsn {
|
||||
// too new
|
||||
continue;
|
||||
}
|
||||
|
||||
if l.get_lsn_range().end >= end_lsn {
|
||||
// this layer contains the requested point in the key/lsn space.
|
||||
// No need to search any further
|
||||
trace!(
|
||||
"found layer {} for request on {} at {}",
|
||||
l.filename().display(),
|
||||
key,
|
||||
end_lsn
|
||||
);
|
||||
latest_delta.replace(Arc::clone(l));
|
||||
break;
|
||||
}
|
||||
// this layer's end LSN is smaller than the requested point. If there's
|
||||
// nothing newer, this is what we need to return. Remember this.
|
||||
if let Some(ref old_candidate) = latest_delta {
|
||||
if l.get_lsn_range().end > old_candidate.get_lsn_range().end {
|
||||
latest_delta.replace(Arc::clone(l));
|
||||
}
|
||||
} else {
|
||||
latest_delta.replace(Arc::clone(l));
|
||||
}
|
||||
}
|
||||
if let Some(l) = latest_delta {
|
||||
trace!(
|
||||
"found (old) layer {} for request on {} at {}",
|
||||
l.filename().display(),
|
||||
key,
|
||||
end_lsn
|
||||
);
|
||||
let lsn_floor = std::cmp::max(
|
||||
Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1),
|
||||
l.get_lsn_range().start,
|
||||
);
|
||||
Ok(Some(SearchResult {
|
||||
lsn_floor,
|
||||
layer: l,
|
||||
}))
|
||||
} else if let Some(l) = latest_img {
|
||||
trace!(
|
||||
"found img layer and no deltas for request on {} at {}",
|
||||
key,
|
||||
end_lsn
|
||||
);
|
||||
Ok(Some(SearchResult {
|
||||
lsn_floor: latest_img_lsn.unwrap(),
|
||||
layer: l,
|
||||
}))
|
||||
} else {
|
||||
trace!("no layer found for request on {} at {}", key, end_lsn);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -136,9 +182,7 @@ impl LayerMap {
|
||||
/// Insert an on-disk layer
|
||||
///
|
||||
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
let segentry = self.segs.entry(layer.get_seg_tag()).or_default();
|
||||
segentry.insert_historic(layer);
|
||||
|
||||
self.historic_layers.push(layer);
|
||||
NUM_ONDISK_LAYERS.inc();
|
||||
}
|
||||
|
||||
@@ -147,61 +191,62 @@ impl LayerMap {
|
||||
///
|
||||
/// This should be called when the corresponding file on disk has been deleted.
|
||||
///
|
||||
#[allow(dead_code)]
|
||||
pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
let tag = layer.get_seg_tag();
|
||||
let len_before = self.historic_layers.len();
|
||||
|
||||
if let Some(segentry) = self.segs.get_mut(&tag) {
|
||||
segentry.historic.remove(&layer);
|
||||
}
|
||||
// FIXME: ptr_eq might fail to return true for 'dyn'
|
||||
// references. Clippy complains about this. In practice it
|
||||
// seems to work, the assertion below would be triggered
|
||||
// otherwise but this ought to be fixed.
|
||||
#[allow(clippy::vtable_address_comparisons)]
|
||||
self.historic_layers
|
||||
.retain(|other| !Arc::ptr_eq(other, &layer));
|
||||
|
||||
assert_eq!(self.historic_layers.len(), len_before - 1);
|
||||
NUM_ONDISK_LAYERS.dec();
|
||||
}
|
||||
|
||||
// List relations along with a flag that marks if they exist at the given lsn.
|
||||
// spcnode 0 and dbnode 0 have special meanings and mean all tabespaces/databases.
|
||||
// Pass Tag if we're only interested in some relations.
|
||||
pub fn list_relishes(&self, tag: Option<RelTag>, lsn: Lsn) -> Result<HashMap<RelishTag, bool>> {
|
||||
let mut rels: HashMap<RelishTag, bool> = HashMap::new();
|
||||
|
||||
for (seg, segentry) in self.segs.iter() {
|
||||
match seg.rel {
|
||||
RelishTag::Relation(reltag) => {
|
||||
if let Some(request_rel) = tag {
|
||||
if (request_rel.spcnode == 0 || reltag.spcnode == request_rel.spcnode)
|
||||
&& (request_rel.dbnode == 0 || reltag.dbnode == request_rel.dbnode)
|
||||
{
|
||||
if let Some(exists) = segentry.exists_at_lsn(lsn)? {
|
||||
rels.insert(seg.rel, exists);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
if tag == None {
|
||||
if let Some(exists) = segentry.exists_at_lsn(lsn)? {
|
||||
rels.insert(seg.rel, exists);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(rels)
|
||||
}
|
||||
|
||||
/// Is there a newer image layer for given segment?
|
||||
///
|
||||
/// This is used for garbage collection, to determine if an old layer can
|
||||
/// be deleted.
|
||||
/// We ignore segments newer than disk_consistent_lsn because they will be removed at restart
|
||||
/// We also only look at historic layers
|
||||
//#[allow(dead_code)]
|
||||
pub fn newer_image_layer_exists(
|
||||
&self,
|
||||
seg: SegmentTag,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
disk_consistent_lsn: Lsn,
|
||||
) -> bool {
|
||||
if let Some(segentry) = self.segs.get(&seg) {
|
||||
segentry.newer_image_layer_exists(lsn, disk_consistent_lsn)
|
||||
} else {
|
||||
false
|
||||
) -> Result<bool> {
|
||||
let mut range_remain = key_range.clone();
|
||||
|
||||
loop {
|
||||
let mut made_progress = false;
|
||||
for l in self.historic_layers.iter() {
|
||||
if l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
let img_lsn = l.get_lsn_range().start;
|
||||
if !l.is_incremental()
|
||||
&& l.get_key_range().contains(&range_remain.start)
|
||||
&& img_lsn > lsn
|
||||
&& img_lsn < disk_consistent_lsn
|
||||
{
|
||||
made_progress = true;
|
||||
let img_key_end = l.get_key_range().end;
|
||||
|
||||
if img_key_end >= range_remain.end {
|
||||
return Ok(true);
|
||||
}
|
||||
range_remain.start = img_key_end;
|
||||
}
|
||||
}
|
||||
|
||||
if !made_progress {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -211,284 +256,148 @@ impl LayerMap {
|
||||
/// used for garbage collection, to determine if some alive layer
|
||||
/// exists at the lsn. If so, we shouldn't delete a newer dropped layer
|
||||
/// to avoid incorrectly making it visible.
|
||||
pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
|
||||
Ok(if let Some(segentry) = self.segs.get(&seg) {
|
||||
segentry.exists_at_lsn(lsn)?.unwrap_or(false)
|
||||
} else {
|
||||
false
|
||||
})
|
||||
/*
|
||||
pub fn layer_exists_at_lsn(&self, seg: SegmentTag, lsn: Lsn) -> Result<bool> {
|
||||
Ok(if let Some(segentry) = self.historic_layers.get(&seg) {
|
||||
segentry.exists_at_lsn(seg, lsn)?.unwrap_or(false)
|
||||
} else {
|
||||
false
|
||||
})
|
||||
}
|
||||
*/
|
||||
|
||||
pub fn iter_historic_layers(&self) -> std::slice::Iter<Arc<dyn Layer>> {
|
||||
self.historic_layers.iter()
|
||||
}
|
||||
|
||||
/// Return the oldest in-memory layer, along with its generation number.
|
||||
pub fn peek_oldest_open(&mut self) -> Option<(LayerId, Arc<InMemoryLayer>, u64)> {
|
||||
let global_map = GLOBAL_LAYER_MAP.read().unwrap();
|
||||
/// Find the last image layer that covers 'key', ignoring any image layers
|
||||
/// newer than 'lsn'.
|
||||
fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option<Arc<dyn Layer>> {
|
||||
let mut candidate_lsn = Lsn(0);
|
||||
let mut candidate = None;
|
||||
for l in self.historic_layers.iter() {
|
||||
if l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
|
||||
while let Some(oldest_entry) = self.open_layers.peek() {
|
||||
if let Some(layer) = global_map.get(&oldest_entry.layer_id) {
|
||||
return Some((oldest_entry.layer_id, layer, oldest_entry.generation));
|
||||
} else {
|
||||
self.open_layers.pop();
|
||||
if !l.get_key_range().contains(&key) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let this_lsn = l.get_lsn_range().start;
|
||||
if this_lsn > lsn {
|
||||
continue;
|
||||
}
|
||||
if this_lsn < candidate_lsn {
|
||||
// our previous candidate was better
|
||||
continue;
|
||||
}
|
||||
candidate_lsn = this_lsn;
|
||||
candidate = Some(Arc::clone(l));
|
||||
}
|
||||
|
||||
candidate
|
||||
}
|
||||
|
||||
///
|
||||
/// Divide the whole given range of keys into sub-ranges based on the latest
|
||||
/// image layer that covers each range. (This is used when creating new
|
||||
/// image layers)
|
||||
///
|
||||
// FIXME: clippy complains that the result type is very complex. She's probably
|
||||
// right...
|
||||
#[allow(clippy::type_complexity)]
|
||||
pub fn image_coverage(
|
||||
&self,
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> Result<Vec<(Range<Key>, Option<Arc<dyn Layer>>)>> {
|
||||
let mut points: Vec<Key>;
|
||||
|
||||
points = vec![key_range.start];
|
||||
for l in self.historic_layers.iter() {
|
||||
if l.get_lsn_range().start > lsn {
|
||||
continue;
|
||||
}
|
||||
let range = l.get_key_range();
|
||||
if key_range.contains(&range.start) {
|
||||
points.push(l.get_key_range().start);
|
||||
}
|
||||
if key_range.contains(&range.end) {
|
||||
points.push(l.get_key_range().end);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
points.push(key_range.end);
|
||||
|
||||
/// Increment the generation number used to stamp open in-memory layers. Layers
|
||||
/// added with `insert_open` after this call will be associated with the new
|
||||
/// generation. Returns the new generation number.
|
||||
pub fn increment_generation(&mut self) -> u64 {
|
||||
self.current_generation += 1;
|
||||
self.current_generation
|
||||
}
|
||||
points.sort();
|
||||
points.dedup();
|
||||
|
||||
pub fn iter_historic_layers(&self) -> HistoricLayerIter {
|
||||
HistoricLayerIter {
|
||||
seg_iter: self.segs.iter(),
|
||||
iter: None,
|
||||
// Ok, we now have a list of "interesting" points in the key space
|
||||
|
||||
// For each range between the points, find the latest image
|
||||
let mut start = *points.first().unwrap();
|
||||
let mut ranges = Vec::new();
|
||||
for end in points[1..].iter() {
|
||||
let img = self.find_latest_image(start, lsn);
|
||||
|
||||
ranges.push((start..*end, img));
|
||||
|
||||
start = *end;
|
||||
}
|
||||
Ok(ranges)
|
||||
}
|
||||
|
||||
/// Count how many L1 delta layers there are that overlap with the
|
||||
/// given key and LSN range.
|
||||
pub fn count_deltas(&self, key_range: &Range<Key>, lsn_range: &Range<Lsn>) -> Result<usize> {
|
||||
let mut result = 0;
|
||||
for l in self.historic_layers.iter() {
|
||||
if !l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
if !range_overlaps(&l.get_lsn_range(), lsn_range) {
|
||||
continue;
|
||||
}
|
||||
if !range_overlaps(&l.get_key_range(), key_range) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// We ignore level0 delta layers. Unless the whole keyspace fits
|
||||
// into one partition
|
||||
if !range_eq(key_range, &(Key::MIN..Key::MAX))
|
||||
&& range_eq(&l.get_key_range(), &(Key::MIN..Key::MAX))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
result += 1;
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Return all L0 delta layers
|
||||
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<dyn Layer>>> {
|
||||
let mut deltas = Vec::new();
|
||||
for l in self.historic_layers.iter() {
|
||||
if !l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
if l.get_key_range() != (Key::MIN..Key::MAX) {
|
||||
continue;
|
||||
}
|
||||
deltas.push(Arc::clone(l));
|
||||
}
|
||||
Ok(deltas)
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer map
|
||||
#[allow(unused)]
|
||||
pub fn dump(&self) -> Result<()> {
|
||||
println!("Begin dump LayerMap");
|
||||
for (seg, segentry) in self.segs.iter() {
|
||||
if let Some(open) = &segentry.open_layer_id {
|
||||
if let Some(layer) = GLOBAL_LAYER_MAP.read().unwrap().get(open) {
|
||||
layer.dump()?;
|
||||
} else {
|
||||
println!("layer not found in global map");
|
||||
}
|
||||
}
|
||||
|
||||
for layer in segentry.historic.iter() {
|
||||
layer.dump()?;
|
||||
}
|
||||
for layer in self.historic_layers.iter() {
|
||||
layer.dump()?;
|
||||
}
|
||||
println!("End dump LayerMap");
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl IntervalItem for dyn Layer {
|
||||
type Key = Lsn;
|
||||
|
||||
fn start_key(&self) -> Lsn {
|
||||
self.get_start_lsn()
|
||||
}
|
||||
fn end_key(&self) -> Lsn {
|
||||
self.get_end_lsn()
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Per-segment entry in the LayerMap::segs hash map. Holds all the layers
|
||||
/// associated with the segment.
|
||||
///
|
||||
/// The last layer that is open for writes is always an InMemoryLayer,
|
||||
/// and is kept in a separate field, because there can be only one for
|
||||
/// each segment. The older layers, stored on disk, are kept in an
|
||||
/// IntervalTree.
|
||||
#[derive(Default)]
|
||||
struct SegEntry {
|
||||
open_layer_id: Option<LayerId>,
|
||||
historic: IntervalTree<dyn Layer>,
|
||||
}
|
||||
|
||||
impl SegEntry {
|
||||
/// Does the segment exist at given LSN?
|
||||
/// Return None if object is not found in this SegEntry.
|
||||
fn exists_at_lsn(&self, lsn: Lsn) -> Result<Option<bool>> {
|
||||
if let Some(layer) = self.get(lsn) {
|
||||
Ok(Some(layer.get_seg_exists(lsn)?))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, lsn: Lsn) -> Option<Arc<dyn Layer>> {
|
||||
if let Some(open_layer_id) = &self.open_layer_id {
|
||||
let open_layer = GLOBAL_LAYER_MAP.read().unwrap().get(open_layer_id)?;
|
||||
if open_layer.get_start_lsn() <= lsn {
|
||||
return Some(open_layer);
|
||||
}
|
||||
}
|
||||
|
||||
self.historic.search(lsn)
|
||||
}
|
||||
|
||||
pub fn newer_image_layer_exists(&self, lsn: Lsn, disk_consistent_lsn: Lsn) -> bool {
|
||||
// We only check on-disk layers, because
|
||||
// in-memory layers are not durable
|
||||
|
||||
// The end-LSN is exclusive, while disk_consistent_lsn is
|
||||
// inclusive. For example, if disk_consistent_lsn is 100, it is
|
||||
// OK for a delta layer to have end LSN 101, but if the end LSN
|
||||
// is 102, then it might not have been fully flushed to disk
|
||||
// before crash.
|
||||
self.historic
|
||||
.iter_newer(lsn)
|
||||
.any(|layer| !layer.is_incremental() && layer.get_end_lsn() <= disk_consistent_lsn + 1)
|
||||
}
|
||||
|
||||
// Set new open layer for a SegEntry.
|
||||
// It's ok to rewrite previous open layer,
|
||||
// but only if it is not writeable anymore.
|
||||
pub fn update_open(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
|
||||
if let Some(prev_open_layer_id) = &self.open_layer_id {
|
||||
if let Some(prev_open_layer) = GLOBAL_LAYER_MAP.read().unwrap().get(prev_open_layer_id)
|
||||
{
|
||||
assert!(!prev_open_layer.is_writeable());
|
||||
}
|
||||
}
|
||||
let open_layer_id = GLOBAL_LAYER_MAP.write().unwrap().insert(layer);
|
||||
self.open_layer_id = Some(open_layer_id);
|
||||
open_layer_id
|
||||
}
|
||||
|
||||
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
self.historic.insert(layer);
|
||||
}
|
||||
}
|
||||
|
||||
/// Entry held in LayerMap::open_layers, with boilerplate comparison routines
|
||||
/// to implement a min-heap ordered by 'oldest_lsn' and 'generation'
|
||||
///
|
||||
/// The generation number associated with each entry can be used to distinguish
|
||||
/// recently-added entries (i.e after last call to increment_generation()) from older
|
||||
/// entries with the same 'oldest_lsn'.
|
||||
struct OpenLayerEntry {
|
||||
oldest_lsn: Lsn, // copy of layer.get_oldest_lsn()
|
||||
generation: u64,
|
||||
layer_id: LayerId,
|
||||
}
|
||||
impl Ord for OpenLayerEntry {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
// BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
|
||||
// to get that. Entries with identical oldest_lsn are ordered by generation
|
||||
other
|
||||
.oldest_lsn
|
||||
.cmp(&self.oldest_lsn)
|
||||
.then_with(|| other.generation.cmp(&self.generation))
|
||||
}
|
||||
}
|
||||
impl PartialOrd for OpenLayerEntry {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
impl PartialEq for OpenLayerEntry {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.cmp(other) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
impl Eq for OpenLayerEntry {}
|
||||
|
||||
/// Iterator returned by LayerMap::iter_historic_layers()
|
||||
pub struct HistoricLayerIter<'a> {
|
||||
seg_iter: std::collections::hash_map::Iter<'a, SegmentTag, SegEntry>,
|
||||
iter: Option<IntervalIter<'a, dyn Layer>>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for HistoricLayerIter<'a> {
|
||||
type Item = Arc<dyn Layer>;
|
||||
|
||||
fn next(&mut self) -> std::option::Option<<Self as std::iter::Iterator>::Item> {
|
||||
loop {
|
||||
if let Some(x) = &mut self.iter {
|
||||
if let Some(x) = x.next() {
|
||||
return Some(Arc::clone(&x));
|
||||
}
|
||||
}
|
||||
if let Some((_tag, segentry)) = self.seg_iter.next() {
|
||||
self.iter = Some(segentry.historic.iter());
|
||||
continue;
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::config::PageServerConf;
|
||||
use std::str::FromStr;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
/// Arbitrary relation tag, for testing.
|
||||
const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 111,
|
||||
relnode: 1000,
|
||||
forknum: 0,
|
||||
});
|
||||
|
||||
lazy_static! {
|
||||
static ref DUMMY_TIMELINEID: ZTimelineId =
|
||||
ZTimelineId::from_str("00000000000000000000000000000000").unwrap();
|
||||
static ref DUMMY_TENANTID: ZTenantId =
|
||||
ZTenantId::from_str("00000000000000000000000000000000").unwrap();
|
||||
}
|
||||
|
||||
/// Construct a dummy InMemoryLayer for testing
|
||||
fn dummy_inmem_layer(
|
||||
conf: &'static PageServerConf,
|
||||
segno: u32,
|
||||
start_lsn: Lsn,
|
||||
oldest_lsn: Lsn,
|
||||
) -> Arc<InMemoryLayer> {
|
||||
Arc::new(
|
||||
InMemoryLayer::create(
|
||||
conf,
|
||||
*DUMMY_TIMELINEID,
|
||||
*DUMMY_TENANTID,
|
||||
SegmentTag {
|
||||
rel: TESTREL_A,
|
||||
segno,
|
||||
},
|
||||
start_lsn,
|
||||
oldest_lsn,
|
||||
)
|
||||
.unwrap(),
|
||||
)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_open_layers() -> Result<()> {
|
||||
let conf = PageServerConf::dummy_conf(PageServerConf::test_repo_dir("dummy_inmem_layer"));
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
std::fs::create_dir_all(conf.timeline_path(&DUMMY_TIMELINEID, &DUMMY_TENANTID))?;
|
||||
|
||||
let mut layers = LayerMap::default();
|
||||
|
||||
let gen1 = layers.increment_generation();
|
||||
layers.insert_open(dummy_inmem_layer(conf, 0, Lsn(0x100), Lsn(0x100)));
|
||||
layers.insert_open(dummy_inmem_layer(conf, 1, Lsn(0x100), Lsn(0x200)));
|
||||
layers.insert_open(dummy_inmem_layer(conf, 2, Lsn(0x100), Lsn(0x120)));
|
||||
layers.insert_open(dummy_inmem_layer(conf, 3, Lsn(0x100), Lsn(0x110)));
|
||||
|
||||
let gen2 = layers.increment_generation();
|
||||
layers.insert_open(dummy_inmem_layer(conf, 4, Lsn(0x100), Lsn(0x110)));
|
||||
layers.insert_open(dummy_inmem_layer(conf, 5, Lsn(0x100), Lsn(0x100)));
|
||||
|
||||
// A helper function (closure) to pop the next oldest open entry from the layer map,
|
||||
// and assert that it is what we'd expect
|
||||
let mut assert_pop_layer = |expected_segno: u32, expected_generation: u64| {
|
||||
let (layer_id, l, generation) = layers.peek_oldest_open().unwrap();
|
||||
assert!(l.get_seg_tag().segno == expected_segno);
|
||||
assert!(generation == expected_generation);
|
||||
layers.remove_open(layer_id);
|
||||
};
|
||||
|
||||
assert_pop_layer(0, gen1); // 0x100
|
||||
assert_pop_layer(5, gen2); // 0x100
|
||||
assert_pop_layer(3, gen1); // 0x110
|
||||
assert_pop_layer(4, gen2); // 0x110
|
||||
assert_pop_layer(2, gen1); // 0x120
|
||||
assert_pop_layer(1, gen1); // 0x200
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,9 +6,10 @@
|
||||
//!
|
||||
//! The module contains all structs and related helper methods related to timeline metadata.
|
||||
|
||||
use std::{convert::TryInto, path::PathBuf};
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::ensure;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use zenith_utils::{
|
||||
bin_ser::BeSer,
|
||||
lsn::Lsn,
|
||||
@@ -16,11 +17,13 @@ use zenith_utils::{
|
||||
};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::STORAGE_FORMAT_VERSION;
|
||||
|
||||
// Taken from PG_CONTROL_MAX_SAFE_SIZE
|
||||
const METADATA_MAX_SAFE_SIZE: usize = 512;
|
||||
const METADATA_CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
|
||||
const METADATA_MAX_DATA_SIZE: usize = METADATA_MAX_SAFE_SIZE - METADATA_CHECKSUM_SIZE;
|
||||
/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic.
|
||||
///
|
||||
/// This is the same assumption that PostgreSQL makes with the control file,
|
||||
/// see PG_CONTROL_MAX_SAFE_SIZE
|
||||
const METADATA_MAX_SIZE: usize = 512;
|
||||
|
||||
/// The name of the metadata file pageserver creates per timeline.
|
||||
pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
@@ -30,6 +33,20 @@ pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
/// The fields correspond to the values we hold in memory, in LayeredTimeline.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct TimelineMetadata {
|
||||
hdr: TimelineMetadataHeader,
|
||||
body: TimelineMetadataBody,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
struct TimelineMetadataHeader {
|
||||
checksum: u32, // CRC of serialized metadata body
|
||||
size: u16, // size of serialized metadata
|
||||
format_version: u16, // storage format version (used for compatibility checks)
|
||||
}
|
||||
const METADATA_HDR_SIZE: usize = std::mem::size_of::<TimelineMetadataHeader>();
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
struct TimelineMetadataBody {
|
||||
disk_consistent_lsn: Lsn,
|
||||
// This is only set if we know it. We track it in memory when the page
|
||||
// server is running, but we only track the value corresponding to
|
||||
@@ -69,130 +86,90 @@ impl TimelineMetadata {
|
||||
initdb_lsn: Lsn,
|
||||
) -> Self {
|
||||
Self {
|
||||
disk_consistent_lsn,
|
||||
prev_record_lsn,
|
||||
ancestor_timeline,
|
||||
ancestor_lsn,
|
||||
latest_gc_cutoff_lsn,
|
||||
initdb_lsn,
|
||||
hdr: TimelineMetadataHeader {
|
||||
checksum: 0,
|
||||
size: 0,
|
||||
format_version: STORAGE_FORMAT_VERSION,
|
||||
},
|
||||
body: TimelineMetadataBody {
|
||||
disk_consistent_lsn,
|
||||
prev_record_lsn,
|
||||
ancestor_timeline,
|
||||
ancestor_lsn,
|
||||
latest_gc_cutoff_lsn,
|
||||
initdb_lsn,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
|
||||
ensure!(
|
||||
metadata_bytes.len() == METADATA_MAX_SAFE_SIZE,
|
||||
metadata_bytes.len() == METADATA_MAX_SIZE,
|
||||
"metadata bytes size is wrong"
|
||||
);
|
||||
|
||||
let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
|
||||
let calculated_checksum = crc32c::crc32c(data);
|
||||
|
||||
let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
|
||||
let expected_checksum = u32::from_le_bytes(*checksum_bytes);
|
||||
let hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?;
|
||||
ensure!(
|
||||
calculated_checksum == expected_checksum,
|
||||
hdr.format_version == STORAGE_FORMAT_VERSION,
|
||||
"format version mismatch"
|
||||
);
|
||||
let metadata_size = hdr.size as usize;
|
||||
ensure!(
|
||||
metadata_size <= METADATA_MAX_SIZE,
|
||||
"corrupted metadata file"
|
||||
);
|
||||
let calculated_checksum = crc32c::crc32c(&metadata_bytes[METADATA_HDR_SIZE..metadata_size]);
|
||||
ensure!(
|
||||
hdr.checksum == calculated_checksum,
|
||||
"metadata checksum mismatch"
|
||||
);
|
||||
let body = TimelineMetadataBody::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?;
|
||||
ensure!(
|
||||
body.disk_consistent_lsn.is_aligned(),
|
||||
"disk_consistent_lsn is not aligned"
|
||||
);
|
||||
|
||||
let data = TimelineMetadata::from(serialize::DeTimelineMetadata::des_prefix(data)?);
|
||||
ensure!(data.disk_consistent_lsn.is_aligned());
|
||||
|
||||
Ok(data)
|
||||
Ok(TimelineMetadata { hdr, body })
|
||||
}
|
||||
|
||||
pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
|
||||
let serializeable_metadata = serialize::SeTimelineMetadata::from(self);
|
||||
let mut metadata_bytes = serialize::SeTimelineMetadata::ser(&serializeable_metadata)?;
|
||||
ensure!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
|
||||
metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
|
||||
|
||||
let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
|
||||
let body_bytes = self.body.ser()?;
|
||||
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
|
||||
let hdr = TimelineMetadataHeader {
|
||||
size: metadata_size as u16,
|
||||
format_version: STORAGE_FORMAT_VERSION,
|
||||
checksum: crc32c::crc32c(&body_bytes),
|
||||
};
|
||||
let hdr_bytes = hdr.ser()?;
|
||||
let mut metadata_bytes = vec![0u8; METADATA_MAX_SIZE];
|
||||
metadata_bytes[0..METADATA_HDR_SIZE].copy_from_slice(&hdr_bytes);
|
||||
metadata_bytes[METADATA_HDR_SIZE..metadata_size].copy_from_slice(&body_bytes);
|
||||
Ok(metadata_bytes)
|
||||
}
|
||||
|
||||
/// [`Lsn`] that corresponds to the corresponding timeline directory
|
||||
/// contents, stored locally in the pageserver workdir.
|
||||
pub fn disk_consistent_lsn(&self) -> Lsn {
|
||||
self.disk_consistent_lsn
|
||||
self.body.disk_consistent_lsn
|
||||
}
|
||||
|
||||
pub fn prev_record_lsn(&self) -> Option<Lsn> {
|
||||
self.prev_record_lsn
|
||||
self.body.prev_record_lsn
|
||||
}
|
||||
|
||||
pub fn ancestor_timeline(&self) -> Option<ZTimelineId> {
|
||||
self.ancestor_timeline
|
||||
self.body.ancestor_timeline
|
||||
}
|
||||
|
||||
pub fn ancestor_lsn(&self) -> Lsn {
|
||||
self.ancestor_lsn
|
||||
self.body.ancestor_lsn
|
||||
}
|
||||
|
||||
pub fn latest_gc_cutoff_lsn(&self) -> Lsn {
|
||||
self.latest_gc_cutoff_lsn
|
||||
self.body.latest_gc_cutoff_lsn
|
||||
}
|
||||
|
||||
pub fn initdb_lsn(&self) -> Lsn {
|
||||
self.initdb_lsn
|
||||
}
|
||||
}
|
||||
|
||||
/// This module is for direct conversion of metadata to bytes and back.
|
||||
/// For a certain metadata, besides the conversion a few verification steps has to
|
||||
/// be done, so all serde derives are hidden from the user, to avoid accidental
|
||||
/// verification-less metadata creation.
|
||||
mod serialize {
|
||||
use serde::{Deserialize, Serialize};
|
||||
use zenith_utils::{lsn::Lsn, zid::ZTimelineId};
|
||||
|
||||
use super::TimelineMetadata;
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub(super) struct SeTimelineMetadata<'a> {
|
||||
disk_consistent_lsn: &'a Lsn,
|
||||
prev_record_lsn: &'a Option<Lsn>,
|
||||
ancestor_timeline: &'a Option<ZTimelineId>,
|
||||
ancestor_lsn: &'a Lsn,
|
||||
latest_gc_cutoff_lsn: &'a Lsn,
|
||||
initdb_lsn: &'a Lsn,
|
||||
}
|
||||
|
||||
impl<'a> From<&'a TimelineMetadata> for SeTimelineMetadata<'a> {
|
||||
fn from(other: &'a TimelineMetadata) -> Self {
|
||||
Self {
|
||||
disk_consistent_lsn: &other.disk_consistent_lsn,
|
||||
prev_record_lsn: &other.prev_record_lsn,
|
||||
ancestor_timeline: &other.ancestor_timeline,
|
||||
ancestor_lsn: &other.ancestor_lsn,
|
||||
latest_gc_cutoff_lsn: &other.latest_gc_cutoff_lsn,
|
||||
initdb_lsn: &other.initdb_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
pub(super) struct DeTimelineMetadata {
|
||||
disk_consistent_lsn: Lsn,
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
ancestor_timeline: Option<ZTimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
initdb_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl From<DeTimelineMetadata> for TimelineMetadata {
|
||||
fn from(other: DeTimelineMetadata) -> Self {
|
||||
Self {
|
||||
disk_consistent_lsn: other.disk_consistent_lsn,
|
||||
prev_record_lsn: other.prev_record_lsn,
|
||||
ancestor_timeline: other.ancestor_timeline,
|
||||
ancestor_lsn: other.ancestor_lsn,
|
||||
latest_gc_cutoff_lsn: other.latest_gc_cutoff_lsn,
|
||||
initdb_lsn: other.initdb_lsn,
|
||||
}
|
||||
}
|
||||
self.body.initdb_lsn
|
||||
}
|
||||
}
|
||||
|
||||
@@ -204,14 +181,14 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn metadata_serializes_correctly() {
|
||||
let original_metadata = TimelineMetadata {
|
||||
disk_consistent_lsn: Lsn(0x200),
|
||||
prev_record_lsn: Some(Lsn(0x100)),
|
||||
ancestor_timeline: Some(TIMELINE_ID),
|
||||
ancestor_lsn: Lsn(0),
|
||||
latest_gc_cutoff_lsn: Lsn(0),
|
||||
initdb_lsn: Lsn(0),
|
||||
};
|
||||
let original_metadata = TimelineMetadata::new(
|
||||
Lsn(0x200),
|
||||
Some(Lsn(0x100)),
|
||||
Some(TIMELINE_ID),
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
);
|
||||
|
||||
let metadata_bytes = original_metadata
|
||||
.to_bytes()
|
||||
@@ -221,7 +198,7 @@ mod tests {
|
||||
.expect("Should deserialize its own bytes");
|
||||
|
||||
assert_eq!(
|
||||
deserialized_metadata, original_metadata,
|
||||
deserialized_metadata.body, original_metadata.body,
|
||||
"Metadata that was serialized to bytes and deserialized back should not change"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -2,139 +2,102 @@
|
||||
//! Common traits and structs for layers
|
||||
//!
|
||||
|
||||
use crate::relish::RelishTag;
|
||||
use crate::repository::{BlockNumber, ZenithWalRecord};
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::walrecord::ZenithWalRecord;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
// Size of one segment in pages (10 MB)
|
||||
pub const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192;
|
||||
|
||||
///
|
||||
/// Each relish stored in the repository is divided into fixed-sized "segments",
|
||||
/// with 10 MB of key-space, or 1280 8k pages each.
|
||||
///
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct SegmentTag {
|
||||
pub rel: RelishTag,
|
||||
pub segno: u32,
|
||||
}
|
||||
|
||||
/// SegmentBlk represents a block number within a segment, or the size of segment.
|
||||
///
|
||||
/// This is separate from BlockNumber, which is used for block number within the
|
||||
/// whole relish. Since this is just a type alias, the compiler will let you mix
|
||||
/// them freely, but we use the type alias as documentation to make it clear
|
||||
/// which one we're dealing with.
|
||||
///
|
||||
/// (We could turn this into "struct SegmentBlk(u32)" to forbid accidentally
|
||||
/// assigning a BlockNumber to SegmentBlk or vice versa, but that makes
|
||||
/// operations more verbose).
|
||||
pub type SegmentBlk = u32;
|
||||
|
||||
impl fmt::Display for SegmentTag {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "{}.{}", self.rel, self.segno)
|
||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
where
|
||||
T: PartialOrd<T>,
|
||||
{
|
||||
if a.start < b.start {
|
||||
a.end > b.start
|
||||
} else {
|
||||
b.end > a.start
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentTag {
|
||||
/// Given a relish and block number, calculate the corresponding segment and
|
||||
/// block number within the segment.
|
||||
pub const fn from_blknum(rel: RelishTag, blknum: BlockNumber) -> (SegmentTag, SegmentBlk) {
|
||||
(
|
||||
SegmentTag {
|
||||
rel,
|
||||
segno: blknum / RELISH_SEG_SIZE,
|
||||
},
|
||||
blknum % RELISH_SEG_SIZE,
|
||||
)
|
||||
}
|
||||
pub fn range_eq<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
where
|
||||
T: PartialEq<T>,
|
||||
{
|
||||
a.start == b.start && a.end == b.end
|
||||
}
|
||||
|
||||
/// Struct used to communicate across calls to 'get_value_reconstruct_data'.
|
||||
///
|
||||
/// Represents a version of a page at a specific LSN. The LSN is the key of the
|
||||
/// entry in the 'page_versions' hash, it is not duplicated here.
|
||||
/// Before first call, you can fill in 'page_img' if you have an older cached
|
||||
/// version of the page available. That can save work in
|
||||
/// 'get_value_reconstruct_data', as it can stop searching for page versions
|
||||
/// when all the WAL records going back to the cached image have been collected.
|
||||
///
|
||||
/// A page version can be stored as a full page image, or as WAL record that needs
|
||||
/// to be applied over the previous page version to reconstruct this version.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum PageVersion {
|
||||
Page(Bytes),
|
||||
Wal(ZenithWalRecord),
|
||||
}
|
||||
|
||||
///
|
||||
/// Struct used to communicate across calls to 'get_page_reconstruct_data'.
|
||||
///
|
||||
/// Before first call to get_page_reconstruct_data, you can fill in 'page_img'
|
||||
/// if you have an older cached version of the page available. That can save
|
||||
/// work in 'get_page_reconstruct_data', as it can stop searching for page
|
||||
/// versions when all the WAL records going back to the cached image have been
|
||||
/// collected.
|
||||
///
|
||||
/// When get_page_reconstruct_data returns Complete, 'page_img' is set to an
|
||||
/// image of the page, or the oldest WAL record in 'records' is a will_init-type
|
||||
/// When get_value_reconstruct_data returns Complete, 'img' is set to an image
|
||||
/// of the page, or the oldest WAL record in 'records' is a will_init-type
|
||||
/// record that initializes the page without requiring a previous image.
|
||||
///
|
||||
/// If 'get_page_reconstruct_data' returns Continue, some 'records' may have
|
||||
/// been collected, but there are more records outside the current layer. Pass
|
||||
/// the same PageReconstructData struct in the next 'get_page_reconstruct_data'
|
||||
/// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
|
||||
/// call, to collect more records.
|
||||
///
|
||||
pub struct PageReconstructData {
|
||||
#[derive(Debug)]
|
||||
pub struct ValueReconstructState {
|
||||
pub records: Vec<(Lsn, ZenithWalRecord)>,
|
||||
pub page_img: Option<(Lsn, Bytes)>,
|
||||
pub img: Option<(Lsn, Bytes)>,
|
||||
}
|
||||
|
||||
/// Return value from Layer::get_page_reconstruct_data
|
||||
pub enum PageReconstructResult {
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum ValueReconstructResult {
|
||||
/// Got all the data needed to reconstruct the requested page
|
||||
Complete,
|
||||
/// This layer didn't contain all the required data, the caller should look up
|
||||
/// the predecessor layer at the returned LSN and collect more data from there.
|
||||
Continue(Lsn),
|
||||
Continue,
|
||||
|
||||
/// This layer didn't contain data needed to reconstruct the page version at
|
||||
/// the returned LSN. This is usually considered an error, but might be OK
|
||||
/// in some circumstances.
|
||||
Missing(Lsn),
|
||||
Missing,
|
||||
}
|
||||
|
||||
/// A Layer contains all data in a "rectangle" consisting of a range of keys and
|
||||
/// range of LSNs.
|
||||
///
|
||||
/// A Layer corresponds to one RELISH_SEG_SIZE slice of a relish in a range of LSNs.
|
||||
/// There are two kinds of layers, in-memory and on-disk layers. In-memory
|
||||
/// layers are used to ingest incoming WAL, and provide fast access
|
||||
/// to the recent page versions. On-disk layers are stored as files on disk, and
|
||||
/// are immutable. This trait presents the common functionality of
|
||||
/// in-memory and on-disk layers.
|
||||
/// layers are used to ingest incoming WAL, and provide fast access to the
|
||||
/// recent page versions. On-disk layers are stored as files on disk, and are
|
||||
/// immutable. This trait presents the common functionality of in-memory and
|
||||
/// on-disk layers.
|
||||
///
|
||||
/// Furthermore, there are two kinds of on-disk layers: delta and image layers.
|
||||
/// A delta layer contains all modifications within a range of LSNs and keys.
|
||||
/// An image layer is a snapshot of all the data in a key-range, at a single
|
||||
/// LSN
|
||||
///
|
||||
pub trait Layer: Send + Sync {
|
||||
fn get_tenant_id(&self) -> ZTenantId;
|
||||
|
||||
/// Identify the timeline this relish belongs to
|
||||
/// Identify the timeline this layer belongs to
|
||||
fn get_timeline_id(&self) -> ZTimelineId;
|
||||
|
||||
/// Identify the relish segment
|
||||
fn get_seg_tag(&self) -> SegmentTag;
|
||||
/// Range of segments that this layer covers
|
||||
fn get_key_range(&self) -> Range<Key>;
|
||||
|
||||
/// Inclusive start bound of the LSN range that this layer holds
|
||||
fn get_start_lsn(&self) -> Lsn;
|
||||
|
||||
/// Exclusive end bound of the LSN range that this layer holds.
|
||||
///
|
||||
/// - For an open in-memory layer, this is MAX_LSN.
|
||||
/// - For a frozen in-memory layer or a delta layer, this is a valid end bound.
|
||||
/// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
|
||||
fn get_end_lsn(&self) -> Lsn;
|
||||
|
||||
/// Is the segment represented by this layer dropped by PostgreSQL?
|
||||
fn is_dropped(&self) -> bool;
|
||||
fn get_lsn_range(&self) -> Range<Lsn>;
|
||||
|
||||
/// Filename used to store this layer on disk. (Even in-memory layers
|
||||
/// implement this, to print a handy unique identifier for the layer for
|
||||
@@ -153,18 +116,12 @@ pub trait Layer: Send + Sync {
|
||||
/// is available. If this returns PageReconstructResult::Continue, look up
|
||||
/// the predecessor layer and call again with the same 'reconstruct_data' to
|
||||
/// collect more data.
|
||||
fn get_page_reconstruct_data(
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
blknum: SegmentBlk,
|
||||
lsn: Lsn,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult>;
|
||||
|
||||
/// Return size of the segment at given LSN. (Only for blocky relations.)
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<SegmentBlk>;
|
||||
|
||||
/// Does the segment exist at given LSN? Or was it dropped before it.
|
||||
fn get_seg_exists(&self, lsn: Lsn) -> Result<bool>;
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_data: &mut ValueReconstructState,
|
||||
) -> Result<ValueReconstructResult>;
|
||||
|
||||
/// Does this layer only contain some data for the segment (incremental),
|
||||
/// or does it contain a version of every page? This is important to know
|
||||
@@ -175,6 +132,9 @@ pub trait Layer: Send + Sync {
|
||||
/// Returns true for layers that are represented in memory.
|
||||
fn is_in_memory(&self) -> bool;
|
||||
|
||||
/// Iterate through all keys and values stored in the layer
|
||||
fn iter(&self) -> Box<dyn Iterator<Item = Result<(Key, Lsn, Value)>> + '_>;
|
||||
|
||||
/// Release memory used by this layer. There is no corresponding 'load'
|
||||
/// function, that's done implicitly when you call one of the get-functions.
|
||||
fn unload(&self) -> Result<()>;
|
||||
@@ -185,3 +145,36 @@ pub trait Layer: Send + Sync {
|
||||
/// Dump summary of the contents of the layer to stdout
|
||||
fn dump(&self) -> Result<()>;
|
||||
}
|
||||
|
||||
// Flag indicating that this version initialize the page
|
||||
const WILL_INIT: u64 = 1;
|
||||
|
||||
///
|
||||
/// Struct representing reference to BLOB in layers. Reference contains BLOB offset and size.
|
||||
/// For WAL records (delta layer) it also contains `will_init` flag which helps to determine range of records
|
||||
/// which needs to be applied without reading/deserializing records themselves.
|
||||
///
|
||||
#[derive(Debug, Serialize, Deserialize, Copy, Clone)]
|
||||
pub struct BlobRef(u64);
|
||||
|
||||
impl BlobRef {
|
||||
pub fn will_init(&self) -> bool {
|
||||
(self.0 & WILL_INIT) != 0
|
||||
}
|
||||
|
||||
pub fn pos(&self) -> u64 {
|
||||
self.0 >> 32
|
||||
}
|
||||
|
||||
pub fn size(&self) -> usize {
|
||||
((self.0 & 0xFFFFFFFF) >> 1) as usize
|
||||
}
|
||||
|
||||
pub fn new(pos: u64, size: usize, will_init: bool) -> BlobRef {
|
||||
let mut blob_ref = (pos << 32) | ((size as u64) << 1);
|
||||
if will_init {
|
||||
blob_ref |= WILL_INIT;
|
||||
}
|
||||
BlobRef(blob_ref)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,10 +2,12 @@ pub mod basebackup;
|
||||
pub mod config;
|
||||
pub mod http;
|
||||
pub mod import_datadir;
|
||||
pub mod keyspace;
|
||||
pub mod layered_repository;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod relish;
|
||||
pub mod pgdatadir_mapping;
|
||||
pub mod reltag;
|
||||
pub mod remote_storage;
|
||||
pub mod repository;
|
||||
pub mod tenant_mgr;
|
||||
@@ -28,6 +30,20 @@ use zenith_utils::{
|
||||
|
||||
use crate::thread_mgr::ThreadKind;
|
||||
|
||||
use layered_repository::LayeredRepository;
|
||||
use pgdatadir_mapping::DatadirTimeline;
|
||||
|
||||
/// Current storage format version
|
||||
///
|
||||
/// This is embedded in the metadata file, and also in the header of all the
|
||||
/// layer files. If you make any backwards-incompatible changes to the storage
|
||||
/// format, bump this!
|
||||
pub const STORAGE_FORMAT_VERSION: u16 = 1;
|
||||
|
||||
// Magic constants used to identify different kinds of files
|
||||
pub const IMAGE_FILE_MAGIC: u32 = 0x5A60_0000 | STORAGE_FORMAT_VERSION as u32;
|
||||
pub const DELTA_FILE_MAGIC: u32 = 0x5A61_0000 | STORAGE_FORMAT_VERSION as u32;
|
||||
|
||||
lazy_static! {
|
||||
static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!(
|
||||
"pageserver_live_connections_count",
|
||||
@@ -42,14 +58,16 @@ pub const LOG_FILE_NAME: &str = "pageserver.log";
|
||||
/// Config for the Repository checkpointer
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum CheckpointConfig {
|
||||
// Flush in-memory data that is older than this
|
||||
Distance(u64),
|
||||
// Flush all in-memory data
|
||||
Flush,
|
||||
// Flush all in-memory data and reconstruct all page images
|
||||
Forced,
|
||||
}
|
||||
|
||||
pub type RepositoryImpl = LayeredRepository;
|
||||
|
||||
pub type DatadirTimelineImpl = DatadirTimeline<RepositoryImpl>;
|
||||
|
||||
pub fn shutdown_pageserver() {
|
||||
// Shut down the libpq endpoint thread. This prevents new connections from
|
||||
// being accepted.
|
||||
|
||||
@@ -53,7 +53,7 @@ use zenith_utils::{
|
||||
};
|
||||
|
||||
use crate::layered_repository::writeback_ephemeral_file;
|
||||
use crate::relish::RelTag;
|
||||
use crate::repository::Key;
|
||||
|
||||
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
|
||||
const TEST_PAGE_CACHE_SIZE: usize = 10;
|
||||
@@ -105,8 +105,7 @@ enum CacheKey {
|
||||
struct MaterializedPageHashKey {
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
rel_tag: RelTag,
|
||||
blknum: u32,
|
||||
key: Key,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -291,16 +290,14 @@ impl PageCache {
|
||||
&self,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
rel_tag: RelTag,
|
||||
blknum: u32,
|
||||
key: &Key,
|
||||
lsn: Lsn,
|
||||
) -> Option<(Lsn, PageReadGuard)> {
|
||||
let mut cache_key = CacheKey::MaterializedPage {
|
||||
hash_key: MaterializedPageHashKey {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
rel_tag,
|
||||
blknum,
|
||||
key: *key,
|
||||
},
|
||||
lsn,
|
||||
};
|
||||
@@ -323,8 +320,7 @@ impl PageCache {
|
||||
&self,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
rel_tag: RelTag,
|
||||
blknum: u32,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
img: &[u8],
|
||||
) {
|
||||
@@ -332,8 +328,7 @@ impl PageCache {
|
||||
hash_key: MaterializedPageHashKey {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
rel_tag,
|
||||
blknum,
|
||||
key,
|
||||
},
|
||||
lsn,
|
||||
};
|
||||
|
||||
@@ -32,7 +32,9 @@ use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::basebackup;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::relish::*;
|
||||
use crate::pgdatadir_mapping::DatadirTimeline;
|
||||
use crate::reltag::RelTag;
|
||||
use crate::repository::Repository;
|
||||
use crate::repository::Timeline;
|
||||
use crate::tenant_mgr;
|
||||
use crate::thread_mgr;
|
||||
@@ -398,8 +400,8 @@ impl PageServerHandler {
|
||||
/// In either case, if the page server hasn't received the WAL up to the
|
||||
/// requested LSN yet, we will wait for it to arrive. The return value is
|
||||
/// the LSN that should be used to look up the page versions.
|
||||
fn wait_or_get_last_lsn(
|
||||
timeline: &dyn Timeline,
|
||||
fn wait_or_get_last_lsn<R: Repository>(
|
||||
timeline: &DatadirTimeline<R>,
|
||||
mut lsn: Lsn,
|
||||
latest: bool,
|
||||
latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
|
||||
@@ -426,7 +428,7 @@ impl PageServerHandler {
|
||||
if lsn <= last_record_lsn {
|
||||
lsn = last_record_lsn;
|
||||
} else {
|
||||
timeline.wait_lsn(lsn)?;
|
||||
timeline.tline.wait_lsn(lsn)?;
|
||||
// Since we waited for 'lsn' to arrive, that is now the last
|
||||
// record LSN. (Or close enough for our purposes; the
|
||||
// last-record LSN can advance immediately after we return
|
||||
@@ -436,7 +438,7 @@ impl PageServerHandler {
|
||||
if lsn == Lsn(0) {
|
||||
bail!("invalid LSN(0) in request");
|
||||
}
|
||||
timeline.wait_lsn(lsn)?;
|
||||
timeline.tline.wait_lsn(lsn)?;
|
||||
}
|
||||
ensure!(
|
||||
lsn >= **latest_gc_cutoff_lsn,
|
||||
@@ -446,54 +448,47 @@ impl PageServerHandler {
|
||||
Ok(lsn)
|
||||
}
|
||||
|
||||
fn handle_get_rel_exists_request(
|
||||
fn handle_get_rel_exists_request<R: Repository>(
|
||||
&self,
|
||||
timeline: &dyn Timeline,
|
||||
timeline: &DatadirTimeline<R>,
|
||||
req: &PagestreamExistsRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered();
|
||||
|
||||
let tag = RelishTag::Relation(req.rel);
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
|
||||
|
||||
let exists = timeline.get_rel_exists(tag, lsn)?;
|
||||
let exists = timeline.get_rel_exists(req.rel, lsn)?;
|
||||
|
||||
Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
|
||||
exists,
|
||||
}))
|
||||
}
|
||||
|
||||
fn handle_get_nblocks_request(
|
||||
fn handle_get_nblocks_request<R: Repository>(
|
||||
&self,
|
||||
timeline: &dyn Timeline,
|
||||
timeline: &DatadirTimeline<R>,
|
||||
req: &PagestreamNblocksRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered();
|
||||
let tag = RelishTag::Relation(req.rel);
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
|
||||
|
||||
let n_blocks = timeline.get_relish_size(tag, lsn)?;
|
||||
|
||||
// Return 0 if relation is not found.
|
||||
// This is what postgres smgr expects.
|
||||
let n_blocks = n_blocks.unwrap_or(0);
|
||||
let n_blocks = timeline.get_rel_size(req.rel, lsn)?;
|
||||
|
||||
Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
|
||||
n_blocks,
|
||||
}))
|
||||
}
|
||||
|
||||
fn handle_get_page_at_lsn_request(
|
||||
fn handle_get_page_at_lsn_request<R: Repository>(
|
||||
&self,
|
||||
timeline: &dyn Timeline,
|
||||
timeline: &DatadirTimeline<R>,
|
||||
req: &PagestreamGetPageRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn)
|
||||
.entered();
|
||||
let tag = RelishTag::Relation(req.rel);
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
|
||||
/*
|
||||
// Add a 1s delay to some requests. The delayed causes the requests to
|
||||
@@ -503,7 +498,7 @@ impl PageServerHandler {
|
||||
std::thread::sleep(std::time::Duration::from_millis(1000));
|
||||
}
|
||||
*/
|
||||
let page = timeline.get_page_at_lsn(tag, req.blkno, lsn)?;
|
||||
let page = timeline.get_rel_page_at_lsn(req.rel, req.blkno, lsn)?;
|
||||
|
||||
Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
|
||||
page,
|
||||
@@ -523,7 +518,7 @@ impl PageServerHandler {
|
||||
// check that the timeline exists
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
|
||||
.context("Cannot load local timeline")?;
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
|
||||
if let Some(lsn) = lsn {
|
||||
timeline
|
||||
.check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn)
|
||||
@@ -701,67 +696,19 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"layer_relfiles_total"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_needed_by_branches"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_not_updated"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_needed_as_tombstone"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_removed"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_dropped"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_total"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_cutoff"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_by_branches"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_not_updated"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_needed_as_tombstone"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_removed"),
|
||||
RowDescriptor::int8_col(b"layer_nonrelfiles_dropped"),
|
||||
RowDescriptor::int8_col(b"layers_total"),
|
||||
RowDescriptor::int8_col(b"layers_needed_by_cutoff"),
|
||||
RowDescriptor::int8_col(b"layers_needed_by_branches"),
|
||||
RowDescriptor::int8_col(b"layers_not_updated"),
|
||||
RowDescriptor::int8_col(b"layers_removed"),
|
||||
RowDescriptor::int8_col(b"elapsed"),
|
||||
]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[
|
||||
Some(result.ondisk_relfiles_total.to_string().as_bytes()),
|
||||
Some(
|
||||
result
|
||||
.ondisk_relfiles_needed_by_cutoff
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(
|
||||
result
|
||||
.ondisk_relfiles_needed_by_branches
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(result.ondisk_relfiles_not_updated.to_string().as_bytes()),
|
||||
Some(
|
||||
result
|
||||
.ondisk_relfiles_needed_as_tombstone
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(result.ondisk_relfiles_removed.to_string().as_bytes()),
|
||||
Some(result.ondisk_relfiles_dropped.to_string().as_bytes()),
|
||||
Some(result.ondisk_nonrelfiles_total.to_string().as_bytes()),
|
||||
Some(
|
||||
result
|
||||
.ondisk_nonrelfiles_needed_by_cutoff
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(
|
||||
result
|
||||
.ondisk_nonrelfiles_needed_by_branches
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(result.ondisk_nonrelfiles_not_updated.to_string().as_bytes()),
|
||||
Some(
|
||||
result
|
||||
.ondisk_nonrelfiles_needed_as_tombstone
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(result.ondisk_nonrelfiles_removed.to_string().as_bytes()),
|
||||
Some(result.ondisk_nonrelfiles_dropped.to_string().as_bytes()),
|
||||
Some(result.layers_total.to_string().as_bytes()),
|
||||
Some(result.layers_needed_by_cutoff.to_string().as_bytes()),
|
||||
Some(result.layers_needed_by_branches.to_string().as_bytes()),
|
||||
Some(result.layers_not_updated.to_string().as_bytes()),
|
||||
Some(result.layers_removed.to_string().as_bytes()),
|
||||
Some(result.elapsed.as_millis().to_string().as_bytes()),
|
||||
]))?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
@@ -781,7 +728,14 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
|
||||
.context("Cannot load local timeline")?;
|
||||
|
||||
timeline.checkpoint(CheckpointConfig::Forced)?;
|
||||
timeline.tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
|
||||
// Also compact it.
|
||||
//
|
||||
// FIXME: This probably shouldn't be part of a "checkpoint" command, but a
|
||||
// separate operation. Update the tests if you change this.
|
||||
timeline.tline.compact()?;
|
||||
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else {
|
||||
|
||||
1350
pageserver/src/pgdatadir_mapping.rs
Normal file
1350
pageserver/src/pgdatadir_mapping.rs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,226 +0,0 @@
|
||||
//!
|
||||
//! Zenith stores PostgreSQL relations, and some other files, in the
|
||||
//! repository. The relations (i.e. tables and indexes) take up most
|
||||
//! of the space in a typical installation, while the other files are
|
||||
//! small. We call each relation and other file that is stored in the
|
||||
//! repository a "relish". It comes from "rel"-ish, as in "kind of a
|
||||
//! rel", because it covers relations as well as other things that are
|
||||
//! not relations, but are treated similarly for the purposes of the
|
||||
//! storage layer.
|
||||
//!
|
||||
//! This source file contains the definition of the RelishTag struct,
|
||||
//! which uniquely identifies a relish.
|
||||
//!
|
||||
//! Relishes come in two flavors: blocky and non-blocky. Relations and
|
||||
//! SLRUs are blocky, that is, they are divided into 8k blocks, and
|
||||
//! the repository tracks their size. Other relishes are non-blocky:
|
||||
//! the content of the whole relish is stored as one blob. Block
|
||||
//! number must be passed as 0 for all operations on a non-blocky
|
||||
//! relish. The one "block" that you store in a non-blocky relish can
|
||||
//! have arbitrary size, but they are expected to be small, or you
|
||||
//! will have performance issues.
|
||||
//!
|
||||
//! All relishes are versioned by LSN in the repository.
|
||||
//!
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
|
||||
use postgres_ffi::relfile_utils::forknumber_to_name;
|
||||
use postgres_ffi::{Oid, TransactionId};
|
||||
|
||||
///
|
||||
/// RelishTag identifies one relish.
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum RelishTag {
|
||||
// Relations correspond to PostgreSQL relation forks. Each
|
||||
// PostgreSQL relation fork is considered a separate relish.
|
||||
Relation(RelTag),
|
||||
|
||||
// SLRUs include pg_clog, pg_multixact/members, and
|
||||
// pg_multixact/offsets. There are other SLRUs in PostgreSQL, but
|
||||
// they don't need to be stored permanently (e.g. pg_subtrans),
|
||||
// or we do not support them in zenith yet (pg_commit_ts).
|
||||
//
|
||||
// These are currently never requested directly by the compute
|
||||
// nodes, although in principle that would be possible. However,
|
||||
// when a new compute node is created, these are included in the
|
||||
// tarball that we send to the compute node to initialize the
|
||||
// PostgreSQL data directory.
|
||||
//
|
||||
// Each SLRU segment in PostgreSQL is considered a separate
|
||||
// relish. For example, pg_clog/0000, pg_clog/0001, and so forth.
|
||||
//
|
||||
// SLRU segments are divided into blocks, like relations.
|
||||
Slru { slru: SlruKind, segno: u32 },
|
||||
|
||||
// Miscellaneous other files that need to be included in the
|
||||
// tarball at compute node creation. These are non-blocky, and are
|
||||
// expected to be small.
|
||||
|
||||
//
|
||||
// FileNodeMap represents PostgreSQL's 'pg_filenode.map'
|
||||
// files. They are needed to map catalog table OIDs to filenode
|
||||
// numbers. Usually the mapping is done by looking up a relation's
|
||||
// 'relfilenode' field in the 'pg_class' system table, but that
|
||||
// doesn't work for 'pg_class' itself and a few other such system
|
||||
// relations. See PostgreSQL relmapper.c for details.
|
||||
//
|
||||
// Each database has a map file for its local mapped catalogs,
|
||||
// and there is a separate map file for shared catalogs.
|
||||
//
|
||||
// These files are always 512 bytes long (although we don't check
|
||||
// or care about that in the page server).
|
||||
//
|
||||
FileNodeMap { spcnode: Oid, dbnode: Oid },
|
||||
|
||||
//
|
||||
// State files for prepared transactions (e.g pg_twophase/1234)
|
||||
//
|
||||
TwoPhase { xid: TransactionId },
|
||||
|
||||
// The control file, stored in global/pg_control
|
||||
ControlFile,
|
||||
|
||||
// Special entry that represents PostgreSQL checkpoint. It doesn't
|
||||
// correspond to to any physical file in PostgreSQL, but we use it
|
||||
// to track fields needed to restore the checkpoint data in the
|
||||
// control file, when a compute node is created.
|
||||
Checkpoint,
|
||||
}
|
||||
|
||||
impl RelishTag {
|
||||
pub const fn is_blocky(&self) -> bool {
|
||||
match self {
|
||||
// These relishes work with blocks
|
||||
RelishTag::Relation(_) | RelishTag::Slru { slru: _, segno: _ } => true,
|
||||
|
||||
// and these don't
|
||||
RelishTag::FileNodeMap {
|
||||
spcnode: _,
|
||||
dbnode: _,
|
||||
}
|
||||
| RelishTag::TwoPhase { xid: _ }
|
||||
| RelishTag::ControlFile
|
||||
| RelishTag::Checkpoint => false,
|
||||
}
|
||||
}
|
||||
|
||||
// Physical relishes represent files and use
|
||||
// RelationSizeEntry to track existing and dropped files.
|
||||
// They can be both blocky and non-blocky.
|
||||
pub const fn is_physical(&self) -> bool {
|
||||
match self {
|
||||
// These relishes represent physical files
|
||||
RelishTag::Relation(_)
|
||||
| RelishTag::Slru { .. }
|
||||
| RelishTag::FileNodeMap { .. }
|
||||
| RelishTag::TwoPhase { .. } => true,
|
||||
|
||||
// and these don't
|
||||
RelishTag::ControlFile | RelishTag::Checkpoint => false,
|
||||
}
|
||||
}
|
||||
|
||||
// convenience function to check if this relish is a normal relation.
|
||||
pub const fn is_relation(&self) -> bool {
|
||||
matches!(self, RelishTag::Relation(_))
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Relation data file segment id throughout the Postgres cluster.
|
||||
///
|
||||
/// Every data file in Postgres is uniquely identified by 4 numbers:
|
||||
/// - relation id / node (`relnode`)
|
||||
/// - database id (`dbnode`)
|
||||
/// - tablespace id (`spcnode`), in short this is a unique id of a separate
|
||||
/// directory to store data files.
|
||||
/// - forknumber (`forknum`) is used to split different kinds of data of the same relation
|
||||
/// between some set of files (`relnode`, `relnode_fsm`, `relnode_vm`).
|
||||
///
|
||||
/// In native Postgres code `RelFileNode` structure and individual `ForkNumber` value
|
||||
/// are used for the same purpose.
|
||||
/// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
|
||||
///
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Hash, Ord, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct RelTag {
|
||||
pub forknum: u8,
|
||||
pub spcnode: Oid,
|
||||
pub dbnode: Oid,
|
||||
pub relnode: Oid,
|
||||
}
|
||||
|
||||
/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
|
||||
///
|
||||
/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
|
||||
///
|
||||
impl fmt::Display for RelTag {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if let Some(forkname) = forknumber_to_name(self.forknum) {
|
||||
write!(
|
||||
f,
|
||||
"{}/{}/{}_{}",
|
||||
self.spcnode, self.dbnode, self.relnode, forkname
|
||||
)
|
||||
} else {
|
||||
write!(f, "{}/{}/{}", self.spcnode, self.dbnode, self.relnode)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
|
||||
///
|
||||
/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
|
||||
///
|
||||
impl fmt::Display for RelishTag {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
RelishTag::Relation(rel) => rel.fmt(f),
|
||||
RelishTag::Slru { slru, segno } => {
|
||||
// e.g. pg_clog/0001
|
||||
write!(f, "{}/{:04X}", slru.to_str(), segno)
|
||||
}
|
||||
RelishTag::FileNodeMap { spcnode, dbnode } => {
|
||||
write!(f, "relmapper file for spc {} db {}", spcnode, dbnode)
|
||||
}
|
||||
RelishTag::TwoPhase { xid } => {
|
||||
write!(f, "pg_twophase/{:08X}", xid)
|
||||
}
|
||||
RelishTag::ControlFile => {
|
||||
write!(f, "control file")
|
||||
}
|
||||
RelishTag::Checkpoint => {
|
||||
write!(f, "checkpoint")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Non-relation transaction status files (clog (a.k.a. pg_xact) and
|
||||
/// pg_multixact) in Postgres are handled by SLRU (Simple LRU) buffer,
|
||||
/// hence the name.
|
||||
///
|
||||
/// These files are global for a postgres instance.
|
||||
///
|
||||
/// These files are divided into segments, which are divided into
|
||||
/// pages of the same BLCKSZ as used for relation files.
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum SlruKind {
|
||||
Clog,
|
||||
MultiXactMembers,
|
||||
MultiXactOffsets,
|
||||
}
|
||||
|
||||
impl SlruKind {
|
||||
pub fn to_str(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Clog => "pg_xact",
|
||||
Self::MultiXactMembers => "pg_multixact/members",
|
||||
Self::MultiXactOffsets => "pg_multixact/offsets",
|
||||
}
|
||||
}
|
||||
}
|
||||
105
pageserver/src/reltag.rs
Normal file
105
pageserver/src/reltag.rs
Normal file
@@ -0,0 +1,105 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::cmp::Ordering;
|
||||
use std::fmt;
|
||||
|
||||
use postgres_ffi::relfile_utils::forknumber_to_name;
|
||||
use postgres_ffi::Oid;
|
||||
|
||||
///
|
||||
/// Relation data file segment id throughout the Postgres cluster.
|
||||
///
|
||||
/// Every data file in Postgres is uniquely identified by 4 numbers:
|
||||
/// - relation id / node (`relnode`)
|
||||
/// - database id (`dbnode`)
|
||||
/// - tablespace id (`spcnode`), in short this is a unique id of a separate
|
||||
/// directory to store data files.
|
||||
/// - forknumber (`forknum`) is used to split different kinds of data of the same relation
|
||||
/// between some set of files (`relnode`, `relnode_fsm`, `relnode_vm`).
|
||||
///
|
||||
/// In native Postgres code `RelFileNode` structure and individual `ForkNumber` value
|
||||
/// are used for the same purpose.
|
||||
/// [See more related comments here](https:///github.com/postgres/postgres/blob/99c5852e20a0987eca1c38ba0c09329d4076b6a0/src/include/storage/relfilenode.h#L57).
|
||||
///
|
||||
// FIXME: should move 'forknum' as last field to keep this consistent with Postgres.
|
||||
// Then we could replace the custo Ord and PartialOrd implementations below with
|
||||
// deriving them.
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct RelTag {
|
||||
pub forknum: u8,
|
||||
pub spcnode: Oid,
|
||||
pub dbnode: Oid,
|
||||
pub relnode: Oid,
|
||||
}
|
||||
|
||||
impl PartialOrd for RelTag {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for RelTag {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
let mut cmp;
|
||||
|
||||
cmp = self.spcnode.cmp(&other.spcnode);
|
||||
if cmp != Ordering::Equal {
|
||||
return cmp;
|
||||
}
|
||||
cmp = self.dbnode.cmp(&other.dbnode);
|
||||
if cmp != Ordering::Equal {
|
||||
return cmp;
|
||||
}
|
||||
cmp = self.relnode.cmp(&other.relnode);
|
||||
if cmp != Ordering::Equal {
|
||||
return cmp;
|
||||
}
|
||||
cmp = self.forknum.cmp(&other.forknum);
|
||||
|
||||
cmp
|
||||
}
|
||||
}
|
||||
|
||||
/// Display RelTag in the same format that's used in most PostgreSQL debug messages:
|
||||
///
|
||||
/// <spcnode>/<dbnode>/<relnode>[_fsm|_vm|_init]
|
||||
///
|
||||
impl fmt::Display for RelTag {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if let Some(forkname) = forknumber_to_name(self.forknum) {
|
||||
write!(
|
||||
f,
|
||||
"{}/{}/{}_{}",
|
||||
self.spcnode, self.dbnode, self.relnode, forkname
|
||||
)
|
||||
} else {
|
||||
write!(f, "{}/{}/{}", self.spcnode, self.dbnode, self.relnode)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Non-relation transaction status files (clog (a.k.a. pg_xact) and
|
||||
/// pg_multixact) in Postgres are handled by SLRU (Simple LRU) buffer,
|
||||
/// hence the name.
|
||||
///
|
||||
/// These files are global for a postgres instance.
|
||||
///
|
||||
/// These files are divided into segments, which are divided into
|
||||
/// pages of the same BLCKSZ as used for relation files.
|
||||
///
|
||||
#[derive(Debug, Clone, Copy, Hash, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub enum SlruKind {
|
||||
Clog,
|
||||
MultiXactMembers,
|
||||
MultiXactOffsets,
|
||||
}
|
||||
|
||||
impl SlruKind {
|
||||
pub fn to_str(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Clog => "pg_xact",
|
||||
Self::MultiXactMembers => "pg_multixact/members",
|
||||
Self::MultiXactOffsets => "pg_multixact/offsets",
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -17,7 +17,7 @@ This way, the backups are managed in background, not affecting directly other pa
|
||||
Current implementation
|
||||
* provides remote storage wrappers for AWS S3 and local FS
|
||||
* synchronizes the differences with local timelines and remote states as fast as possible
|
||||
* uploads new relishes, frozen by pageserver checkpoint thread
|
||||
* uploads new layer files
|
||||
* downloads and registers timelines, found on the remote storage, but missing locally, if those are requested somehow via pageserver (e.g. http api, gc)
|
||||
* uses compression when deals with files, for better S3 usage
|
||||
* maintains an index of what's stored remotely
|
||||
|
||||
@@ -662,7 +662,7 @@ mod fs_tests {
|
||||
}
|
||||
|
||||
async fn upload_dummy_file(
|
||||
harness: &RepoHarness,
|
||||
harness: &RepoHarness<'_>,
|
||||
storage: &LocalFs,
|
||||
name: &str,
|
||||
) -> anyhow::Result<PathBuf> {
|
||||
|
||||
@@ -27,7 +27,7 @@
|
||||
//! it may schedule the download on such occasions.
|
||||
//! Then, the index is shared across pageserver under [`RemoteIndex`] guard to ensure proper synchronization.
|
||||
//!
|
||||
//! The synchronization unit is an archive: a set of timeline files (or relishes) and a special metadata file, all compressed into a blob.
|
||||
//! The synchronization unit is an archive: a set of layer files and a special metadata file, all compressed into a blob.
|
||||
//! Currently, there's no way to process an archive partially, if the archive processing fails, it has to be started from zero next time again.
|
||||
//! An archive contains set of files of a certain timeline, added during checkpoint(s) and the timeline metadata at that moment.
|
||||
//! The archive contains that metadata's `disk_consistent_lsn` in its name, to be able to restore partial index information from just a remote storage file list.
|
||||
@@ -281,7 +281,7 @@ impl SyncKind {
|
||||
/// Current checkpoint design assumes new files are added only, no deletions or amendment happens.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct NewCheckpoint {
|
||||
/// Relish file paths in the pageserver workdir, that were added for the corresponding checkpoint.
|
||||
/// layer file paths in the pageserver workdir, that were added for the corresponding checkpoint.
|
||||
layers: Vec<PathBuf>,
|
||||
metadata: TimelineMetadata,
|
||||
}
|
||||
@@ -854,7 +854,7 @@ mod test_utils {
|
||||
|
||||
#[track_caller]
|
||||
pub async fn ensure_correct_timeline_upload(
|
||||
harness: &RepoHarness,
|
||||
harness: &RepoHarness<'_>,
|
||||
remote_assets: Arc<(LocalFs, RemoteIndex)>,
|
||||
timeline_id: ZTimelineId,
|
||||
new_upload: NewCheckpoint,
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
//! Archiving is almost agnostic to timeline file types, with an exception of the metadata file, that's currently distinguished in the [un]compression code.
|
||||
//! The metadata file is treated separately when [de]compression is involved, to reduce the risk of corrupting the metadata file.
|
||||
//! When compressed, the metadata file is always required and stored as the last file in the archive stream.
|
||||
//! When uncompressed, the metadata file gets naturally uncompressed last, to ensure that all other relishes are decompressed successfully first.
|
||||
//! When uncompressed, the metadata file gets naturally uncompressed last, to ensure that all other layer files are decompressed successfully first.
|
||||
//!
|
||||
//! Archive structure:
|
||||
//! +----------------------------------------+
|
||||
|
||||
@@ -277,7 +277,7 @@ impl RemoteTimeline {
|
||||
.map(CheckpointArchive::disk_consistent_lsn)
|
||||
}
|
||||
|
||||
/// Lists all relish files in the given remote timeline. Omits the metadata file.
|
||||
/// Lists all layer files in the given remote timeline. Omits the metadata file.
|
||||
pub fn stored_files(&self, timeline_dir: &Path) -> BTreeSet<PathBuf> {
|
||||
self.timeline_files
|
||||
.values()
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -4,13 +4,13 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::layered_repository::LayeredRepository;
|
||||
use crate::remote_storage::RemoteIndex;
|
||||
use crate::repository::{Repository, Timeline, TimelineSyncStatusUpdate};
|
||||
use crate::repository::{Repository, TimelineSyncStatusUpdate};
|
||||
use crate::thread_mgr;
|
||||
use crate::thread_mgr::ThreadKind;
|
||||
use crate::timelines;
|
||||
use crate::timelines::CreateRepo;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::{DatadirTimelineImpl, RepositoryImpl};
|
||||
use anyhow::{Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
@@ -28,7 +28,9 @@ lazy_static! {
|
||||
|
||||
struct Tenant {
|
||||
state: TenantState,
|
||||
repo: Arc<dyn Repository>,
|
||||
repo: Arc<RepositoryImpl>,
|
||||
|
||||
timelines: HashMap<ZTimelineId, Arc<DatadirTimelineImpl>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
|
||||
@@ -67,14 +69,14 @@ pub fn load_local_repo(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
remote_index: &RemoteIndex,
|
||||
) -> Arc<dyn Repository> {
|
||||
) -> Arc<RepositoryImpl> {
|
||||
let mut m = access_tenants();
|
||||
let tenant = m.entry(tenant_id).or_insert_with(|| {
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
|
||||
|
||||
// Set up an object repository, for actual data storage.
|
||||
let repo: Arc<dyn Repository> = Arc::new(LayeredRepository::new(
|
||||
let repo: Arc<LayeredRepository> = Arc::new(LayeredRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenant_id,
|
||||
@@ -84,6 +86,7 @@ pub fn load_local_repo(
|
||||
Tenant {
|
||||
state: TenantState::Idle,
|
||||
repo,
|
||||
timelines: HashMap::new(),
|
||||
}
|
||||
});
|
||||
Arc::clone(&tenant.repo)
|
||||
@@ -138,7 +141,7 @@ pub fn shutdown_all_tenants() {
|
||||
|
||||
thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiver), None, None);
|
||||
thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), None, None);
|
||||
thread_mgr::shutdown_threads(Some(ThreadKind::Checkpointer), None, None);
|
||||
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), None, None);
|
||||
|
||||
// Ok, no background threads running anymore. Flush any remaining data in
|
||||
// memory to disk.
|
||||
@@ -152,7 +155,7 @@ pub fn shutdown_all_tenants() {
|
||||
debug!("shutdown tenant {}", tenantid);
|
||||
match get_repository_for_tenant(tenantid) {
|
||||
Ok(repo) => {
|
||||
if let Err(err) = repo.checkpoint_iteration(CheckpointConfig::Flush) {
|
||||
if let Err(err) = repo.checkpoint() {
|
||||
error!(
|
||||
"Could not checkpoint tenant {} during shutdown: {:?}",
|
||||
tenantid, err
|
||||
@@ -192,6 +195,7 @@ pub fn create_tenant_repository(
|
||||
v.insert(Tenant {
|
||||
state: TenantState::Idle,
|
||||
repo,
|
||||
timelines: HashMap::new(),
|
||||
});
|
||||
Ok(Some(tenantid))
|
||||
}
|
||||
@@ -203,7 +207,7 @@ pub fn get_tenant_state(tenantid: ZTenantId) -> Option<TenantState> {
|
||||
}
|
||||
|
||||
///
|
||||
/// Change the state of a tenant to Active and launch its checkpointer and GC
|
||||
/// Change the state of a tenant to Active and launch its compactor and GC
|
||||
/// threads. If the tenant was already in Active state or Stopping, does nothing.
|
||||
///
|
||||
pub fn activate_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> Result<()> {
|
||||
@@ -218,15 +222,15 @@ pub fn activate_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> R
|
||||
// If the tenant is already active, nothing to do.
|
||||
TenantState::Active => {}
|
||||
|
||||
// If it's Idle, launch the checkpointer and GC threads
|
||||
// If it's Idle, launch the compactor and GC threads
|
||||
TenantState::Idle => {
|
||||
thread_mgr::spawn(
|
||||
ThreadKind::Checkpointer,
|
||||
ThreadKind::Compactor,
|
||||
Some(tenant_id),
|
||||
None,
|
||||
"Checkpointer thread",
|
||||
"Compactor thread",
|
||||
true,
|
||||
move || crate::tenant_threads::checkpoint_loop(tenant_id, conf),
|
||||
move || crate::tenant_threads::compact_loop(tenant_id, conf),
|
||||
)?;
|
||||
|
||||
let gc_spawn_result = thread_mgr::spawn(
|
||||
@@ -244,7 +248,7 @@ pub fn activate_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> R
|
||||
"Failed to start GC thread for tenant {}, stopping its checkpointer thread: {:?}",
|
||||
tenant_id, e
|
||||
);
|
||||
thread_mgr::shutdown_threads(Some(ThreadKind::Checkpointer), Some(tenant_id), None);
|
||||
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None);
|
||||
return gc_spawn_result;
|
||||
}
|
||||
|
||||
@@ -258,7 +262,7 @@ pub fn activate_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> R
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Repository>> {
|
||||
pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<RepositoryImpl>> {
|
||||
let m = access_tenants();
|
||||
let tenant = m
|
||||
.get(&tenantid)
|
||||
@@ -271,10 +275,27 @@ pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Reposito
|
||||
pub fn get_timeline_for_tenant_load(
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
) -> Result<Arc<dyn Timeline>> {
|
||||
get_repository_for_tenant(tenantid)?
|
||||
) -> Result<Arc<DatadirTimelineImpl>> {
|
||||
let mut m = access_tenants();
|
||||
let tenant = m
|
||||
.get_mut(&tenantid)
|
||||
.with_context(|| format!("Tenant {} not found", tenantid))?;
|
||||
|
||||
if let Some(page_tline) = tenant.timelines.get(&timelineid) {
|
||||
return Ok(Arc::clone(page_tline));
|
||||
}
|
||||
// First access to this timeline. Create a DatadirTimeline wrapper for it
|
||||
let tline = tenant
|
||||
.repo
|
||||
.get_timeline_load(timelineid)
|
||||
.with_context(|| format!("Timeline {} not found for tenant {}", timelineid, tenantid))
|
||||
.with_context(|| format!("Timeline {} not found for tenant {}", timelineid, tenantid))?;
|
||||
|
||||
let repartition_distance = tenant.repo.conf.checkpoint_distance / 10;
|
||||
|
||||
let page_tline = Arc::new(DatadirTimelineImpl::new(tline, repartition_distance));
|
||||
page_tline.init_logical_size()?;
|
||||
tenant.timelines.insert(timelineid, Arc::clone(&page_tline));
|
||||
Ok(page_tline)
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
|
||||
@@ -1,34 +1,42 @@
|
||||
//! This module contains functions to serve per-tenant background processes,
|
||||
//! such as checkpointer and GC
|
||||
//! such as compaction and GC
|
||||
use crate::config::PageServerConf;
|
||||
use crate::repository::Repository;
|
||||
use crate::tenant_mgr;
|
||||
use crate::tenant_mgr::TenantState;
|
||||
use crate::CheckpointConfig;
|
||||
use anyhow::Result;
|
||||
use std::time::Duration;
|
||||
use tracing::*;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
///
|
||||
/// Checkpointer thread's main loop
|
||||
/// Compaction thread's main loop
|
||||
///
|
||||
pub fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
|
||||
pub fn compact_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
|
||||
if let Err(err) = compact_loop_ext(tenantid, conf) {
|
||||
error!("compact loop terminated with error: {:?}", err);
|
||||
Err(err)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn compact_loop_ext(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
|
||||
loop {
|
||||
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
|
||||
break;
|
||||
}
|
||||
|
||||
std::thread::sleep(conf.checkpoint_period);
|
||||
trace!("checkpointer thread for tenant {} waking up", tenantid);
|
||||
std::thread::sleep(conf.compaction_period);
|
||||
trace!("compaction thread for tenant {} waking up", tenantid);
|
||||
|
||||
// checkpoint timelines that have accumulated more than CHECKPOINT_DISTANCE
|
||||
// bytes of WAL since last checkpoint.
|
||||
// Compact timelines
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
repo.checkpoint_iteration(CheckpointConfig::Distance(conf.checkpoint_distance))?;
|
||||
repo.compaction_iteration()?;
|
||||
}
|
||||
|
||||
trace!(
|
||||
"checkpointer thread stopped for tenant {} state is {:?}",
|
||||
"compaction thread stopped for tenant {} state is {:?}",
|
||||
tenantid,
|
||||
tenant_mgr::get_tenant_state(tenantid)
|
||||
);
|
||||
|
||||
@@ -94,13 +94,16 @@ pub enum ThreadKind {
|
||||
// Thread that connects to a safekeeper to fetch WAL for one timeline.
|
||||
WalReceiver,
|
||||
|
||||
// Thread that handles checkpointing of all timelines for a tenant.
|
||||
Checkpointer,
|
||||
// Thread that handles compaction of all timelines for a tenant.
|
||||
Compactor,
|
||||
|
||||
// Thread that handles GC of a tenant
|
||||
GarbageCollector,
|
||||
|
||||
// Thread for synchronizing pageserver relish data with the remote storage.
|
||||
// Thread that flushes frozen in-memory layers to disk
|
||||
LayerFlushThread,
|
||||
|
||||
// Thread for synchronizing pageserver layer files with the remote storage.
|
||||
// Shared by all tenants.
|
||||
StorageSync,
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@ use crate::{
|
||||
layered_repository::metadata::TimelineMetadata,
|
||||
remote_storage::RemoteIndex,
|
||||
repository::{LocalTimelineState, Repository},
|
||||
DatadirTimeline, RepositoryImpl,
|
||||
};
|
||||
use crate::{import_datadir, LOG_FILE_NAME};
|
||||
use crate::{layered_repository::LayeredRepository, walredo::WalRedoManager};
|
||||
@@ -48,26 +49,26 @@ pub struct LocalTimelineInfo {
|
||||
}
|
||||
|
||||
impl LocalTimelineInfo {
|
||||
pub fn from_loaded_timeline(
|
||||
timeline: &dyn Timeline,
|
||||
pub fn from_loaded_timeline<R: Repository>(
|
||||
datadir_tline: &DatadirTimeline<R>,
|
||||
include_non_incremental_logical_size: bool,
|
||||
) -> anyhow::Result<Self> {
|
||||
let last_record_lsn = timeline.get_last_record_lsn();
|
||||
let last_record_lsn = datadir_tline.tline.get_last_record_lsn();
|
||||
let info = LocalTimelineInfo {
|
||||
ancestor_timeline_id: timeline.get_ancestor_timeline_id(),
|
||||
ancestor_timeline_id: datadir_tline.tline.get_ancestor_timeline_id(),
|
||||
ancestor_lsn: {
|
||||
match timeline.get_ancestor_lsn() {
|
||||
match datadir_tline.tline.get_ancestor_lsn() {
|
||||
Lsn(0) => None,
|
||||
lsn @ Lsn(_) => Some(lsn),
|
||||
}
|
||||
},
|
||||
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
|
||||
disk_consistent_lsn: datadir_tline.tline.get_disk_consistent_lsn(),
|
||||
last_record_lsn,
|
||||
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
|
||||
prev_record_lsn: Some(datadir_tline.tline.get_prev_record_lsn()),
|
||||
timeline_state: LocalTimelineState::Loaded,
|
||||
current_logical_size: Some(timeline.get_current_logical_size()),
|
||||
current_logical_size: Some(datadir_tline.get_current_logical_size()),
|
||||
current_logical_size_non_incremental: if include_non_incremental_logical_size {
|
||||
Some(timeline.get_current_logical_size_non_incremental(last_record_lsn)?)
|
||||
Some(datadir_tline.get_current_logical_size_non_incremental(last_record_lsn)?)
|
||||
} else {
|
||||
None
|
||||
},
|
||||
@@ -93,17 +94,19 @@ impl LocalTimelineInfo {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_repo_timeline(
|
||||
repo_timeline: RepositoryTimeline,
|
||||
pub fn from_repo_timeline<T>(
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
repo_timeline: &RepositoryTimeline<T>,
|
||||
include_non_incremental_logical_size: bool,
|
||||
) -> anyhow::Result<Self> {
|
||||
match repo_timeline {
|
||||
RepositoryTimeline::Loaded(timeline) => {
|
||||
Self::from_loaded_timeline(timeline.as_ref(), include_non_incremental_logical_size)
|
||||
}
|
||||
RepositoryTimeline::Unloaded { metadata } => {
|
||||
Ok(Self::from_unloaded_timeline(&metadata))
|
||||
RepositoryTimeline::Loaded(_) => {
|
||||
let datadir_tline =
|
||||
tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id)?;
|
||||
Self::from_loaded_timeline(&datadir_tline, include_non_incremental_logical_size)
|
||||
}
|
||||
RepositoryTimeline::Unloaded { metadata } => Ok(Self::from_unloaded_timeline(metadata)),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -172,7 +175,7 @@ pub fn create_repo(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
create_repo: CreateRepo,
|
||||
) -> Result<Arc<dyn Repository>> {
|
||||
) -> Result<Arc<RepositoryImpl>> {
|
||||
let (wal_redo_manager, remote_index) = match create_repo {
|
||||
CreateRepo::Real {
|
||||
wal_redo_manager,
|
||||
@@ -260,12 +263,12 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
|
||||
// - run initdb to init temporary instance and get bootstrap data
|
||||
// - after initialization complete, remove the temp dir.
|
||||
//
|
||||
fn bootstrap_timeline(
|
||||
fn bootstrap_timeline<R: Repository>(
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
tli: ZTimelineId,
|
||||
repo: &dyn Repository,
|
||||
) -> Result<Arc<dyn Timeline>> {
|
||||
repo: &R,
|
||||
) -> Result<()> {
|
||||
let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
|
||||
|
||||
let initdb_path = conf.tenant_path(&tenantid).join("tmp");
|
||||
@@ -281,23 +284,20 @@ fn bootstrap_timeline(
|
||||
// Initdb lsn will be equal to last_record_lsn which will be set after import.
|
||||
// Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline.
|
||||
let timeline = repo.create_empty_timeline(tli, lsn)?;
|
||||
import_datadir::import_timeline_from_postgres_datadir(
|
||||
&pgdata_path,
|
||||
timeline.writer().as_ref(),
|
||||
lsn,
|
||||
)?;
|
||||
timeline.checkpoint(CheckpointConfig::Forced)?;
|
||||
let mut page_tline: DatadirTimeline<R> = DatadirTimeline::new(timeline, u64::MAX);
|
||||
import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &mut page_tline, lsn)?;
|
||||
page_tline.tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
|
||||
println!(
|
||||
"created initial timeline {} timeline.lsn {}",
|
||||
tli,
|
||||
timeline.get_last_record_lsn()
|
||||
page_tline.tline.get_last_record_lsn()
|
||||
);
|
||||
|
||||
// Remove temp dir. We don't need it anymore
|
||||
fs::remove_dir_all(pgdata_path)?;
|
||||
|
||||
Ok(timeline)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn get_local_timelines(
|
||||
@@ -313,7 +313,9 @@ pub(crate) fn get_local_timelines(
|
||||
local_timeline_info.push((
|
||||
timeline_id,
|
||||
LocalTimelineInfo::from_repo_timeline(
|
||||
repository_timeline,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
&repository_timeline,
|
||||
include_non_incremental_logical_size,
|
||||
)?,
|
||||
))
|
||||
@@ -372,13 +374,17 @@ pub(crate) fn create_timeline(
|
||||
}
|
||||
repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?;
|
||||
// load the timeline into memory
|
||||
let loaded_timeline = repo.get_timeline_load(new_timeline_id)?;
|
||||
LocalTimelineInfo::from_loaded_timeline(loaded_timeline.as_ref(), false)
|
||||
let loaded_timeline =
|
||||
tenant_mgr::get_timeline_for_tenant_load(tenant_id, new_timeline_id)?;
|
||||
LocalTimelineInfo::from_loaded_timeline(&loaded_timeline, false)
|
||||
.context("cannot fill timeline info")?
|
||||
}
|
||||
None => {
|
||||
let new_timeline = bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?;
|
||||
LocalTimelineInfo::from_loaded_timeline(new_timeline.as_ref(), false)
|
||||
bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?;
|
||||
// load the timeline into memory
|
||||
let new_timeline =
|
||||
tenant_mgr::get_timeline_for_tenant_load(tenant_id, new_timeline_id)?;
|
||||
LocalTimelineInfo::from_loaded_timeline(&new_timeline, false)
|
||||
.context("cannot fill timeline info")?
|
||||
}
|
||||
};
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -6,6 +6,7 @@
|
||||
//! We keep one WAL receiver active per timeline.
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::repository::{Repository, Timeline};
|
||||
use crate::tenant_mgr;
|
||||
use crate::thread_mgr;
|
||||
use crate::thread_mgr::ThreadKind;
|
||||
@@ -182,13 +183,13 @@ fn walreceiver_main(
|
||||
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)
|
||||
.with_context(|| format!("no repository found for tenant {}", tenant_id))?;
|
||||
let timeline = repo.get_timeline_load(timeline_id).with_context(|| {
|
||||
format!(
|
||||
"local timeline {} not found for tenant {}",
|
||||
timeline_id, tenant_id
|
||||
)
|
||||
})?;
|
||||
|
||||
let timeline =
|
||||
tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id).with_context(|| {
|
||||
format!(
|
||||
"local timeline {} not found for tenant {}",
|
||||
timeline_id, tenant_id
|
||||
)
|
||||
})?;
|
||||
let remote_index = repo.get_remote_index();
|
||||
|
||||
//
|
||||
@@ -251,11 +252,10 @@ fn walreceiver_main(
|
||||
|
||||
// It is important to deal with the aligned records as lsn in getPage@LSN is
|
||||
// aligned and can be several bytes bigger. Without this alignment we are
|
||||
// at risk of hittind a deadlock.
|
||||
// at risk of hitting a deadlock.
|
||||
anyhow::ensure!(lsn.is_aligned());
|
||||
|
||||
let writer = timeline.writer();
|
||||
walingest.ingest_record(writer.as_ref(), recdata, lsn)?;
|
||||
walingest.ingest_record(&timeline, recdata, lsn)?;
|
||||
|
||||
fail_point!("walreceiver-after-ingest");
|
||||
|
||||
@@ -267,6 +267,8 @@ fn walreceiver_main(
|
||||
caught_up = true;
|
||||
}
|
||||
|
||||
timeline.tline.check_checkpoint_distance()?;
|
||||
|
||||
Some(endlsn)
|
||||
}
|
||||
|
||||
@@ -310,7 +312,7 @@ fn walreceiver_main(
|
||||
// The last LSN we processed. It is not guaranteed to survive pageserver crash.
|
||||
let write_lsn = u64::from(last_lsn);
|
||||
// `disk_consistent_lsn` is the LSN at which page server guarantees local persistence of all received data
|
||||
let flush_lsn = u64::from(timeline.get_disk_consistent_lsn());
|
||||
let flush_lsn = u64::from(timeline.tline.get_disk_consistent_lsn());
|
||||
// The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash
|
||||
// Used by safekeepers to remove WAL preceding `remote_consistent_lsn`.
|
||||
let apply_lsn = u64::from(timeline_remote_consistent_lsn);
|
||||
|
||||
@@ -10,7 +10,47 @@ use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, Transacti
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::*;
|
||||
|
||||
use crate::repository::ZenithWalRecord;
|
||||
/// Each update to a page is represented by a ZenithWalRecord. It can be a wrapper
|
||||
/// around a PostgreSQL WAL record, or a custom zenith-specific "record".
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub enum ZenithWalRecord {
|
||||
/// Native PostgreSQL WAL record
|
||||
Postgres { will_init: bool, rec: Bytes },
|
||||
|
||||
/// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
|
||||
ClearVisibilityMapFlags {
|
||||
new_heap_blkno: Option<u32>,
|
||||
old_heap_blkno: Option<u32>,
|
||||
flags: u8,
|
||||
},
|
||||
/// Mark transaction IDs as committed on a CLOG page
|
||||
ClogSetCommitted { xids: Vec<TransactionId> },
|
||||
/// Mark transaction IDs as aborted on a CLOG page
|
||||
ClogSetAborted { xids: Vec<TransactionId> },
|
||||
/// Extend multixact offsets SLRU
|
||||
MultixactOffsetCreate {
|
||||
mid: MultiXactId,
|
||||
moff: MultiXactOffset,
|
||||
},
|
||||
/// Extend multixact members SLRU.
|
||||
MultixactMembersCreate {
|
||||
moff: MultiXactOffset,
|
||||
members: Vec<MultiXactMember>,
|
||||
},
|
||||
}
|
||||
|
||||
impl ZenithWalRecord {
|
||||
/// Does replaying this WAL record initialize the page from scratch, or does
|
||||
/// it need to be applied over the previous image of the page?
|
||||
pub fn will_init(&self) -> bool {
|
||||
match self {
|
||||
ZenithWalRecord::Postgres { will_init, rec: _ } => *will_init,
|
||||
|
||||
// None of the special zenith record types currently initialize the page
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// DecodedBkpBlock represents per-page data contained in a WAL record.
|
||||
#[derive(Default)]
|
||||
@@ -87,6 +127,28 @@ impl XlRelmapUpdate {
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlSmgrCreate {
|
||||
pub rnode: RelFileNode,
|
||||
// FIXME: This is ForkNumber in storage_xlog.h. That's an enum. Does it have
|
||||
// well-defined size?
|
||||
pub forknum: u8,
|
||||
}
|
||||
|
||||
impl XlSmgrCreate {
|
||||
pub fn decode(buf: &mut Bytes) -> XlSmgrCreate {
|
||||
XlSmgrCreate {
|
||||
rnode: RelFileNode {
|
||||
spcnode: buf.get_u32_le(), /* tablespace */
|
||||
dbnode: buf.get_u32_le(), /* database */
|
||||
relnode: buf.get_u32_le(), /* relation */
|
||||
},
|
||||
forknum: buf.get_u32_le() as u8,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[repr(C)]
|
||||
#[derive(Debug)]
|
||||
pub struct XlSmgrTruncate {
|
||||
|
||||
@@ -42,8 +42,10 @@ use zenith_utils::nonblock::set_nonblock;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::relish::*;
|
||||
use crate::repository::ZenithWalRecord;
|
||||
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::repository::Key;
|
||||
use crate::walrecord::ZenithWalRecord;
|
||||
use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_bitshift;
|
||||
use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_offset;
|
||||
use postgres_ffi::nonrelfile_utils::mx_offset_to_member_offset;
|
||||
@@ -75,8 +77,7 @@ pub trait WalRedoManager: Send + Sync {
|
||||
/// the reords.
|
||||
fn request_redo(
|
||||
&self,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, ZenithWalRecord)>,
|
||||
@@ -92,8 +93,7 @@ pub struct DummyRedoManager {}
|
||||
impl crate::walredo::WalRedoManager for DummyRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
_rel: RelishTag,
|
||||
_blknum: u32,
|
||||
_key: Key,
|
||||
_lsn: Lsn,
|
||||
_base_img: Option<Bytes>,
|
||||
_records: Vec<(Lsn, ZenithWalRecord)>,
|
||||
@@ -152,28 +152,6 @@ fn can_apply_in_zenith(rec: &ZenithWalRecord) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
fn check_forknum(rel: &RelishTag, expected_forknum: u8) -> bool {
|
||||
if let RelishTag::Relation(RelTag {
|
||||
forknum,
|
||||
spcnode: _,
|
||||
dbnode: _,
|
||||
relnode: _,
|
||||
}) = rel
|
||||
{
|
||||
*forknum == expected_forknum
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn check_slru_segno(rel: &RelishTag, expected_slru: SlruKind, expected_segno: u32) -> bool {
|
||||
if let RelishTag::Slru { slru, segno } = rel {
|
||||
*slru == expected_slru && *segno == expected_segno
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// An error happened in WAL redo
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum WalRedoError {
|
||||
@@ -184,6 +162,8 @@ pub enum WalRedoError {
|
||||
InvalidState,
|
||||
#[error("cannot perform WAL redo for this request")]
|
||||
InvalidRequest,
|
||||
#[error("cannot perform WAL redo for this record")]
|
||||
InvalidRecord,
|
||||
}
|
||||
|
||||
///
|
||||
@@ -198,8 +178,7 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
///
|
||||
fn request_redo(
|
||||
&self,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, ZenithWalRecord)>,
|
||||
@@ -217,11 +196,10 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
|
||||
if rec_zenith != batch_zenith {
|
||||
let result = if batch_zenith {
|
||||
self.apply_batch_zenith(rel, blknum, lsn, img, &records[batch_start..i])
|
||||
self.apply_batch_zenith(key, lsn, img, &records[batch_start..i])
|
||||
} else {
|
||||
self.apply_batch_postgres(
|
||||
rel,
|
||||
blknum,
|
||||
key,
|
||||
lsn,
|
||||
img,
|
||||
&records[batch_start..i],
|
||||
@@ -236,11 +214,10 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
}
|
||||
// last batch
|
||||
if batch_zenith {
|
||||
self.apply_batch_zenith(rel, blknum, lsn, img, &records[batch_start..])
|
||||
self.apply_batch_zenith(key, lsn, img, &records[batch_start..])
|
||||
} else {
|
||||
self.apply_batch_postgres(
|
||||
rel,
|
||||
blknum,
|
||||
key,
|
||||
lsn,
|
||||
img,
|
||||
&records[batch_start..],
|
||||
@@ -268,16 +245,15 @@ impl PostgresRedoManager {
|
||||
///
|
||||
fn apply_batch_postgres(
|
||||
&self,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: &[(Lsn, ZenithWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let start_time = Instant::now();
|
||||
let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?;
|
||||
|
||||
let apply_result: Result<Bytes, Error>;
|
||||
let start_time = Instant::now();
|
||||
|
||||
let mut process_guard = self.process.lock().unwrap();
|
||||
let lock_time = Instant::now();
|
||||
@@ -291,16 +267,11 @@ impl PostgresRedoManager {
|
||||
|
||||
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
|
||||
|
||||
let result = if let RelishTag::Relation(rel) = rel {
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
let buf_tag = BufferTag { rel, blknum };
|
||||
apply_result = process.apply_wal_records(buf_tag, base_img, records, wal_redo_timeout);
|
||||
|
||||
apply_result.map_err(WalRedoError::IoError)
|
||||
} else {
|
||||
error!("unexpected non-relation relish: {:?}", rel);
|
||||
Err(WalRedoError::InvalidRequest)
|
||||
};
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
let buf_tag = BufferTag { rel, blknum };
|
||||
let result = process
|
||||
.apply_wal_records(buf_tag, base_img, records, wal_redo_timeout)
|
||||
.map_err(WalRedoError::IoError);
|
||||
|
||||
let end_time = Instant::now();
|
||||
let duration = end_time.duration_since(lock_time);
|
||||
@@ -326,8 +297,7 @@ impl PostgresRedoManager {
|
||||
///
|
||||
fn apply_batch_zenith(
|
||||
&self,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: &[(Lsn, ZenithWalRecord)],
|
||||
@@ -346,7 +316,7 @@ impl PostgresRedoManager {
|
||||
|
||||
// Apply all the WAL records in the batch
|
||||
for (record_lsn, record) in records.iter() {
|
||||
self.apply_record_zenith(rel, blknum, &mut page, *record_lsn, record)?;
|
||||
self.apply_record_zenith(key, &mut page, *record_lsn, record)?;
|
||||
}
|
||||
// Success!
|
||||
let end_time = Instant::now();
|
||||
@@ -365,8 +335,7 @@ impl PostgresRedoManager {
|
||||
|
||||
fn apply_record_zenith(
|
||||
&self,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
key: Key,
|
||||
page: &mut BytesMut,
|
||||
_record_lsn: Lsn,
|
||||
record: &ZenithWalRecord,
|
||||
@@ -384,10 +353,11 @@ impl PostgresRedoManager {
|
||||
old_heap_blkno,
|
||||
flags,
|
||||
} => {
|
||||
// sanity check that this is modifying the correct relish
|
||||
// sanity check that this is modifying the correct relation
|
||||
let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?;
|
||||
assert!(
|
||||
check_forknum(&rel, pg_constants::VISIBILITYMAP_FORKNUM),
|
||||
"ClearVisibilityMapFlags record on unexpected rel {:?}",
|
||||
rel.forknum == pg_constants::VISIBILITYMAP_FORKNUM,
|
||||
"ClearVisibilityMapFlags record on unexpected rel {}",
|
||||
rel
|
||||
);
|
||||
if let Some(heap_blkno) = *new_heap_blkno {
|
||||
@@ -421,6 +391,14 @@ impl PostgresRedoManager {
|
||||
// Non-relational WAL records are handled here, with custom code that has the
|
||||
// same effects as the corresponding Postgres WAL redo function.
|
||||
ZenithWalRecord::ClogSetCommitted { xids } => {
|
||||
let (slru_kind, segno, blknum) =
|
||||
key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::Clog,
|
||||
"ClogSetCommitted record with unexpected key {}",
|
||||
key
|
||||
);
|
||||
for &xid in xids {
|
||||
let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
@@ -428,12 +406,17 @@ impl PostgresRedoManager {
|
||||
|
||||
// Check that we're modifying the correct CLOG block.
|
||||
assert!(
|
||||
check_slru_segno(&rel, SlruKind::Clog, expected_segno),
|
||||
"ClogSetCommitted record for XID {} with unexpected rel {:?}",
|
||||
segno == expected_segno,
|
||||
"ClogSetCommitted record for XID {} with unexpected key {}",
|
||||
xid,
|
||||
rel
|
||||
key
|
||||
);
|
||||
assert!(
|
||||
blknum == expected_blknum,
|
||||
"ClogSetCommitted record for XID {} with unexpected key {}",
|
||||
xid,
|
||||
key
|
||||
);
|
||||
assert!(blknum == expected_blknum);
|
||||
|
||||
transaction_id_set_status(
|
||||
xid,
|
||||
@@ -443,6 +426,14 @@ impl PostgresRedoManager {
|
||||
}
|
||||
}
|
||||
ZenithWalRecord::ClogSetAborted { xids } => {
|
||||
let (slru_kind, segno, blknum) =
|
||||
key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::Clog,
|
||||
"ClogSetAborted record with unexpected key {}",
|
||||
key
|
||||
);
|
||||
for &xid in xids {
|
||||
let pageno = xid as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
@@ -450,17 +441,30 @@ impl PostgresRedoManager {
|
||||
|
||||
// Check that we're modifying the correct CLOG block.
|
||||
assert!(
|
||||
check_slru_segno(&rel, SlruKind::Clog, expected_segno),
|
||||
"ClogSetCommitted record for XID {} with unexpected rel {:?}",
|
||||
segno == expected_segno,
|
||||
"ClogSetAborted record for XID {} with unexpected key {}",
|
||||
xid,
|
||||
rel
|
||||
key
|
||||
);
|
||||
assert!(
|
||||
blknum == expected_blknum,
|
||||
"ClogSetAborted record for XID {} with unexpected key {}",
|
||||
xid,
|
||||
key
|
||||
);
|
||||
assert!(blknum == expected_blknum);
|
||||
|
||||
transaction_id_set_status(xid, pg_constants::TRANSACTION_STATUS_ABORTED, page);
|
||||
}
|
||||
}
|
||||
ZenithWalRecord::MultixactOffsetCreate { mid, moff } => {
|
||||
let (slru_kind, segno, blknum) =
|
||||
key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::MultiXactOffsets,
|
||||
"MultixactOffsetCreate record with unexpected key {}",
|
||||
key
|
||||
);
|
||||
// Compute the block and offset to modify.
|
||||
// See RecordNewMultiXact in PostgreSQL sources.
|
||||
let pageno = mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
|
||||
@@ -471,16 +475,29 @@ impl PostgresRedoManager {
|
||||
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
assert!(
|
||||
check_slru_segno(&rel, SlruKind::MultiXactOffsets, expected_segno),
|
||||
"MultiXactOffsetsCreate record for multi-xid {} with unexpected rel {:?}",
|
||||
segno == expected_segno,
|
||||
"MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
|
||||
mid,
|
||||
rel
|
||||
key
|
||||
);
|
||||
assert!(
|
||||
blknum == expected_blknum,
|
||||
"MultiXactOffsetsCreate record for multi-xid {} with unexpected key {}",
|
||||
mid,
|
||||
key
|
||||
);
|
||||
assert!(blknum == expected_blknum);
|
||||
|
||||
LittleEndian::write_u32(&mut page[offset..offset + 4], *moff);
|
||||
}
|
||||
ZenithWalRecord::MultixactMembersCreate { moff, members } => {
|
||||
let (slru_kind, segno, blknum) =
|
||||
key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
|
||||
assert_eq!(
|
||||
slru_kind,
|
||||
SlruKind::MultiXactMembers,
|
||||
"MultixactMembersCreate record with unexpected key {}",
|
||||
key
|
||||
);
|
||||
for (i, member) in members.iter().enumerate() {
|
||||
let offset = moff + i as u32;
|
||||
|
||||
@@ -495,12 +512,17 @@ impl PostgresRedoManager {
|
||||
let expected_segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let expected_blknum = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
assert!(
|
||||
check_slru_segno(&rel, SlruKind::MultiXactMembers, expected_segno),
|
||||
"MultiXactMembersCreate record at offset {} with unexpected rel {:?}",
|
||||
segno == expected_segno,
|
||||
"MultiXactMembersCreate record for offset {} with unexpected key {}",
|
||||
moff,
|
||||
rel
|
||||
key
|
||||
);
|
||||
assert!(
|
||||
blknum == expected_blknum,
|
||||
"MultiXactMembersCreate record for offset {} with unexpected key {}",
|
||||
moff,
|
||||
key
|
||||
);
|
||||
assert!(blknum == expected_blknum);
|
||||
|
||||
let mut flagsval = LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
|
||||
flagsval &= !(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
|
||||
|
||||
@@ -24,6 +24,9 @@ pub const VISIBILITYMAP_FORKNUM: u8 = 2;
|
||||
pub const INIT_FORKNUM: u8 = 3;
|
||||
|
||||
// From storage_xlog.h
|
||||
pub const XLOG_SMGR_CREATE: u8 = 0x10;
|
||||
pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
|
||||
|
||||
pub const SMGR_TRUNCATE_HEAP: u32 = 0x0001;
|
||||
pub const SMGR_TRUNCATE_VM: u32 = 0x0002;
|
||||
pub const SMGR_TRUNCATE_FSM: u32 = 0x0004;
|
||||
@@ -113,7 +116,6 @@ pub const XACT_XINFO_HAS_TWOPHASE: u32 = 1u32 << 4;
|
||||
// From pg_control.h and rmgrlist.h
|
||||
pub const XLOG_NEXTOID: u8 = 0x30;
|
||||
pub const XLOG_SWITCH: u8 = 0x40;
|
||||
pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
|
||||
pub const XLOG_FPI_FOR_HINT: u8 = 0xA0;
|
||||
pub const XLOG_FPI: u8 = 0xB0;
|
||||
pub const DB_SHUTDOWNED: u32 = 1;
|
||||
|
||||
@@ -1,130 +0,0 @@
|
||||
from contextlib import closing
|
||||
import psycopg2.extras
|
||||
from fixtures.utils import print_gc_result
|
||||
from fixtures.zenith_fixtures import ZenithEnv
|
||||
from fixtures.log_helper import log
|
||||
|
||||
|
||||
#
|
||||
# Test Garbage Collection of old layer files
|
||||
#
|
||||
# This test is pretty tightly coupled with the current implementation of layered
|
||||
# storage, in layered_repository.rs.
|
||||
#
|
||||
def test_layerfiles_gc(zenith_simple_env: ZenithEnv):
|
||||
env = zenith_simple_env
|
||||
env.zenith_cli.create_branch("test_layerfiles_gc", "empty")
|
||||
pg = env.postgres.create_start('test_layerfiles_gc')
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
with closing(env.pageserver.connect()) as psconn:
|
||||
with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:
|
||||
|
||||
# Get the timeline ID of our branch. We need it for the 'do_gc' command
|
||||
cur.execute("SHOW zenith.zenith_timeline")
|
||||
timeline = cur.fetchone()[0]
|
||||
|
||||
# Create a test table
|
||||
cur.execute("CREATE TABLE foo(x integer)")
|
||||
cur.execute("INSERT INTO foo VALUES (1)")
|
||||
|
||||
cur.execute("select relfilenode from pg_class where oid = 'foo'::regclass")
|
||||
row = cur.fetchone()
|
||||
log.info(f"relfilenode is {row[0]}")
|
||||
|
||||
# Run GC, to clear out any garbage left behind in the catalogs by
|
||||
# the CREATE TABLE command. We want to have a clean slate with no garbage
|
||||
# before running the actual tests below, otherwise the counts won't match
|
||||
# what we expect.
|
||||
#
|
||||
# Also run vacuum first to make it less likely that autovacuum or pruning
|
||||
# kicks in and confuses our numbers.
|
||||
cur.execute("VACUUM")
|
||||
|
||||
# delete the row, to update the Visibility Map. We don't want the VM
|
||||
# update to confuse our numbers either.
|
||||
cur.execute("DELETE FROM foo")
|
||||
|
||||
log.info("Running GC before test")
|
||||
pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row)
|
||||
# remember the number of files
|
||||
layer_relfiles_remain = (row['layer_relfiles_total'] -
|
||||
row['layer_relfiles_removed'])
|
||||
assert layer_relfiles_remain > 0
|
||||
|
||||
# Insert a row and run GC. Checkpoint should freeze the layer
|
||||
# so that there is only the most recent image layer left for the rel,
|
||||
# removing the old image and delta layer.
|
||||
log.info("Inserting one row and running GC")
|
||||
cur.execute("INSERT INTO foo VALUES (1)")
|
||||
pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row)
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
|
||||
assert row['layer_relfiles_removed'] == 2
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
|
||||
# Insert two more rows and run GC.
|
||||
# This should create new image and delta layer file with the new contents, and
|
||||
# then remove the old one image and the just-created delta layer.
|
||||
log.info("Inserting two more rows and running GC")
|
||||
cur.execute("INSERT INTO foo VALUES (2)")
|
||||
cur.execute("INSERT INTO foo VALUES (3)")
|
||||
|
||||
pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row)
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
|
||||
assert row['layer_relfiles_removed'] == 2
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
|
||||
# Do it again. Should again create two new layer files and remove old ones.
|
||||
log.info("Inserting two more rows and running GC")
|
||||
cur.execute("INSERT INTO foo VALUES (2)")
|
||||
cur.execute("INSERT INTO foo VALUES (3)")
|
||||
|
||||
pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row)
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain + 2
|
||||
assert row['layer_relfiles_removed'] == 2
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
|
||||
# Run GC again, with no changes in the database. Should not remove anything.
|
||||
log.info("Run GC again, with nothing to do")
|
||||
pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row)
|
||||
assert row['layer_relfiles_total'] == layer_relfiles_remain
|
||||
assert row['layer_relfiles_removed'] == 0
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
|
||||
#
|
||||
# Test DROP TABLE checks that relation data and metadata was deleted by GC from object storage
|
||||
#
|
||||
log.info("Drop table and run GC again")
|
||||
cur.execute("DROP TABLE foo")
|
||||
|
||||
pscur.execute(f"do_gc {env.initial_tenant.hex} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row)
|
||||
|
||||
# We still cannot remove the latest layers
|
||||
# because they serve as tombstones for earlier layers.
|
||||
assert row['layer_relfiles_dropped'] == 0
|
||||
# Each relation fork is counted separately, hence 3.
|
||||
assert row['layer_relfiles_needed_as_tombstone'] == 3
|
||||
|
||||
# The catalog updates also create new layer files of the catalogs, which
|
||||
# are counted as 'removed'
|
||||
assert row['layer_relfiles_removed'] > 0
|
||||
|
||||
# TODO Change the test to check actual CG of dropped layers.
|
||||
# Each relation fork is counted separately, hence 3.
|
||||
#assert row['layer_relfiles_dropped'] == 3
|
||||
|
||||
# TODO: perhaps we should count catalog and user relations separately,
|
||||
# to make this kind of testing more robust
|
||||
@@ -74,8 +74,5 @@ def lsn_from_hex(lsn_hex: str) -> int:
|
||||
def print_gc_result(row):
|
||||
log.info("GC duration {elapsed} ms".format_map(row))
|
||||
log.info(
|
||||
" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}"
|
||||
.format_map(row))
|
||||
log.info(
|
||||
" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}"
|
||||
" total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_branches: {layers_needed_by_branches}, not_updated: {layers_not_updated}, removed: {layers_removed}"
|
||||
.format_map(row))
|
||||
|
||||
2
vendor/postgres
vendored
2
vendor/postgres
vendored
Submodule vendor/postgres updated: 093aa160e5...756a01aade
Reference in New Issue
Block a user