mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 14:00:38 +00:00
Add script for safekeeper tenants cleanup (#3452)
This script can be used to remove tenant directories on safekeepers for projects which no longer exist (deleted in the console). To run this script you need to upload it to the safekeeper (e.g. over SSH), and run it with python3. Ansible can be used to run this script on multiple safekeepers. Fixes https://github.com/neondatabase/cloud/issues/3356
This commit is contained in:
committed by
GitHub
parent
f07d6433b6
commit
7ed9eb4a56
55
scripts/sk_cleanup_tenants/readme.md
Normal file
55
scripts/sk_cleanup_tenants/readme.md
Normal file
@@ -0,0 +1,55 @@
|
||||
# Cleanup script for safekeeper
|
||||
|
||||
This script can be used to remove tenant directories on safekeepers for projects which no longer exist (deleted in the console).
|
||||
|
||||
To run this script you need to upload it to the safekeeper (e.g. over SSH), and run it with python3. Ansible can be used to run this script on multiple safekeepers.
|
||||
|
||||
NOTE: Console queries to check that project is deleted are slow and inefficient.
|
||||
If you want to run this script on safekeeper with many tenants, consider
|
||||
making a PR to the console repo to make project search by tenant_id faster.
|
||||
|
||||
## How to run on a single node
|
||||
|
||||
```
|
||||
zsh nsh safekeeper-0.us-east-2.aws.neon.build
|
||||
|
||||
ls /storage/safekeeper/data/ | grep -v safekeeper > tenants.txt
|
||||
|
||||
mkdir -p /storage/neon-trash/2023-01-01--cleanup
|
||||
|
||||
export CONSOLE_API_TOKEN=
|
||||
python3 script.py --trash-dir /storage/neon-trash/2023-01-01--cleanup --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME --dry-run
|
||||
|
||||
cat tenants.txt | python3 script.py --trash-dir /storage/neon-trash/2023-01-01--cleanup --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME --dry-run
|
||||
|
||||
cat tenants.txt | python3 script.py --trash-dir /storage/neon-trash/2023-01-01--cleanup --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME |& tee logs.txt
|
||||
```
|
||||
|
||||
## How to use ansible (staging)
|
||||
|
||||
```
|
||||
cd ~/neon/.github/ansible
|
||||
|
||||
export AWS_DEFAULT_PROFILE=dev
|
||||
|
||||
ansible-playbook -i staging.us-east-2.hosts.yaml -e @ssm_config ../../scripts/sk_cleanup_tenants/remote.yaml
|
||||
|
||||
# add --extra-vars "api_token=" to set console api token
|
||||
```
|
||||
|
||||
## How to use ansible (prod)
|
||||
|
||||
- Change `endpoint` in `script.py` to "https://console.neon.tech/api"
|
||||
|
||||
```
|
||||
cd ~/neon/.github/ansible
|
||||
|
||||
export AWS_DEFAULT_PROFILE=prod
|
||||
|
||||
ansible-playbook -i prod.us-east-2.hosts.yaml -e @ssm_config ../../scripts/sk_cleanup_tenants/remote.yaml
|
||||
|
||||
# add --extra-vars "api_token=" to set console api token
|
||||
```
|
||||
|
||||
|
||||
> Heavily inspired by the script for pageserver cleanup: https://gist.github.com/problame/bafb6ca6334f0145757238e61380c3f1/9bef1845a8291ebfa1f3a51eb79c01d12498b2b5
|
||||
80
scripts/sk_cleanup_tenants/remote.yaml
Normal file
80
scripts/sk_cleanup_tenants/remote.yaml
Normal file
@@ -0,0 +1,80 @@
|
||||
# Uploads script.py to every safekeeper host, builds the list of tenant
# directories found on that host and runs the cleanup script over it in the
# background, polling until it finishes.
#
# Expected extra vars: `api_token` (console API token, exported to the script
# as CONSOLE_API_TOKEN). Change `trash_dir` before a real run.
- name: Test safekeepers
  hosts: safekeepers
  gather_facts: False
  remote_user: "{{ remote_user }}"

  vars:
    script_dir: /storage/ansible_sk_cleanup
    tenants_file: "{{ script_dir }}/tenants.txt"
    trash_dir: /storage/neon-trash/2023-01-01--changeme

  tasks:

    - name: create script directory
      file:
        path: "{{ script_dir }}"
        state: directory
        mode: 0755
      tags:
        - safekeeper

    - name: create trash dir
      file:
        path: "{{ trash_dir }}"
        state: directory
        mode: 0755
      tags:
        - safekeeper

    # Every entry of the data directory except safekeeper.* files is treated
    # as a tenant directory name.
    - name: collect all tenant_ids to tenants.txt
      shell:
        cmd: ls /storage/safekeeper/data/ | grep -v safekeeper > {{ tenants_file }}
      tags:
        - safekeeper

    - name: count tenants
      shell:
        cmd: wc -l {{ tenants_file }}
      register: tenants_count
      tags:
        - safekeeper

    - debug: msg="{{ tenants_count.stdout }}"

    - name: fetch safekeeper_id
      shell:
        cmd: cat /storage/safekeeper/data/safekeeper.id
      register: safekeeper_id
      tags:
        - safekeeper

    - debug: msg="{{ safekeeper_id.stdout }}"

    - name: copy script.py to safekeeper
      copy:
        src: script.py
        dest: "{{ script_dir }}"
        mode: 0755
      tags:
        - safekeeper

    # Runs in the background (poll: 0) because the per-tenant console queries
    # are slow. The tenant list is read from {{ tenants_file }} - the same file
    # the "collect" task wrote - rather than a hard-coded tenants.txt, so
    # overriding the variable keeps both tasks in sync.
    # `|&` is a bash-only operator, hence the explicit executable.
    - name: Run an async task
      shell:
        chdir: "{{ script_dir }}"
        cmd: "cat {{ tenants_file }} | python3 script.py --trash-dir {{ trash_dir }} --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME |& cat > {{ script_dir }}/run-`date +%Y-%m-%d-%H.%M.%S`.log"
      args:
        executable: /bin/bash
      environment:
        CONSOLE_API_TOKEN: "{{ api_token }}"
      async: 30000
      poll: 0
      register: bg_async_task

    # Polls every 10 s, up to 3000 times (30000 s, matching `async` above).
    # NOTE(review): this task sets `become: true` while the task that started
    # the job does not; async_status normally has to run as the same user that
    # launched the job - confirm this works with the inventory's settings.
    - name: Check on an async task
      async_status:
        jid: "{{ bg_async_task.ansible_job_id }}"
      become: true
      register: job_result
      until: job_result.finished
      retries: 3000
      delay: 10
|
||||
133
scripts/sk_cleanup_tenants/script.py
Normal file
133
scripts/sk_cleanup_tenants/script.py
Normal file
@@ -0,0 +1,133 @@
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
# Log at INFO with timestamps and the emitting source line, so the log of an
# unattended run can be audited afterwards.
level = logging.INFO
logging.basicConfig(
    format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
    datefmt="%Y-%m-%d:%H:%M:%S",
    level=level,
)

parser = argparse.ArgumentParser()
# Existing directory that tenant data is copied into before it is deleted.
parser.add_argument("--trash-dir", required=True, type=Path)
# With --dry-run all checks are performed but nothing is copied or deleted.
parser.add_argument("--dry-run", action="store_true")
# Id of this safekeeper; a tenant is only cleaned up if the console lists it.
parser.add_argument("--safekeeper-id", required=True, type=int)
# Host name used to reach this safekeeper's HTTP API.
parser.add_argument("--safekeeper-host", required=True, type=str)
args = parser.parse_args()

# Bearer token for the console API, taken from the environment.
access_key = os.getenv("CONSOLE_API_TOKEN")
endpoint: str = "https://console.stage.neon.tech/api"

trash_dir: Path = args.trash_dir
dry_run: bool = args.dry_run
logging.info(f"dry_run={dry_run}")
sk_id: int = args.safekeeper_id
sk_host: str = args.safekeeper_host

# Validate inputs explicitly instead of with `assert`, which is skipped when
# Python runs with -O.
if not trash_dir.is_dir():
    parser.error(f"--trash-dir {trash_dir} is not an existing directory")

# Without a token every console request would be sent as "Bearer None" and be
# rejected one tenant at a time; fail once, up front, instead.
if not access_key:
    parser.error("CONSOLE_API_TOKEN environment variable is not set")
|
||||
|
||||
###
|
||||
|
||||
|
||||
def console_get(rel_url, timeout=(10, 300)):
    """Send an authenticated GET request to the console API.

    Args:
        rel_url: Path and query string, appended verbatim to ``endpoint``.
        timeout: ``requests`` timeout, either a single number of seconds or a
            ``(connect, read)`` tuple. The read timeout is generous because
            console queries are slow, but it is finite so that a stalled
            connection cannot hang the whole run.

    Returns:
        The ``requests.Response`` of a successful request.

    Raises:
        requests.HTTPError: if the console answers with a 4xx/5xx status.
        requests.Timeout: if the console does not answer within ``timeout``.
    """
    r = requests.get(
        f"{endpoint}{rel_url}",
        headers={
            "Authorization": f"Bearer {access_key}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        },
        timeout=timeout,
    )
    r.raise_for_status()
    return r
|
||||
|
||||
|
||||
def tenant_is_deleted_in_console(tenant_id):
    """Return whether the console marks the project of ``tenant_id`` as deleted.

    The console is searched (including deleted projects) for ``tenant_id`` and
    the single matching project is sanity-checked before its ``deleted`` flag
    is trusted.

    Args:
        tenant_id: Tenant id to look up.

    Returns:
        ``True`` only if the project's ``deleted`` field is exactly ``True``.

    Raises:
        AssertionError: if the search does not return exactly one project, the
            project belongs to a different tenant, it has no safekeepers, this
            safekeeper (``sk_id``) is not among them, or the ``deleted`` field
            is missing.
        requests.RequestException: if the console request itself fails.
    """
    response = console_get(f"/v1/admin/projects?search={tenant_id}&show_deleted=true")
    results = response.json()["data"]

    # These checks guard a destructive operation, so they are raised
    # explicitly: plain `assert` statements are removed under `python -O`.
    if len(results) != 1:
        raise AssertionError(f"unexpected results len: {results}")

    project = results[0]
    if project["tenant"] != tenant_id:
        raise AssertionError(f"tenant id doesn't match: {project}")
    if project["safekeepers"] is None:
        raise AssertionError(f"safekeepers is None: {project}")
    if not any(sk["id"] == sk_id for sk in project["safekeepers"]):
        raise AssertionError(f"safekeeper id not found: {project}")
    if "deleted" not in project:
        raise AssertionError(f"{project}")

    return project["deleted"] is True
|
||||
|
||||
|
||||
def call_delete_tenant_api(tenant_id, timeout=(10, 600)):
    """Ask the safekeeper to delete a tenant through its HTTP API.

    Sends ``DELETE /v1/tenant/{tenant_id}`` to port 7676 of ``sk_host``.

    Args:
        tenant_id: Tenant to delete.
        timeout: ``requests`` timeout, either a single number of seconds or a
            ``(connect, read)`` tuple, so that an unresponsive safekeeper
            cannot block the run indefinitely.

    Returns:
        The ``requests.Response`` of a successful request.

    Raises:
        requests.HTTPError: if the safekeeper answers with a 4xx/5xx status.
        requests.Timeout: if the safekeeper does not answer within ``timeout``.
    """
    r = requests.delete(f"http://{sk_host}:7676/v1/tenant/{tenant_id}", timeout=timeout)
    r.raise_for_status()
    return r
|
||||
|
||||
|
||||
def cleanup_tenant(tenant_id):
    """Move one tenant's safekeeper data to the trash dir and delete the tenant.

    Steps:
      1. Do nothing if the tenant directory no longer exists.
      2. Do nothing unless the console reports the project as deleted.
      3. Stop here when ``dry_run`` is set.
      4. Copy the tenant directory below ``trash_dir`` (keeping its absolute
         path as a relative sub-path), then delete the tenant through the
         safekeeper API and verify that the directory is gone.

    Args:
        tenant_id: Name of the tenant directory under
            ``/storage/safekeeper/data``.

    Raises:
        ValueError: if ``tenant_id`` is empty or is not a plain directory name.
        AssertionError: if a sanity check fails (trash copy already exists,
            tenant directory vanished or is not a directory, or it still
            exists after the delete call).
        requests.RequestException: if the console or safekeeper request fails.
    """
    # An empty id would resolve to the data directory itself and "..", "." or
    # a name containing "/" to something outside of a single tenant directory;
    # never let such input reach the copy/delete steps.
    if not tenant_id or tenant_id in (".", "..") or "/" in tenant_id:
        raise ValueError(f"invalid tenant id: {tenant_id!r}")

    tenant_dir = Path(f"/storage/safekeeper/data/{tenant_id}")

    if not tenant_dir.exists():
        logging.info("tenant directory doesn't exist, assuming it has been cleaned already")
        return

    if not tenant_is_deleted_in_console(tenant_id):
        logging.info("tenant is not deleted in console, skipping")
        return

    logging.info("assertions passed")

    if dry_run:
        return

    logging.info("deleting tenant")

    tenant_dir_in_trash = trash_dir / tenant_dir.relative_to("/")
    tenant_dir_in_trash.parent.mkdir(parents=True, exist_ok=True)

    # Safety checks are raised explicitly rather than written as `assert`
    # statements, which are removed when Python runs with -O.
    if tenant_dir_in_trash.exists():
        raise AssertionError(f"{tenant_dir_in_trash}")
    if not tenant_dir_in_trash.parent.exists():
        raise AssertionError(f"{tenant_dir_in_trash}")
    # double-check
    if not tenant_dir.exists():
        raise AssertionError(f"{tenant_dir}")
    if not tenant_dir.is_dir():
        raise AssertionError(f"{tenant_dir}")

    logging.info(f"copying {tenant_dir} to {tenant_dir_in_trash}")
    shutil.copytree(src=tenant_dir, dst=tenant_dir_in_trash, symlinks=False, dirs_exist_ok=False)

    # The data is removed by the safekeeper itself, not by this script.
    logging.info(f"deleting {tenant_dir}")
    call_delete_tenant_api(tenant_id)

    logging.info("tenant is now deleted, checking that it's gone")
    if tenant_dir.exists():
        raise AssertionError(f"{tenant_dir}")
|
||||
|
||||
|
||||
# Single-instance guard: script.pid in the working directory marks a running
# instance. Mode "x" creates the file and fails if it already exists, so the
# check and the creation cannot race with a second instance.
try:
    with open("script.pid", "x", encoding="utf-8") as f:
        f.write(str(os.getpid()))
except FileExistsError:
    logging.error(
        f"script is already running, with pid={Path('script.pid').read_text()}. Terminate it first."
    )
    sys.exit(1)

logging.info(f"started script.py, pid={os.getpid()}")

try:
    # One tenant id per line on stdin.
    for line in sys.stdin:
        tenant_id = line.strip()
        if not tenant_id:
            # Blank line, not a tenant.
            continue
        try:
            logging.info(f"start tenant {tenant_id}")
            cleanup_tenant(tenant_id)
            logging.info(f"done tenant {tenant_id}")
        except KeyboardInterrupt:
            print("KeyboardInterrupt exception is caught")
            break
        except Exception:
            # A failure for one tenant must not stop the remaining ones.
            logging.exception(f"failed to clean up tenant {tenant_id}")

    logging.info(f"finished script.py, pid={os.getpid()}")
finally:
    # Always release the guard, even when interrupted while waiting for
    # input; a stale pid file would block every later run.
    os.remove("script.pid")
|
||||
Reference in New Issue
Block a user