Add script for safekeeper tenants cleanup (#3452)

This script can be used to remove tenant directories on safekeepers for
projects which no longer exist (deleted in the console).

To run this script you need to upload it to the safekeeper (e.g. with SSH),
and run it with python3. Ansible can be used to run this script on
multiple safekeepers.

Fixes https://github.com/neondatabase/cloud/issues/3356
This commit is contained in:
Arthur Petukhovsky
2023-02-09 14:28:20 +03:00
committed by GitHub
parent f07d6433b6
commit 7ed9eb4a56
3 changed files with 268 additions and 0 deletions

View File

@@ -0,0 +1,55 @@
# Cleanup script for safekeeper
This script can be used to remove tenant directories on safekeepers for projects which no longer exist (deleted in the console).
To run this script you need to upload it to the safekeeper (e.g. with SSH), and run it with python3. Ansible can be used to run this script on multiple safekeepers.
NOTE: Console queries that check whether a project is deleted are slow and inefficient.
If you want to run this script on a safekeeper with many tenants, consider
making a PR to the console repo to make project search by tenant_id faster.
## How to run on a single node
```
zsh nsh safekeeper-0.us-east-2.aws.neon.build
ls /storage/safekeeper/data/ | grep -v safekeeper > tenants.txt
mkdir -p /storage/neon-trash/2023-01-01--cleanup
export CONSOLE_API_TOKEN=
python3 script.py --trash-dir /storage/neon-trash/2023-01-01--cleanup --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME --dry-run
cat tenants.txt | python3 script.py --trash-dir /storage/neon-trash/2023-01-01--cleanup --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME --dry-run
cat tenants.txt | python3 script.py --trash-dir /storage/neon-trash/2023-01-01--cleanup --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME |& tee logs.txt
```
## How to use ansible (staging)
```
cd ~/neon/.github/ansible
export AWS_DEFAULT_PROFILE=dev
ansible-playbook -i staging.us-east-2.hosts.yaml -e @ssm_config ../../scripts/sk_cleanup_tenants/remote.yaml
# add --extra-vars "api_token=" to set console api token
```
## How to use ansible (prod)
- Change `endpoint` in `script.py` to "https://console.neon.tech/api"
```
cd ~/neon/.github/ansible
export AWS_DEFAULT_PROFILE=prod
ansible-playbook -i prod.us-east-2.hosts.yaml -e @ssm_config ../../scripts/sk_cleanup_tenants/remote.yaml
# add --extra-vars "api_token=" to set console api token
```
> Heavily inspired by the script for pageserver cleanup: https://gist.github.com/problame/bafb6ca6334f0145757238e61380c3f1/9bef1845a8291ebfa1f3a51eb79c01d12498b2b5

View File

@@ -0,0 +1,80 @@
# Playbook that uploads script.py to every host in the `safekeepers` group and
# runs it there against the list of tenant directories found on that host.
# The script's output is written to a timestamped log file in script_dir.
- name: Test safekeepers
hosts: safekeepers
gather_facts: False
remote_user: "{{ remote_user }}"
vars:
# Working directory on the remote host: holds script.py, tenants.txt and run logs.
script_dir: /storage/ansible_sk_cleanup
tenants_file: "{{ script_dir }}/tenants.txt"
# NOTE(review): "changeme" looks like a placeholder -- presumably this should be
# edited to a run-specific directory before the playbook is used.
trash_dir: /storage/neon-trash/2023-01-01--changeme
tasks:
- name: create script directory
file:
path: "{{ script_dir }}"
state: directory
mode: 0755
tags:
- safekeeper
- name: create trash dir
file:
path: "{{ trash_dir }}"
state: directory
mode: 0755
tags:
- safekeeper
# Every entry of the data directory is treated as a tenant id, except entries
# whose name contains "safekeeper" (such as the safekeeper.id file read below).
- name: collect all tenant_ids to tenants.txt
shell:
cmd: ls /storage/safekeeper/data/ | grep -v safekeeper > {{ tenants_file }}
tags:
- safekeeper
- name: count tenants
shell:
cmd: wc -l {{ tenants_file }}
register: tenants_count
tags:
- safekeeper
- debug: msg="{{ tenants_count.stdout }}"
- name: fetch safekeeper_id
shell:
cmd: cat /storage/safekeeper/data/safekeeper.id
register: safekeeper_id
tags:
- safekeeper
- debug: msg="{{ safekeeper_id.stdout }}"
- name: copy script.py to safekeeper
copy:
src: script.py
dest: "{{ script_dir }}"
mode: 0755
tags:
- safekeeper
# Starts the cleanup in the background (poll: 0) with a 30000 second limit.
# `|&` redirects both stdout and stderr into the log file; it is bash syntax,
# which is why the executable is forced to /bin/bash.
- name: Run an async task
shell:
chdir: "{{ script_dir }}"
cmd: "cat tenants.txt | python3 script.py --trash-dir {{ trash_dir }} --safekeeper-id $(cat /storage/safekeeper/data/safekeeper.id) --safekeeper-host $HOSTNAME |& cat > {{ script_dir }}/run-`date +%Y-%m-%d-%H.%M.%S`.log"
args:
executable: /bin/bash
environment:
# api_token is not defined in this playbook; it is expected to be supplied by
# the caller (e.g. with --extra-vars).
CONSOLE_API_TOKEN: "{{ api_token }}"
async: 30000
poll: 0
register: bg_async_task
# Polls the background job every 10 seconds, up to 3000 times (30000 seconds in
# total, matching the async limit above).
- name: Check on an async task
async_status:
jid: "{{ bg_async_task.ansible_job_id }}"
become: true
register: job_result
until: job_result.finished
retries: 3000
delay: 10

View File

@@ -0,0 +1,133 @@
import argparse
import logging
import os
import shutil
import sys
from pathlib import Path
import requests
# Log at INFO with timestamps and the source location of every message, so the
# per-tenant progress can be followed in the captured output.
level = logging.INFO
logging.basicConfig(
    format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
    datefmt="%Y-%m-%d:%H:%M:%S",
    level=level,
)

# Command line:
#   --trash-dir        existing directory that receives a copy of each removed tenant
#   --dry-run          run all checks but do not copy or delete anything
#   --safekeeper-id    numeric id of this safekeeper, matched against the console data
#   --safekeeper-host  host name used to reach the safekeeper HTTP API
parser = argparse.ArgumentParser()
parser.add_argument("--trash-dir", required=True, type=Path)
parser.add_argument("--dry-run", action="store_true")
parser.add_argument("--safekeeper-id", required=True, type=int)
parser.add_argument("--safekeeper-host", required=True, type=str)
args = parser.parse_args()

# Bearer token for the console API, taken from the environment.
access_key = os.getenv("CONSOLE_API_TOKEN")
if not access_key:
    # Do not abort: the per-tenant loop reports failed console requests anyway,
    # but make the most likely cause visible up front.
    logging.warning("CONSOLE_API_TOKEN is not set, console requests will most likely be rejected")

# Base URL of the console API that is queried for project state.
endpoint: str = "https://console.stage.neon.tech/api"

trash_dir: Path = args.trash_dir
dry_run: bool = args.dry_run
logging.info(f"dry_run={dry_run}")
sk_id: int = args.safekeeper_id
sk_host: str = args.safekeeper_host

# Explicit check instead of `assert`, which is removed when Python runs with -O.
if not trash_dir.is_dir():
    parser.error(f"--trash-dir {trash_dir} is not an existing directory")
###
def console_get(rel_url, timeout=(10, 300)):
    """Send an authenticated GET request to the console API.

    Args:
        rel_url: Path (and query string) appended verbatim to ``endpoint``.
        timeout: Timeout in seconds passed to ``requests.get``; either a single
            number or a ``(connect, read)`` tuple. Without it a stalled
            connection would block the whole run forever.

    Returns:
        The ``requests.Response`` of a successful request.

    Raises:
        requests.HTTPError: If the console answers with an error status.
        requests.Timeout: If the console does not answer within ``timeout``.
    """
    r = requests.get(
        f"{endpoint}{rel_url}",
        headers={
            "Authorization": f"Bearer {access_key}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        },
        timeout=timeout,
    )
    r.raise_for_status()
    return r
def tenant_is_deleted_in_console(tenant_id):
    """Return True if the console reports the project of ``tenant_id`` as deleted.

    The console is searched for ``tenant_id`` with deleted projects included.
    The answer is only trusted when exactly one project is returned, its tenant
    matches ``tenant_id`` and this safekeeper (``sk_id``) is listed among the
    project's safekeepers.

    Args:
        tenant_id: Tenant id to look up.

    Returns:
        True only if the project's ``deleted`` field is exactly ``True``.

    Raises:
        AssertionError: If the console response fails one of the sanity checks.
            Raised explicitly rather than with ``assert`` so that the checks
            guarding data removal stay active under ``python -O``.
        requests.HTTPError: If the console request fails.
    """
    response = console_get(f"/v1/admin/projects?search={tenant_id}&show_deleted=true")
    results = response.json()["data"]
    if len(results) != 1:
        raise AssertionError(f"unexpected results len: {results}")

    project = results[0]
    if project["tenant"] != tenant_id:
        raise AssertionError(f"tenant id doesn't match: {project}")
    if project["safekeepers"] is None:
        raise AssertionError(f"safekeepers is None: {project}")
    if not any(sk["id"] == sk_id for sk in project["safekeepers"]):
        raise AssertionError(f"safekeeper id not found: {project}")
    if "deleted" not in project:
        raise AssertionError(f"{project}")

    return project["deleted"] is True
def call_delete_tenant_api(tenant_id, timeout=(10, 600)):
    """Ask the safekeeper on ``sk_host`` to delete ``tenant_id`` via its HTTP API.

    Args:
        tenant_id: Tenant id to delete.
        timeout: Timeout in seconds passed to ``requests.delete``; either a
            single number or a ``(connect, read)`` tuple. Prevents an
            unresponsive safekeeper from blocking the run forever.

    Returns:
        The ``requests.Response`` of a successful request.

    Raises:
        requests.HTTPError: If the safekeeper answers with an error status.
        requests.Timeout: If the safekeeper does not answer within ``timeout``.
    """
    r = requests.delete(f"http://{sk_host}:7676/v1/tenant/{tenant_id}", timeout=timeout)
    r.raise_for_status()
    return r
def cleanup_tenant(tenant_id):
    """Move one tenant to the trash directory and delete it from the safekeeper.

    Steps:
      1. Skip if the tenant directory does not exist.
      2. Skip unless the console reports the tenant's project as deleted.
      3. Stop here when ``dry_run`` is set.
      4. Copy the tenant directory under ``trash_dir`` (keeping its absolute
         path as the relative layout), then delete the tenant through the
         safekeeper HTTP API and verify that the directory is gone.

    Args:
        tenant_id: Name of the tenant directory below /storage/safekeeper/data.

    Raises:
        ValueError: If ``tenant_id`` is empty or is not a plain directory name.
            Such a value would resolve to the data root itself or to a path
            outside of it.
        AssertionError: If one of the pre- or post-conditions does not hold.
            Raised explicitly rather than with ``assert`` so that the checks
            stay active under ``python -O``.
    """
    # Reject "", ".", ".." and anything containing a path separator before a
    # filesystem path is built from the value.
    if not tenant_id or tenant_id in (".", "..") or Path(tenant_id).name != tenant_id:
        raise ValueError(f"invalid tenant id: {tenant_id!r}")

    tenant_dir = Path(f"/storage/safekeeper/data/{tenant_id}")
    if not tenant_dir.exists():
        logging.info("tenant directory doesn't exist, assuming it has been cleaned already")
        return

    if not tenant_is_deleted_in_console(tenant_id):
        logging.info("tenant is not deleted in console, skipping")
        return

    logging.info("assertions passed")
    if dry_run:
        return

    logging.info("deleting tenant")
    tenant_dir_in_trash = trash_dir / tenant_dir.relative_to("/")
    tenant_dir_in_trash.parent.mkdir(parents=True, exist_ok=True)
    # Never overwrite or merge into an existing trash copy.
    if tenant_dir_in_trash.exists():
        raise AssertionError(f"{tenant_dir_in_trash}")
    if not tenant_dir_in_trash.parent.exists():
        raise AssertionError(f"{tenant_dir_in_trash}")

    # double-check
    if not tenant_dir.exists():
        raise AssertionError(f"{tenant_dir}")
    if not tenant_dir.is_dir():
        raise AssertionError(f"{tenant_dir}")

    logging.info(f"copying {tenant_dir} to {tenant_dir_in_trash}")
    shutil.copytree(src=tenant_dir, dst=tenant_dir_in_trash, symlinks=False, dirs_exist_ok=False)

    logging.info(f"deleting {tenant_dir}")
    call_delete_tenant_api(tenant_id)

    logging.info("tenant is now deleted, checking that it's gone")
    if tenant_dir.exists():
        raise AssertionError(f"{tenant_dir}")
# script.pid in the current directory acts as a lock against concurrent runs.
# Mode "x" creates the file atomically and fails if it already exists, which
# avoids the race of a separate "exists" check followed by a write.
try:
    with open("script.pid", "x", encoding="utf-8") as f:
        f.write(str(os.getpid()))
except FileExistsError:
    logging.info(
        f"script is already running, with pid={Path('script.pid').read_text()}. Terminate it first."
    )
    sys.exit(1)

logging.info(f"started script.py, pid={os.getpid()}")

try:
    # One tenant id per line on stdin. A failure for one tenant is logged and
    # does not stop the processing of the remaining ones.
    for line in sys.stdin:
        tenant_id = line.strip()
        if not tenant_id:
            # Blank lines carry no tenant id.
            continue
        try:
            logging.info(f"start tenant {tenant_id}")
            cleanup_tenant(tenant_id)
            logging.info(f"done tenant {tenant_id}")
        except KeyboardInterrupt:
            print("KeyboardInterrupt exception is caught")
            break
        except Exception:
            logging.exception(f"failed to clean up tenant {tenant_id}")

    logging.info(f"finished script.py, pid={os.getpid()}")
finally:
    # Release the lock even if the loop is left through an unexpected exception.
    os.remove("script.pid")