From 0f56104a6120876c387fcecb10b8f76dcef77504 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky
Date: Wed, 20 Dec 2023 19:06:55 +0400
Subject: [PATCH] Make sk_collect_dumps also possible with teleport (#4739)

Co-authored-by: Arseny Sher
---
 scripts/sk_collect_dumps/.gitignore     |  2 +
 scripts/sk_collect_dumps/ansible.cfg    | 11 ++++++
 scripts/sk_collect_dumps/pyproject.toml | 16 ++++++++
 scripts/sk_collect_dumps/readme.md      | 50 +++++++++++++++++--------
 scripts/sk_collect_dumps/remote.yaml    | 33 ++++++++++++----
 scripts/sk_collect_dumps/ssh.cfg        | 13 +++++++
 scripts/sk_collect_dumps/upload.sh      | 26 ++++++-------
 7 files changed, 115 insertions(+), 36 deletions(-)
 create mode 100644 scripts/sk_collect_dumps/ansible.cfg
 create mode 100644 scripts/sk_collect_dumps/pyproject.toml
 create mode 100644 scripts/sk_collect_dumps/ssh.cfg

diff --git a/scripts/sk_collect_dumps/.gitignore b/scripts/sk_collect_dumps/.gitignore
index d9d4d0296a..cdf99aefd7 100644
--- a/scripts/sk_collect_dumps/.gitignore
+++ b/scripts/sk_collect_dumps/.gitignore
@@ -1,2 +1,4 @@
 result
 *.json
+hosts
+poetry.lock
diff --git a/scripts/sk_collect_dumps/ansible.cfg b/scripts/sk_collect_dumps/ansible.cfg
new file mode 100644
index 0000000000..150986ab79
--- /dev/null
+++ b/scripts/sk_collect_dumps/ansible.cfg
@@ -0,0 +1,11 @@
+[defaults]
+host_key_checking = False
+inventory=./hosts
+remote_tmp=/tmp
+remote_user=developer
+callbacks_enabled = profile_tasks
+
+[ssh_connection]
+scp_if_ssh = True
+ssh_args = -F ./ssh.cfg
+pipelining = True
diff --git a/scripts/sk_collect_dumps/pyproject.toml b/scripts/sk_collect_dumps/pyproject.toml
new file mode 100644
index 0000000000..c6f6adafe2
--- /dev/null
+++ b/scripts/sk_collect_dumps/pyproject.toml
@@ -0,0 +1,16 @@
+[tool.poetry]
+name = "sk-collect-dumps"
+version = "0.1.0"
+description = ""
+authors = ["Arseny Sher"]
+readme = "README.md"
+packages = [{include = "sk_collect_dumps"}]
+
+[tool.poetry.dependencies]
+python = "^3.11"
+ansible = "^9.1.0"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
diff --git a/scripts/sk_collect_dumps/readme.md b/scripts/sk_collect_dumps/readme.md
index 52b73e9495..7494a6cb78 100644
--- a/scripts/sk_collect_dumps/readme.md
+++ b/scripts/sk_collect_dumps/readme.md
@@ -1,25 +1,43 @@
 # Collect /v1/debug_dump from all safekeeper nodes
 
-1. Run ansible playbooks to collect .json dumps from all safekeepers and store them in `./result` directory.
-2. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload dumps to `prod_feb30` table in specified postgres database.
-
-## How to use ansible (staging)
-
+1. Issue an admin token (add/remove `.stage` in the URL for staging/prod, and set the proper API key):
+```
+# staging:
+AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_STAGING_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt')
+# prod:
+AUTH_TOKEN=$(curl https://console.neon.tech/regions/console/api/v1/admin/issue_token -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer $NEON_PROD_KEY" -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt')
+# check
+echo $AUTH_TOKEN
+```
+2. Run ansible playbooks to collect .json dumps from all safekeepers and store them in the `./result` directory.
+
-AWS_DEFAULT_PROFILE=dev ansible-playbook -i ../../.github/ansible/staging.eu-west-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
+There are two ways to do that: with ssm or tsh.
+
+ssm:
+```
+# in the aws repo, cd .github/ansible and run e.g. (adjusting the profile, and the region in vars and limit):
+AWS_DEFAULT_PROFILE=dev ansible-playbook -i inventory_aws_ec2.yaml -i staging.us-east-2.vars.yaml -e @ssm_config -l 'safekeeper:&us_east_2' -e "auth_token=${AUTH_TOKEN}" ~/neon/neon/scripts/sk_collect_dumps/remote.yaml
+```
+It will put the results into the `./result` directory *near the playbook*.
+
+tsh:
+
+Update the inventory if needed, selecting `.build`/`.tech` (staging/prod) and optionally a region:
+```
+rm -f hosts && echo '[safekeeper]' >> hosts
+# staging:
+tsh ls | awk '{print $1}' | grep safekeeper | grep "neon.build" | grep us-east-2 >> hosts
+# prod:
+tsh ls | awk '{print $1}' | grep safekeeper | grep "neon.tech" | grep us-east-2 >> hosts
+```
+
-## How to use ansible (prod)
-
-AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-west-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
-
-AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.us-east-2.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
-
-AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.eu-central-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
-
-AWS_DEFAULT_PROFILE=prod ansible-playbook -i ../../.github/ansible/prod.ap-southeast-1.hosts.yaml -e @../../.github/ansible/ssm_config remote.yaml
+Test the ansible connection:
+```
+ansible all -m ping -v
+```
+
+Download the dumps:
+```
+mkdir -p result && rm -f result/*
+ansible-playbook -e "auth_token=${AUTH_TOKEN}" remote.yaml
+```
+
+3. Run `DB_CONNSTR=... ./upload.sh prod_feb30` to upload the dumps to the `prod_feb30` table in the specified postgres database.
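
For orientation, the whole tsh flow from the new readme condenses to the sketch below. Every command is taken from the readme above except the last line, whose connection string and table name are hypothetical placeholders:

```
# 1. issue a staging admin token with the safekeeperdata scope (43200 s = 12 h TTL)
AUTH_TOKEN=$(curl https://console.stage.neon.tech/regions/console/api/v1/admin/issue_token \
  -H "Accept: application/json" -H "Content-Type: application/json" \
  -H "Authorization: Bearer $NEON_STAGING_KEY" \
  -X POST -d '{"ttl_seconds": 43200, "scope": "safekeeperdata"}' 2>/dev/null | jq --raw-output '.jwt')

# 2. build the inventory, check connectivity, and collect the dumps
rm -f hosts && echo '[safekeeper]' >> hosts
tsh ls | awk '{print $1}' | grep safekeeper | grep "neon.build" | grep us-east-2 >> hosts
ansible all -m ping -v
mkdir -p result && rm -f result/*
ansible-playbook -e "auth_token=${AUTH_TOKEN}" remote.yaml

# 3. load the dumps into postgres (connection string and table name are examples)
DB_CONNSTR="postgres://user:password@localhost:5432/dumps" ./upload.sh staging_dec20
```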
diff --git a/scripts/sk_collect_dumps/remote.yaml b/scripts/sk_collect_dumps/remote.yaml
index 29ce83efde..f214d0ae2c 100644
--- a/scripts/sk_collect_dumps/remote.yaml
+++ b/scripts/sk_collect_dumps/remote.yaml
@@ -1,18 +1,37 @@
 - name: Fetch state dumps from safekeepers
-  hosts: safekeepers
+  hosts: safekeeper
   gather_facts: False
-  remote_user: "{{ remote_user }}"
   tasks:
-    - name: Download file
+    - name: Dump file
       get_url:
         url: "http://{{ inventory_hostname }}:7676/v1/debug_dump?dump_all=true&dump_disk_content=false"
-        dest: "/tmp/{{ inventory_hostname }}.json"
+        dest: "/tmp/{{ inventory_hostname }}-dump.json"
+        headers:
+          Authorization: "Bearer {{ auth_token }}"
 
-    - name: Fetch file from remote hosts
+    - name: Install rsync
+      ansible.builtin.apt:
+        name: rsync
+        update_cache: yes
+      become: yes
+      ignore_errors: true # it may already be installed, and we don't always have sudo
+
+    - name: Fetch file from remote hosts (works only with ssm)
       fetch:
-        src: "/tmp/{{ inventory_hostname }}.json"
-        dest: "./result/{{ inventory_hostname }}.json"
+        src: "/tmp/{{ inventory_hostname }}-dump.json"
+        dest: "./result/{{ inventory_hostname }}-dump.json"
         flat: yes
         fail_on_missing: no
+      when: ansible_connection == "aws_ssm"
+
+    # xxx not sure how to make ansible 'synchronize' work with tsh
+    - name: Fetch file from remote hosts (tsh)
+      shell: rsync -e 'tsh ssh' -azvP "developer@{{ inventory_hostname }}:/tmp/{{ inventory_hostname }}-dump.json" "./result/{{ inventory_hostname }}-dump.json"
+      delegate_to: localhost
+      when: ansible_connection != "aws_ssm"
+
+    - name: Remove remote dumps
+      ansible.builtin.file:
+        path: "/tmp/{{ inventory_hostname }}-dump.json"
+        state: absent
diff --git a/scripts/sk_collect_dumps/ssh.cfg b/scripts/sk_collect_dumps/ssh.cfg
new file mode 100644
index 0000000000..827c5d9286
--- /dev/null
+++ b/scripts/sk_collect_dumps/ssh.cfg
@@ -0,0 +1,13 @@
+# Begin generated Teleport configuration for teleport.aws.neon.tech by tsh
+
+# Common flags for all teleport.aws.neon.tech hosts
+Host *
+    HostKeyAlgorithms rsa-sha2-512-cert-v01@openssh.com,rsa-sha2-256-cert-v01@openssh.com,ssh-rsa-cert-v01@openssh.com
+
+# Flags for all teleport.aws.neon.tech hosts except the proxy
+Host * !teleport.aws.neon.tech
+    Port 3022
+    ProxyCommand "/usr/local/bin/tsh" proxy ssh --cluster=teleport.aws.neon.tech --proxy=teleport.aws.neon.tech:443 %r@%h:%p
+    User developer
+
+# End generated Teleport configuration
\ No newline at end of file
diff --git a/scripts/sk_collect_dumps/upload.sh b/scripts/sk_collect_dumps/upload.sh
index 2e54ecba1c..5189883fcb 100755
--- a/scripts/sk_collect_dumps/upload.sh
+++ b/scripts/sk_collect_dumps/upload.sh
@@ -31,22 +31,22 @@ SELECT
     (data->>'tenant_id') AS tenant_id,
     (data->>'timeline_id') AS timeline_id,
     (data->'memory'->>'active')::bool AS active,
-    (data->'memory'->>'flush_lsn')::bigint AS flush_lsn,
-    (data->'memory'->'mem_state'->>'backup_lsn')::bigint AS backup_lsn,
-    (data->'memory'->'mem_state'->>'commit_lsn')::bigint AS commit_lsn,
-    (data->'memory'->'mem_state'->>'peer_horizon_lsn')::bigint AS peer_horizon_lsn,
-    (data->'memory'->'mem_state'->>'remote_consistent_lsn')::bigint AS remote_consistent_lsn,
-    (data->'memory'->>'write_lsn')::bigint AS write_lsn,
+    (data->'memory'->>'flush_lsn')::pg_lsn AS flush_lsn,
+    (data->'memory'->'mem_state'->>'backup_lsn')::pg_lsn AS backup_lsn,
+    (data->'memory'->'mem_state'->>'commit_lsn')::pg_lsn AS commit_lsn,
+    (data->'memory'->'mem_state'->>'peer_horizon_lsn')::pg_lsn AS peer_horizon_lsn,
+    (data->'memory'->'mem_state'->>'remote_consistent_lsn')::pg_lsn AS remote_consistent_lsn,
+    (data->'memory'->>'write_lsn')::pg_lsn AS write_lsn,
     (data->'memory'->>'num_computes')::bigint AS num_computes,
-    (data->'memory'->>'epoch_start_lsn')::bigint AS epoch_start_lsn,
+    (data->'memory'->>'epoch_start_lsn')::pg_lsn AS epoch_start_lsn,
     (data->'memory'->>'last_removed_segno')::bigint AS last_removed_segno,
     (data->'memory'->>'is_cancelled')::bool AS is_cancelled,
-    (data->'control_file'->>'backup_lsn')::bigint AS disk_backup_lsn,
-    (data->'control_file'->>'commit_lsn')::bigint AS disk_commit_lsn,
+    (data->'control_file'->>'backup_lsn')::pg_lsn AS disk_backup_lsn,
+    (data->'control_file'->>'commit_lsn')::pg_lsn AS disk_commit_lsn,
     (data->'control_file'->'acceptor_state'->>'term')::bigint AS disk_term,
-    (data->'control_file'->>'local_start_lsn')::bigint AS local_start_lsn,
-    (data->'control_file'->>'peer_horizon_lsn')::bigint AS disk_peer_horizon_lsn,
-    (data->'control_file'->>'timeline_start_lsn')::bigint AS timeline_start_lsn,
-    (data->'control_file'->>'remote_consistent_lsn')::bigint AS disk_remote_consistent_lsn
+    (data->'control_file'->>'local_start_lsn')::pg_lsn AS local_start_lsn,
+    (data->'control_file'->>'peer_horizon_lsn')::pg_lsn AS disk_peer_horizon_lsn,
+    (data->'control_file'->>'timeline_start_lsn')::pg_lsn AS timeline_start_lsn,
+    (data->'control_file'->>'remote_consistent_lsn')::pg_lsn AS disk_remote_consistent_lsn
 FROM tmp_json
 EOF
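
The bigint-to-pg_lsn switch in upload.sh keeps the LSN columns in their native type, so they can be ordered and subtracted directly (a pg_lsn difference is a byte count). A minimal sketch of what that enables, reusing the readme's example table name `prod_feb30` and the `DB_CONNSTR` variable that upload.sh already expects:

```
# rank timelines by how far remote storage lags behind the commit LSN
psql "$DB_CONNSTR" -c "
  SELECT tenant_id, timeline_id,
         commit_lsn - remote_consistent_lsn AS s3_lag_bytes
  FROM prod_feb30
  ORDER BY s3_lag_bytes DESC
  LIMIT 10;"
```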
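As a quick sanity check of the Teleport path that ansible.cfg and ssh.cfg wire up, it can help to try a single host by hand before running the playbook. A sketch; `<safekeeper-host>` is a placeholder for any entry from the generated `./hosts` inventory:

```
# log in to the Teleport proxy named in ssh.cfg
tsh login --proxy=teleport.aws.neon.tech:443

# <safekeeper-host> is a placeholder: substitute a host from ./hosts
ssh -F ./ssh.cfg <safekeeper-host> hostname
```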