diff --git a/.github/actions/setup-greptimedb-cluster/action.yml b/.github/actions/setup-greptimedb-cluster/action.yml
index d2f003073e..6e0bd2f09b 100644
--- a/.github/actions/setup-greptimedb-cluster/action.yml
+++ b/.github/actions/setup-greptimedb-cluster/action.yml
@@ -24,6 +24,8 @@ inputs:
     description: "Etcd endpoints"
   values-filename:
     default: "with-minio.yaml"
+  base-values-filename:
+    default: "with-observability.yaml"
   enable-region-failover:
     default: false
 
@@ -49,6 +51,9 @@ runs:
   - name: Install GreptimeDB cluster
     shell: bash
     run: |
+      BASE_VALUES="./.github/actions/setup-greptimedb-cluster/${{ inputs.base-values-filename }}"
+      SCENARIO_VALUES="./.github/actions/setup-greptimedb-cluster/${{ inputs.values-filename }}"
+
       helm upgrade \
         --install my-greptimedb \
         --set 'meta.backendStorage.etcd.endpoints[0]=${{ inputs.etcd-endpoints }}' \
@@ -66,7 +71,8 @@ runs:
         greptime/greptimedb-cluster \
         --create-namespace \
         -n my-greptimedb \
-        --values ./.github/actions/setup-greptimedb-cluster/${{ inputs.values-filename }} \
+        --values "${BASE_VALUES}" \
+        --values "${SCENARIO_VALUES}" \
         --wait \
         --wait-for-jobs
   - name: Wait for GreptimeDB
diff --git a/.github/actions/setup-greptimedb-cluster/with-observability.yaml b/.github/actions/setup-greptimedb-cluster/with-observability.yaml
new file mode 100644
index 0000000000..581f60dea6
--- /dev/null
+++ b/.github/actions/setup-greptimedb-cluster/with-observability.yaml
@@ -0,0 +1,31 @@
+# Base Helm values for CI clusters. By default the setup-greptimedb-cluster
+# action passes this file before the scenario values file.
+tracing:
+  enabled: true
+  # OTLP/HTTP traces endpoint on the monitor standalone service.
+  endpoint: "http://my-greptimedb-monitor-standalone.my-greptimedb:4000/v1/otlp/v1/traces"
+  sampleRatio: "1.0"
+
+monitoring:
+  enabled: true
+  standalone:
+    base:
+      main:
+        resources:
+          requests:
+            cpu: 50m
+            memory: 128Mi
+          limits:
+            cpu: 500m
+            memory: 256Mi
+    datanodeStorage:
+      fs:
+        storageSize: 5Gi
+  vector:
+    resources:
+      requests:
+        cpu: "50m"
+        memory: "64Mi"
+      limits:
+        cpu: "500m"
+        memory: "256Mi"
diff --git a/.github/scripts/collect-fuzz-monitor-artifacts.sh b/.github/scripts/collect-fuzz-monitor-artifacts.sh
new file mode 100755
index 0000000000..77f76537a3
--- /dev/null
+++ b/.github/scripts/collect-fuzz-monitor-artifacts.sh
@@ -0,0 +1,169 @@
+#!/usr/bin/env bash
+#
+# Collect logs and traces from the monitor standalone of a fuzz-test cluster.
+# For each monitor table this saves the SHOW CREATE TABLE output and a parquet
+# dump under GT_MONITOR_ARTIFACT_DIR.
+#
+# Tables are collected independently: a failure on one table is logged and the
+# remaining tables are still attempted. The script exits non-zero if any table
+# could not be collected.
+#
+# Environment overrides (defaults are set below):
+#   GT_FUZZ_NS                    namespace of the cluster
+#   GT_FUZZ_CLUSTER               cluster name; prefix of the monitor service/pod
+#   GT_MONITOR_HTTP_LOCAL_PORT    local port forwarded to port 4000 of the service
+#   GT_MONITOR_ARTIFACT_DIR       local directory that receives the artifacts
+#   GT_MONITOR_SERVER_EXPORT_DIR  directory inside the monitor pod for COPY output
+
+set -euo pipefail
+
+GT_FUZZ_NS="${GT_FUZZ_NS:-my-greptimedb}"
+GT_FUZZ_CLUSTER="${GT_FUZZ_CLUSTER:-my-greptimedb}"
+GT_MONITOR_HTTP_LOCAL_PORT="${GT_MONITOR_HTTP_LOCAL_PORT:-14000}"
+GT_MONITOR_ARTIFACT_DIR="${GT_MONITOR_ARTIFACT_DIR:-/tmp/fuzz-monitor-dumps}"
+GT_MONITOR_SERVER_EXPORT_DIR="${GT_MONITOR_SERVER_EXPORT_DIR:-/tmp/gt-monitor-dump}"
+
+MONITOR_SERVICE="${GT_FUZZ_CLUSTER}-monitor-standalone"
+MONITOR_POD="${GT_FUZZ_CLUSTER}-monitor-standalone-0"
+
+PORT_FORWARD_LOG="${GT_MONITOR_ARTIFACT_DIR}/port-forward.log"
+SQL_LOG="${GT_MONITOR_ARTIFACT_DIR}/sql.log"
+COPY_LOG="${GT_MONITOR_ARTIFACT_DIR}/copy.log"
+STATE_LOG="${GT_MONITOR_ARTIFACT_DIR}/state.log"
+
+# Print a tagged message to stdout and append it to STATE_LOG.
+log() {
+  printf '[collect-fuzz-monitor-artifacts] %s\n' "$*" | tee -a "${STATE_LOG}"
+}
+
+# EXIT trap: stop the background port-forward if one was started.
+cleanup() {
+  if [ -n "${PORT_FORWARD_PID:-}" ]; then
+    kill "${PORT_FORWARD_PID}" >/dev/null 2>&1 || true
+  fi
+}
+
+# Run one SQL statement through the monitor HTTP API.
+#   $1: database name, $2: SQL text.
+# The raw response is appended to SQL_LOG. Returns 1 when the request itself
+# fails or the response contains an "error" field.
+exec_sql() {
+  local db="$1"
+  local sql="$2"
+  local output
+
+  log "execute sql on db=${db}: ${sql}"
+
+  # Check curl explicitly: errexit is suspended while this function runs as
+  # part of a condition, so a transport failure must not pass as success.
+  if ! output="$(curl -sS -G "http://127.0.0.1:${GT_MONITOR_HTTP_LOCAL_PORT}/v1/sql" \
+    --data-urlencode "db=${db}" \
+    --data-urlencode "sql=${sql}")"; then
+    log "sql request failed on db=${db}"
+    return 1
+  fi
+
+  printf '%s\n' "${output}" >>"${SQL_LOG}"
+
+  if printf '%s' "${output}" | grep -q '"error"'; then
+    log "sql failed: ${output}"
+    return 1
+  fi
+}
+
+# Save the DDL of one table to <artifact dir>/<table>.show_create_table.sql.
+#   $1: table name in the "public" database.
+# Returns non-zero when the request fails, the response contains an "error"
+# field, or the response lacks the JSON fields read below.
+export_show_create_table() {
+  local table="$1"
+  local output_file="${GT_MONITOR_ARTIFACT_DIR}/${table}.show_create_table.sql"
+  local output
+
+  log "export SHOW CREATE TABLE for ${table}"
+  if ! output="$(curl -sS -G "http://127.0.0.1:${GT_MONITOR_HTTP_LOCAL_PORT}/v1/sql" \
+    --data-urlencode "db=public" \
+    --data-urlencode "sql=SHOW CREATE TABLE ${table};")"; then
+    log "show create table request failed for ${table}"
+    return 1
+  fi
+
+  printf '%s\n' "${output}" >>"${SQL_LOG}"
+
+  if printf '%s' "${output}" | grep -q '"error"'; then
+    log "show create table failed for ${table}: ${output}"
+    return 1
+  fi
+
+  # The DDL is the second column of the first row of the first "output" entry.
+  printf '%s' "${output}" | python3 -c 'import json,sys; d=json.load(sys.stdin); print(d["output"][0]["records"]["rows"][0][1])' >"${output_file}"
+}
+
+# Collect everything for one table: DDL, server-side parquet export, and the
+# copy of the parquet file out of the pod.
+#   $1: table name. Returns non-zero as soon as one stage fails.
+collect_table() {
+  local table="$1"
+  local remote_file="${GT_MONITOR_SERVER_EXPORT_DIR}/${table}.parquet"
+
+  # Explicit "|| return 1": errexit does not apply inside a function that is
+  # called as an "if" condition.
+  export_show_create_table "${table}" || return 1
+  exec_sql "public" "COPY ${table} TO '${remote_file}' WITH (FORMAT='parquet');" || return 1
+
+  log "copy ${table}.parquet from pod"
+  kubectl cp "${GT_FUZZ_NS}/${MONITOR_POD}:${remote_file}" "${GT_MONITOR_ARTIFACT_DIR}/${table}.parquet" >>"${COPY_LOG}" 2>&1
+}
+
+mkdir -p "${GT_MONITOR_ARTIFACT_DIR}"
+# ":?" aborts on an empty variable so this can never expand to "/*".
+rm -rf "${GT_MONITOR_ARTIFACT_DIR:?}/"*
+
+{
+  echo "namespace=${GT_FUZZ_NS}"
+  echo "cluster=${GT_FUZZ_CLUSTER}"
+  echo "service=${MONITOR_SERVICE}"
+  echo "pod=${MONITOR_POD}"
+  echo "http_port=${GT_MONITOR_HTTP_LOCAL_PORT}"
+} >"${STATE_LOG}"
+
+log "start port-forward service/${MONITOR_SERVICE} in namespace ${GT_FUZZ_NS}"
+kubectl port-forward "service/${MONITOR_SERVICE}" "${GT_MONITOR_HTTP_LOCAL_PORT}:4000" -n "${GT_FUZZ_NS}" >"${PORT_FORWARD_LOG}" 2>&1 &
+PORT_FORWARD_PID="$!"
+trap cleanup EXIT
+log "port-forward pid=${PORT_FORWARD_PID}"
+
+log "wait for port-forward bootstrap"
+for i in {1..30}; do
+  if curl -s --fail "http://127.0.0.1:${GT_MONITOR_HTTP_LOCAL_PORT}/health" &> /dev/null; then
+    log "port-forward is ready"
+    break
+  fi
+  if [ "$i" -eq 30 ]; then
+    log "Timed out waiting for port-forward to be ready."
+    exit 1
+  fi
+  sleep 1
+done
+
+log "ensure export dir exists in pod ${MONITOR_POD}"
+kubectl exec -n "${GT_FUZZ_NS}" "${MONITOR_POD}" -- mkdir -p "${GT_MONITOR_SERVER_EXPORT_DIR}" >>"${COPY_LOG}" 2>&1
+
+# Collect each table on its own so that one missing or failing table does not
+# prevent the other from being dumped.
+failures=0
+for table in "_gt_logs" "opentelemetry_traces"; do
+  if ! collect_table "${table}"; then
+    log "failed to collect ${table}"
+    failures=$((failures + 1))
+  fi
+done
+
+ls -la "${GT_MONITOR_ARTIFACT_DIR}" >>"${STATE_LOG}" 2>&1
+
+if [ "${failures}" -gt 0 ]; then
+  log "artifacts partially collected under ${GT_MONITOR_ARTIFACT_DIR} (${failures} table(s) failed)"
+  exit 1
+fi
+
+log "artifacts collected under ${GT_MONITOR_ARTIFACT_DIR}"
diff --git a/.github/workflows/develop.yml b/.github/workflows/develop.yml
index 8b39acd99b..c258eae3b2 100644
--- a/.github/workflows/develop.yml
+++ b/.github/workflows/develop.yml
@@ -430,6 +430,24 @@ jobs:
           name: fuzz-tests-kind-logs-${{ matrix.mode.name }}-${{ matrix.target }}
           path: /tmp/kind
           retention-days: 3
+      - name: Collect monitor dumps
+        if: failure()
+        shell: bash
+        env:
+          GT_FUZZ_NS: my-greptimedb
+          GT_FUZZ_CLUSTER: my-greptimedb
+          GT_MONITOR_HTTP_LOCAL_PORT: 14000
+          GT_MONITOR_ARTIFACT_DIR: /tmp/fuzz-monitor-dumps
+        run: |
+          bash .github/scripts/collect-fuzz-monitor-artifacts.sh
+      - name: Upload monitor dumps
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: fuzz-tests-monitor-dumps-${{ matrix.mode.name }}-${{ matrix.target }}
+          path: /tmp/fuzz-monitor-dumps
+          if-no-files-found: warn
+          retention-days: 3
       - name: Delete cluster
         if: success()
         shell: bash
@@ -584,6 +602,24 @@ jobs:
           name: fuzz-tests-kind-logs-${{ matrix.mode.name }}-${{ matrix.target }}
           path: /tmp/kind
           retention-days: 3
+      - name: Collect monitor dumps
+        if: failure()
+        shell: bash
+        env:
+          GT_FUZZ_NS: my-greptimedb
+          GT_FUZZ_CLUSTER: my-greptimedb
+          GT_MONITOR_HTTP_LOCAL_PORT: 14000
+          GT_MONITOR_ARTIFACT_DIR: /tmp/fuzz-monitor-dumps
+        run: |
+          bash .github/scripts/collect-fuzz-monitor-artifacts.sh
+      - name: Upload monitor dumps
+        if: failure()
+        uses: actions/upload-artifact@v4
+        with:
+          name: fuzz-tests-monitor-dumps-${{ matrix.mode.name }}-${{ matrix.target }}
+          path: /tmp/fuzz-monitor-dumps
+          if-no-files-found: warn
+          retention-days: 3
       - name: Delete cluster
         if: success()
         shell: bash