diff --git a/scripts/server-health-report.sh b/scripts/server-health-report.sh index df674d3..c00981d 100755 --- a/scripts/server-health-report.sh +++ b/scripts/server-health-report.sh @@ -1,23 +1,43 @@ #!/usr/bin/env bash set -Eeuo pipefail +# --- 配置区 --- CHAT_ID="-1003834524994" TOPIC_ID="4" BOT_TOKEN="${OPENCLAW_TELEGRAM_BOT_TOKEN:-}" -MAX_LEN=3800 +TOP_N="${TOP_N:-5}" +# -------------- have() { command -v "$1" >/dev/null 2>&1; } -human_uptime() { - uptime -p 2>/dev/null || true +escape_html() { + sed -e 's/&/\&/g' -e 's//\>/g' } -load_avg() { - awk '{print $1, $2, $3}' /proc/loadavg 2>/dev/null || echo "N/A" +trim_text() { + local max="$1" + local text="$2" + if (( ${#text} > max )); then + printf '%s…' "${text:0:max-1}" + else + printf '%s' "$text" + fi } -cpu_usage_percent() { - local a b idle_a total_a idle_b total_b usage +draw_bar() { + local percent="$1" + local width=10 + local filled=$(( percent * width / 100 )) + (( filled > width )) && filled=$width + local empty=$(( width - filled )) + local bar="" + for ((i=0; i/dev/null | paste -sd ', ' - | sed 's/^/核心: /' || echo "核心: N/A" +get_mem_info() { + free -m | awk '/^Mem:/ {printf "%d %d %d\n", $3, $2, ($2>0 ? $3*100/$2 : 0)}' } -mem_summary() { - free -m 2>/dev/null | awk ' - /^Mem:/ {printf "内存: %d/%d MB (%.0f%%)\n", $3, $2, ($3*100/$2)} - /^Swap:/ {if ($2==0) printf "Swap: 0/0 MB (未启用)\n"; else printf "Swap: %d/%d MB (%.0f%%)\n", $3, $2, ($3*100/$2)} - ' +get_swap_info() { + free -m | awk '/^Swap:/ {printf "%d %d %d\n", $3, $2, ($2>0 ? $3*100/$2 : 0)}' } -disk_summary() { - df -h / 2>/dev/null | awk 'NR==2 {printf "磁盘 /: %s/%s (%s), 可用 %s\n", $3, $2, $5, $4}' - df -ih / 2>/dev/null | awk 'NR==2 {printf "Inode /: %s/%s (%s), 可用 %s\n", $3, $2, $5, $4}' +get_root_disk_info() { + df -h / | awk 'NR==2 {gsub(/%/,"",$5); printf "%s %s %s\n", $3, $2, $5}' } -hostname_line() { - printf "主机: %s\n" "$(hostname 2>/dev/null || echo unknown)" +get_root_inode_info() { + df -i / | awk 'NR==2 {gsub(/%/,"",$5); printf "%s %s %s\n", $3, $2, $5}' } -date_line() { - printf "时间: %s\n" "$(date '+%F %T %Z' 2>/dev/null || echo unknown)" +get_loadavg() { + awk '{print $1" / "$2" / "$3}' /proc/loadavg } -top_cpu() { - ps -eo pid,user,%cpu,%mem,comm --sort=-%cpu 2>/dev/null | head -n 11 | tail -n +2 | awk '{printf "- PID=%s USER=%s CPU=%s%% MEM=%s%% CMD=%s\n", $1,$2,$3,$4,$5}' +get_uptime() { + uptime -p | sed 's/^up //' } -top_mem() { - ps -eo pid,user,%cpu,%mem,comm --sort=-%mem 2>/dev/null | head -n 11 | tail -n +2 | awk '{printf "- PID=%s USER=%s CPU=%s%% MEM=%s%% CMD=%s\n", $1,$2,$3,$4,$5}' -} - -temps_block() { - local out="" +get_temp() { if have sensors; then - out=$(sensors 2>/dev/null | sed '/^$/d' | head -n 20 || true) - fi - if [[ -z "$out" ]]; then - out=$(for z in /sys/class/thermal/thermal_zone*; do - [[ -r "$z/temp" ]] || continue - t=$(cat "$z/type" 2>/dev/null || echo "unknown") - v=$(cat "$z/temp" 2>/dev/null || true) - [[ -n "$v" ]] || continue - awk -v type="$t" -v raw="$v" 'BEGIN { printf "- %s: %.1f°C\n", type, raw/1000 }' - done 2>/dev/null | head -n 10) - fi - if [[ -n "$out" ]]; then - printf "%s\n" "$out" + sensors 2>/dev/null | awk '/Package id 0:|Tctl:|Tdie:|Composite:/ {gsub(/^\+/,"",$2); print $1" "$2; count++; if (count>=3) exit}' | paste -sd '; ' - + elif [[ -r /sys/class/thermal/thermal_zone0/temp ]]; then + awk '{printf "thermal_zone0 %.1f°C", $1/1000}' /sys/class/thermal/thermal_zone0/temp else - echo "- 未获取到温度/传感器数据" + printf 'N/A' fi } -fans_block() { - local found=0 - for f in /sys/class/hwmon/hwmon*; do - [[ -d "$f" ]] || continue - local name - name=$(cat "$f/name" 2>/dev/null || echo "hwmon") - for i in "$f"/fan*_input; do - [[ -r "$i" ]] || continue - found=1 - printf -- "- %s %s=%s RPM\n" "$name" "$(basename "$i" .*)" "$(cat "$i" 2>/dev/null)" - done +get_top_partitions() { + df -hP -x tmpfs -x devtmpfs | awk 'NR==1 {next} {gsub(/%/,"",$5); print $5"\t"$6"\t"$3"/"$2}' | sort -rn | head -n 5 | while IFS=$'\t' read -r usep mount usedtotal; do + printf '%s%% %s (%s)\n' "$usep" "$mount" "$usedtotal" done - [[ $found -eq 1 ]] || echo "- 未获取到风扇转速数据" } -risk_block() { - local cpu mem load suspicious="无明显异常" - cpu=$(cpu_usage_percent) - mem=$(free -m 2>/dev/null | awk '/^Mem:/ {printf "%.0f", ($3*100/$2)}' || echo 0) - load=$(awk '{print $1}' /proc/loadavg 2>/dev/null || echo 0) +get_top_cpu_processes() { + ps -eo pid,user,%cpu,%mem,etime,args --sort=-%cpu | awk 'NR>1 && NR<='"$((5+1))"' { + cmd=""; + for (i=6; i<=NF; i++) cmd=cmd $i (i=85)}'; then suspicious="CPU 占用偏高,需关注高占用进程"; fi - if awk -v v="$mem" 'BEGIN{exit !(v>=85)}'; then suspicious="内存占用偏高,需关注内存泄漏或异常进程"; fi +get_top_mem_processes() { + ps -eo pid,user,%cpu,%mem,etime,args --sort=-%mem | awk 'NR>1 && NR<='"$((5+1))"' { + cmd=""; + for (i=6; i<=NF; i++) cmd=cmd $i (i= 85 )) && alerts+=("CPU ${cpu_p}% 偏高") + (( mem_p >= 85 )) && alerts+=("内存 ${mem_p}% 偏高") + (( disk_p >= 90 )) && alerts+=("根分区 ${disk_p}% 偏高") + (( swap_p >= 30 )) && alerts+=("Swap ${swap_p}% 已使用") + awk -v l1="$load1" -v c="$cores" 'BEGIN {exit !(l1>c)}' && alerts+=("1 分钟负载 ${load1} 高于核心数 ${cores}") + + if (( ${#alerts[@]} == 0 )); then + printf '✅ 未见明显异常' + else + printf '⚠️ %s' "$(IFS=';'; echo "${alerts[*]}")" + fi } build_report() { + local cpu_p mem_used mem_total mem_p swap_used swap_total swap_p + local disk_used disk_total disk_p inode_used inode_total inode_p + local host up load temp status alert_summary cores load1 + local top_cpu top_mem top_parts + + cpu_p=$(get_cpu_usage) + read -r mem_used mem_total mem_p < <(get_mem_info) + read -r swap_used swap_total swap_p < <(get_swap_info) + read -r disk_used disk_total disk_p < <(get_root_disk_info) + read -r inode_used inode_total inode_p < <(get_root_inode_info) + load=$(get_loadavg) + load1=$(awk '{print $1}' /proc/loadavg) + up=$(get_uptime) + host=$(hostname) + temp=$(get_temp) + cores=$(nproc) + top_cpu=$(format_proc_lines "$(get_top_cpu_processes)") + top_mem=$(format_proc_lines "$(get_top_mem_processes)") + top_parts=$(get_top_partitions) + alert_summary=$(build_alerts "$cpu_p" "$mem_p" "$disk_p" "$swap_p" "$load1" "$cores") + + if [[ "$alert_summary" == ✅* ]]; then + status='🟢 正常' + elif (( cpu_p >= 95 || mem_p >= 95 || disk_p >= 95 )); then + status='🔴 告警' + else + status='🟡 关注' + fi + + temp=$(printf '%s' "$temp" | escape_html) + top_cpu=$(printf '%s' "$top_cpu" | escape_html) + top_mem=$(printf '%s' "$top_mem" | escape_html) + top_parts=$(printf '%s' "$top_parts" | escape_html) + alert_summary=$(printf '%s' "$alert_summary" | escape_html) + cat <🖥 服务器巡检 · ${host} +状态:${status} +${alert_summary} +━━━━━━━━━━━━━━━━━━ +核心指标 +CPU $(draw_bar "$cpu_p") ${cpu_p}% +RAM $(draw_bar "$mem_p") ${mem_p}% +DSK $(draw_bar "$disk_p") ${disk_p}% + +概览 +• 运行时间:${up} +• 负载(1/5/15):${load} +• 内存:${mem_used}MB / ${mem_total}MB +• Swap:${swap_used}MB / ${swap_total}MB (${swap_p}%) +• 根分区:${disk_used} / ${disk_total} +• Inode:${inode_used} / ${inode_total} (${inode_p}%) +• 温度:${temp} + +最忙分区 Top 5 +
${top_parts}
+CPU Top ${TOP_N} +
PID    USER      CPU   MEM  ELAPSED    CMD
+${top_cpu}
+MEM Top ${TOP_N} +
PID    USER      CPU   MEM  ELAPSED    CMD
+${top_mem}
+🕒 $(date '+%Y-%m-%d %H:%M:%S') EOF } send_telegram() { local text="$1" - [[ -n "$BOT_TOKEN" ]] || { echo "OPENCLAW_TELEGRAM_BOT_TOKEN 未设置" >&2; return 1; } + if [[ -z "$BOT_TOKEN" ]]; then + echo "Error: BOT_TOKEN is empty." >&2 + return 1 + fi + curl -fsS -X POST "https://api.telegram.org/bot${BOT_TOKEN}/sendMessage" \ -d "chat_id=${CHAT_ID}" \ -d "message_thread_id=${TOPIC_ID}" \ - --data-urlencode "text=${text}" \ - -d "disable_web_page_preview=true" + -d "parse_mode=HTML" \ + --data-urlencode "text=${text}" >/dev/null } -main() { - local report - report=$(build_report) - printf '%s\n' "$report" > /home/sinlee/.openclaw/workspace/tmp/server-health-latest.txt - if ((${#report} > MAX_LEN)); then - report="${report:0:MAX_LEN}\n...\n(内容过长,已截断;完整内容见本机 tmp/server-health-latest.txt)" - fi - send_telegram "$report" -} - -main "$@" +report=$(build_report) +send_telegram "$report" +printf '%s\n' "$report"