diff --git a/scripts/server-health-report.sh b/scripts/server-health-report.sh old mode 100755 new mode 100644 index c00981d..b5257fb --- a/scripts/server-health-report.sh +++ b/scripts/server-health-report.sh @@ -5,213 +5,113 @@ set -Eeuo pipefail CHAT_ID="-1003834524994" TOPIC_ID="4" BOT_TOKEN="${OPENCLAW_TELEGRAM_BOT_TOKEN:-}" -TOP_N="${TOP_N:-5}" +TOP_N=5 # -------------- have() { command -v "$1" >/dev/null 2>&1; } -escape_html() { - sed -e 's/&/\&/g' -e 's//\>/g' -} - -trim_text() { - local max="$1" - local text="$2" - if (( ${#text} > max )); then - printf '%s…' "${text:0:max-1}" - else - printf '%s' "$text" - fi -} - draw_bar() { - local percent="$1" + local percent=$1 local width=10 local filled=$(( percent * width / 100 )) - (( filled > width )) && filled=$width + [[ $filled -gt $width ]] && filled=$width local empty=$(( width - filled )) local bar="" for ((i=0; i0 ? $3*100/$2 : 0)}' +get_mem_data() { + free -m | awk '/^Mem:/ {printf "%d %d %d", $3, $2, $3*100/$2} /^Swap:/ {printf " %d %d %d", $3, $2, ($2>0?$3*100/$2:0)}' } -get_swap_info() { - free -m | awk '/^Swap:/ {printf "%d %d %d\n", $3, $2, ($2>0 ? $3*100/$2 : 0)}' +get_root_disk() { + df -h / | awk 'NR==2 {print $3, $2, $5}' | tr -d '%' } -get_root_disk_info() { - df -h / | awk 'NR==2 {gsub(/%/,"",$5); printf "%s %s %s\n", $3, $2, $5}' -} - -get_root_inode_info() { - df -i / | awk 'NR==2 {gsub(/%/,"",$5); printf "%s %s %s\n", $3, $2, $5}' -} - -get_loadavg() { - awk '{print $1" / "$2" / "$3}' /proc/loadavg -} - -get_uptime() { - uptime -p | sed 's/^up //' -} - -get_temp() { - if have sensors; then - sensors 2>/dev/null | awk '/Package id 0:|Tctl:|Tdie:|Composite:/ {gsub(/^\+/,"",$2); print $1" "$2; count++; if (count>=3) exit}' | paste -sd '; ' - - elif [[ -r /sys/class/thermal/thermal_zone0/temp ]]; then - awk '{printf "thermal_zone0 %.1f°C", $1/1000}' /sys/class/thermal/thermal_zone0/temp - else - printf 'N/A' - fi +get_inode() { + df -i / | awk 'NR==2 {printf "%s / %s (%s)", $3, $2, $5}' } get_top_partitions() { - df -hP -x tmpfs -x devtmpfs | awk 'NR==1 {next} {gsub(/%/,"",$5); print $5"\t"$6"\t"$3"/"$2}' | sort -rn | head -n 5 | while IFS=$'\t' read -r usep mount usedtotal; do - printf '%s%% %s (%s)\n' "$usep" "$mount" "$usedtotal" - done + # 排除虚拟文件系统与无关 EFI 分区,按占用率排序取前3 + df -h | grep -vE '^tmpfs|cdrom|loop|udev' | awk 'NR>1 && $6 != "/boot/efi" && $6 != "/sys/firmware/efi/efivars" {print $5, $6, $3, $2}' | sort -rn | head -n 3 | \ + awk '{printf "%-4s %-12s (%s/%s)\n", $1, $2, $3, $4}' } -get_top_cpu_processes() { - ps -eo pid,user,%cpu,%mem,etime,args --sort=-%cpu | awk 'NR>1 && NR<='"$((5+1))"' { - cmd=""; - for (i=6; i<=NF; i++) cmd=cmd $i (i1 && NR<='"$((5+1))"' { - cmd=""; - for (i=6; i<=NF; i++) cmd=cmd $i (i= 85 )) && alerts+=("CPU ${cpu_p}% 偏高") - (( mem_p >= 85 )) && alerts+=("内存 ${mem_p}% 偏高") - (( disk_p >= 90 )) && alerts+=("根分区 ${disk_p}% 偏高") - (( swap_p >= 30 )) && alerts+=("Swap ${swap_p}% 已使用") - awk -v l1="$load1" -v c="$cores" 'BEGIN {exit !(l1>c)}' && alerts+=("1 分钟负载 ${load1} 高于核心数 ${cores}") - - if (( ${#alerts[@]} == 0 )); then - printf '✅ 未见明显异常' - else - printf '⚠️ %s' "$(IFS=';'; echo "${alerts[*]}")" - fi +get_top_procs() { + local sort_type=$1 # %cpu 或 %mem + # PID, USER, CPU, MEM, TIME, COMMAND + ps -eo pid,user,%cpu,%mem,etime,comm --sort=-"$sort_type" | head -n $((TOP_N+1)) | tail -n +2 | \ + awk '{printf "%-6s %-4s %-4s %s\n", $1, $3"%", $4"%", $6}' } +# --- 报告生成 --- build_report() { - local cpu_p mem_used mem_total mem_p swap_used swap_total swap_p - local disk_used disk_total disk_p inode_used inode_total inode_p - local host up load temp status alert_summary cores load1 - local top_cpu top_mem top_parts - - cpu_p=$(get_cpu_usage) - read -r mem_used mem_total mem_p < <(get_mem_info) - read -r swap_used swap_total swap_p < <(get_swap_info) - read -r disk_used disk_total disk_p < <(get_root_disk_info) - read -r inode_used inode_total inode_p < <(get_root_inode_info) - load=$(get_loadavg) - load1=$(awk '{print $1}' /proc/loadavg) - up=$(get_uptime) + local cpu_p m_usd m_tot m_p s_usd s_tot s_p d_usd d_tot d_p host up load temp icon ip + + # 采集 + cpu_p=$(get_cpu_p) + read -r m_usd m_tot m_p s_usd s_tot s_p < <(get_mem_data) + read -r d_usd d_tot d_p < <(get_root_disk) host=$(hostname) - temp=$(get_temp) - cores=$(nproc) - top_cpu=$(format_proc_lines "$(get_top_cpu_processes)") - top_mem=$(format_proc_lines "$(get_top_mem_processes)") - top_parts=$(get_top_partitions) - alert_summary=$(build_alerts "$cpu_p" "$mem_p" "$disk_p" "$swap_p" "$load1" "$cores") + ip=$(hostname -I | awk '{print $1}') + up=$(uptime -p | sed 's/^up //') + load=$(awk '{print $1" / "$2" / "$3}' /proc/loadavg) + temp=$( (sensors 2>/dev/null | awk '/°C/ {print $2; exit}' | tr -d '+') || (awk '{printf "%.1f°C", $1/1000}' /sys/class/thermal/thermal_zone0/temp 2>/dev/null) || echo "N/A" ) - if [[ "$alert_summary" == ✅* ]]; then - status='🟢 正常' - elif (( cpu_p >= 95 || mem_p >= 95 || disk_p >= 95 )); then - status='🔴 告警' - else - status='🟡 关注' - fi - - temp=$(printf '%s' "$temp" | escape_html) - top_cpu=$(printf '%s' "$top_cpu" | escape_html) - top_mem=$(printf '%s' "$top_mem" | escape_html) - top_parts=$(printf '%s' "$top_parts" | escape_html) - alert_summary=$(printf '%s' "$alert_summary" | escape_html) + # 状态判断 + if [ "$cpu_p" -gt 85 ] || [ "$m_p" -gt 90 ]; then icon="🔴 Critical"; status="⚠️ 资源占用过高"; + else icon="🟢 Healthy"; status="✅ 未见明显异常"; fi cat <🖥 服务器巡检 · ${host} -状态:${status} -${alert_summary} +🖥 Server: ${host} (${ip}) +状态: ${icon} +${status} ━━━━━━━━━━━━━━━━━━ -核心指标 -CPU $(draw_bar "$cpu_p") ${cpu_p}% -RAM $(draw_bar "$mem_p") ${mem_p}% -DSK $(draw_bar "$disk_p") ${disk_p}% +核心指标: +CPU $(draw_bar $cpu_p) $(printf "%3d" $cpu_p)% +RAM $(draw_bar $m_p) $(printf "%3d" $m_p)% +DSK $(draw_bar $d_p) $(printf "%3d" $d_p)% -概览 -• 运行时间:${up} -• 负载(1/5/15):${load} -• 内存:${mem_used}MB / ${mem_total}MB -• Swap:${swap_used}MB / ${swap_total}MB (${swap_p}%) -• 根分区:${disk_used} / ${disk_total} -• Inode:${inode_used} / ${inode_total} (${inode_p}%) -• 温度:${temp} +📊 详细概览: +• 运行时间: $up +• 平均负载: $load +• 内存详情: ${m_usd}MB / ${m_tot}MB +• 交换分区: ${s_usd}MB / ${s_tot}MB (${s_p}%) +• 根分区: ${d_usd} / ${d_tot} +• Inode: $(get_inode) +• 系统温度: $temp -最忙分区 Top 5 -
${top_parts}
-CPU Top ${TOP_N} -
PID    USER      CPU   MEM  ELAPSED    CMD
-${top_cpu}
-MEM Top ${TOP_N} -
PID    USER      CPU   MEM  ELAPSED    CMD
-${top_mem}
+🗄 繁忙分区 Top 5: +
$(get_top_partitions)
+🔥 CPU Top 5 (PID/CPU/MEM/CMD): +
$(get_top_procs %cpu)
+🧠 MEM Top 5 (PID/CPU/MEM/CMD): +
$(get_top_procs %mem)
🕒 $(date '+%Y-%m-%d %H:%M:%S') EOF } send_telegram() { local text="$1" - if [[ -z "$BOT_TOKEN" ]]; then - echo "Error: BOT_TOKEN is empty." >&2 - return 1 - fi - - curl -fsS -X POST "https://api.telegram.org/bot${BOT_TOKEN}/sendMessage" \ + [[ -z "$BOT_TOKEN" ]] && return 1 + curl -s -X POST "https://api.telegram.org/bot${BOT_TOKEN}/sendMessage" \ -d "chat_id=${CHAT_ID}" \ -d "message_thread_id=${TOPIC_ID}" \ -d "parse_mode=HTML" \ - --data-urlencode "text=${text}" >/dev/null + --data-urlencode "text=${text}" > /dev/null } -report=$(build_report) -send_telegram "$report" -printf '%s\n' "$report" +send_telegram "$(build_report)"