openclaw-sync/scripts/server-health-report.sh

118 lines
3.9 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Strict mode: abort on errors, propagate ERR traps into functions, reject
# unset variables, and fail a pipeline when any stage fails.
set -o errexit -o errtrace -o nounset -o pipefail

# --- Configuration ---
# Telegram destination: target chat and the message thread (topic) inside it.
CHAT_ID='-1003834524994'
TOPIC_ID='4'
# Bot token is taken from the environment; empty when the variable is unset.
BOT_TOKEN=${OPENCLAW_TELEGRAM_BOT_TOKEN:-}
# Number of entries shown in the "Top" process lists.
TOP_N=5
# ---------------------
# Succeed (status 0) when the shell can resolve the command named by $1;
# nothing is printed.
have() {
  local tool=$1
  command -v "$tool" &>/dev/null
}
#######################################
# Render a fixed-width text progress bar.
# Arguments: $1 - percentage; a fractional part is dropped and values
#                 outside 0..100 are clamped. Missing/empty counts as 0.
# Outputs:   a 10-cell bar on stdout, "■" for used and "□" for free cells.
#######################################
draw_bar() {
  local percent=${1:-0}
  local width=10
  local filled i bar=""

  # Arithmetic expansion only understands integers, so drop any ".x" suffix.
  percent=${percent%%.*}
  filled=$(( ${percent:-0} * width / 100 ))

  # Clamp both ends so the bar is always exactly $width cells wide.
  if (( filled > width )); then
    filled=$width
  elif (( filled < 0 )); then
    filled=0
  fi

  for (( i = 0; i < filled; i++ )); do bar+="■"; done
  for (( i = filled; i < width; i++ )); do bar+="□"; done
  printf '%s\n' "$bar"
}
# --- Data collection functions ---

#######################################
# Measure overall CPU utilisation from two samples of the first line of
# /proc/stat taken some time apart.
# Arguments: $1 - optional delay between the samples, passed to sleep
#                 (default: 1).
# Outputs:   busy share of the elapsed ticks as a rounded integer percentage
#            on stdout; "0" when no ticks elapsed between the samples.
#######################################
get_cpu_p() {
  local interval=${1:-1}
  # Keep every sample field local so a direct call does not clobber globals.
  local u1 n1 s1 i1 w1 irq1 sirq1 st1
  local u2 n2 s2 i2 w2 irq2 sirq2 st2
  local total idle

  read -r _ u1 n1 s1 i1 w1 irq1 sirq1 st1 _ < /proc/stat
  sleep "$interval"
  read -r _ u2 n2 s2 i2 w2 irq2 sirq2 st2 _ < /proc/stat

  total=$(( (u2 + n2 + s2 + i2 + w2 + irq2 + sirq2 + st2) \
          - (u1 + n1 + s1 + i1 + w1 + irq1 + sirq1 + st1) ))
  # The 4th and 5th counters (idle and iowait) are both treated as idle time.
  idle=$(( (i2 + w2) - (i1 + w1) ))

  awk -v t="$total" -v i="$idle" \
    'BEGIN { if (t <= 0) printf "0\n"; else printf "%.0f\n", (t - i) * 100 / t }'
}
#######################################
# Summarise RAM and swap usage from `free -m`.
# Outputs: one line on stdout with six integers:
#            mem_used mem_total mem_percent swap_used swap_total swap_percent
#          "used" is column 3 and "total" is column 2 of the Mem:/Swap: rows.
#          A missing row or a zero total yields zeros instead of an awk
#          division error or empty fields.
#######################################
get_mem_data() {
  # Force the C locale: the "Mem:"/"Swap:" row labels may be translated under
  # other locales, in which case the patterns below would never match.
  LC_ALL=C free -m | awk '
    /^Mem:/  { mem_used = $3;  mem_total = $2 }
    /^Swap:/ { swap_used = $3; swap_total = $2 }
    END {
      # Trailing newline lets callers consume the line with a successful read.
      printf "%d %d %d %d %d %d\n",
        mem_used, mem_total, (mem_total > 0 ? mem_used * 100 / mem_total : 0),
        swap_used, swap_total, (swap_total > 0 ? swap_used * 100 / swap_total : 0)
    }'
}
# Print "used size percent" for the root filesystem, taken from the second
# line of `df -h /`, with every "%" character removed.
get_root_disk() {
  df -h / \
    | awk '
        NR == 2 {
          used = $3; size = $2; pct = $5
          gsub(/%/, "", used)
          gsub(/%/, "", size)
          gsub(/%/, "", pct)
          print used, size, pct
        }'
}
# Print root filesystem inode usage as "<used> / <total> (<percent>)", read
# from the second line of `df -i /`. No trailing newline is written.
get_inode() {
  df -i / \
    | awk '
        NR == 2 {
          used = $3
          total = $2
          pct = $5
          printf "%s / %s (%s)", used, total, pct
        }'
}
#######################################
# List the fullest mounted filesystems, highest usage percentage first.
# Globals:   TOP_N (read, optional) - default number of rows.
# Arguments: $1 - optional number of rows to print; defaults to TOP_N, or 5
#                 when TOP_N is unset, so the list length matches the
#                 "Top N" sections of the report.
# Outputs:   one line per filesystem: "<use%> <mount point> (<used>/<size>)".
#######################################
get_top_partitions() {
  local limit=${1:-${TOP_N:-5}}
  # Exclude virtual filesystems and irrelevant EFI partitions, then sort by
  # usage percentage and keep the top $limit entries.
  df -h \
    | grep -vE '^tmpfs|cdrom|loop|udev' \
    | awk 'NR>1 && $6 != "/boot/efi" && $6 != "/sys/firmware/efi/efivars" {print $5, $6, $3, $2}' \
    | sort -rn \
    | head -n "$limit" \
    | awk '{printf "%-4s %-12s (%s/%s)\n", $1, $2, $3, $4}'
}
#######################################
# List the processes ranked highest by a ps sort key.
# Globals:   TOP_N (read, optional) - number of rows to print (default 5).
# Arguments: $1 - ps sort key, %cpu or %mem.
# Outputs:   one line per process: "<pid> <cpu>% <mem>% <command>".
#######################################
get_top_procs() {
  local sort_type=$1 # %cpu or %mem
  local limit=${TOP_N:-5}
  # ps columns: PID, USER, CPU, MEM, TIME, COMMAND
  ps -eo pid,user,%cpu,%mem,etime,comm --sort=-"$sort_type" \
    | awk -v n="$limit" '
        NR == 1    { next }   # skip the header row
        NR > n + 1 { next }   # keep reading so ps never receives SIGPIPE
        {
          # The command is the last column and may itself contain spaces,
          # so rebuild it from field 6 onwards instead of printing $6 only.
          cmd = $6
          for (f = 7; f <= NF; f++) cmd = cmd " " $f
          printf "%-6s %-4s %-4s %s\n", $1, $3 "%", $4 "%", cmd
        }'
}
# --- Report generation ---

# Escape the characters that Telegram's HTML parse mode treats as markup
# (&, <, >) in the text read from stdin.
_html_escape() {
  sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
}

# Print $1 as a decimal integer when it consists only of digits, otherwise 0.
_int_or_zero() {
  if [[ ${1:-} =~ ^[0-9]+$ ]]; then
    # 10# forces base 10 so a leading zero is not read as octal.
    printf '%d\n' "$(( 10#$1 ))"
  else
    printf '0\n'
  fi
}

# Print one temperature reading: the first "<number>°C" value reported by
# `sensors`, else thermal_zone0 converted from millidegrees, else "N/A".
_system_temp() {
  local reading=""
  if command -v sensors >/dev/null 2>&1; then
    # Match the value itself rather than a fixed column: labels such as
    # "Core 0:" or "Package id 0:" span a varying number of words.
    reading=$(sensors 2>/dev/null | awk '
      !found && match($0, /[+-]?[0-9]+(\.[0-9]+)?°C/) {
        value = substr($0, RSTART, RLENGTH)
        sub(/^\+/, "", value)
        print value
        found = 1
      }') || true
  fi
  if [[ -z "$reading" && -r /sys/class/thermal/thermal_zone0/temp ]]; then
    reading=$(awk 'NR == 1 {printf "%.1f°C", $1/1000}' \
      /sys/class/thermal/thermal_zone0/temp 2>/dev/null) || true
  fi
  printf '%s\n' "${reading:-N/A}"
}

#######################################
# Assemble the HTML-formatted health report and print it to stdout.
# Globals: TOP_N (read, optional) - number shown in the "Top N" headings.
# Calls:   get_cpu_p, get_mem_data, get_root_disk, get_inode,
#          get_top_partitions, get_top_procs, draw_bar.
# Outputs: the report text on stdout. The state is "Critical" when CPU usage
#          exceeds 85% or RAM usage exceeds 90%, otherwise "Healthy".
#######################################
build_report() {
  local cpu_p m_usd m_tot m_p s_usd s_tot s_p d_usd d_tot d_p
  local host up load temp icon status ip
  local top_n=${TOP_N:-5}

  # Collect. Each probe is best-effort: a missing tool or a short read must
  # not abort the whole report when errexit is active.
  cpu_p=$(get_cpu_p) || true
  read -r m_usd m_tot m_p s_usd s_tot s_p < <(get_mem_data) || true
  read -r d_usd d_tot d_p < <(get_root_disk) || true
  host=$(hostname) || host="unknown"
  host=$(printf '%s' "$host" | _html_escape)
  ip=$(hostname -I | awk '{print $1}') || true
  ip=${ip:-N/A}
  up=$(uptime -p | sed 's/^up //') || true
  up=${up:-N/A}
  load=$(awk '{print $1" / "$2" / "$3}' /proc/loadavg 2>/dev/null) || load="N/A"
  temp=$(_system_temp | _html_escape)

  # Normalise the values that feed integer comparisons and %d formatting.
  cpu_p=$(_int_or_zero "${cpu_p:-}")
  m_p=$(_int_or_zero "${m_p:-}")
  d_p=$(_int_or_zero "${d_p:-}")
  m_usd=${m_usd:-0}; m_tot=${m_tot:-0}
  s_usd=${s_usd:-0}; s_tot=${s_tot:-0}; s_p=${s_p:-0}

  # Status decision
  if (( cpu_p > 85 || m_p > 90 )); then
    icon="🔴 Critical"
    status="⚠️ 资源占用过高"
  else
    icon="🟢 Healthy"
    status="✅ 未见明显异常"
  fi

  # Command output placed inside tags is escaped so a stray <, > or & in a
  # process name or mount point cannot break the HTML message.
  cat <<EOF
<b>🖥 Server: ${host} (${ip})</b>
状态: <b>${icon}</b>
${status}
━━━━━━━━━━━━━━━━━━
<b>核心指标:</b>
<b>CPU </b> <code>$(draw_bar "$cpu_p")</code> <code>$(printf "%3d" "$cpu_p")%</code>
<b>RAM </b> <code>$(draw_bar "$m_p")</code> <code>$(printf "%3d" "$m_p")%</code>
<b>DSK </b> <code>$(draw_bar "$d_p")</code> <code>$(printf "%3d" "$d_p")%</code>
<b>📊 详细概览:</b>
• <b>运行时间:</b> <code>$up</code>
• <b>平均负载:</b> <code>$load</code>
• <b>内存详情:</b> <code>${m_usd}MB / ${m_tot}MB</code>
• <b>交换分区:</b> <code>${s_usd}MB / ${s_tot}MB (${s_p}%)</code>
• <b>根分区:</b> <code>${d_usd} / ${d_tot}</code>
• <b>Inode:</b> <code>$(get_inode | _html_escape)</code>
• <b>系统温度:</b> <code>$temp</code>
<b>🗄 繁忙分区 Top ${top_n}:</b>
<pre>$(get_top_partitions | _html_escape)</pre>
<b>🔥 CPU Top ${top_n} (PID/CPU/MEM/CMD):</b>
<pre>$(get_top_procs %cpu | _html_escape)</pre>
<b>🧠 MEM Top ${top_n} (PID/CPU/MEM/CMD):</b>
<pre>$(get_top_procs %mem | _html_escape)</pre>
🕒 <i>$(date '+%Y-%m-%d %H:%M:%S')</i>
EOF
}
#######################################
# Post a message to Telegram through the Bot API sendMessage method.
# Globals:   BOT_TOKEN (read) - bot token; required.
#            CHAT_ID   (read) - destination chat.
#            TOPIC_ID  (read, optional) - message thread; omitted when empty.
# Arguments: $1 - message text, sent with parse_mode=HTML.
# Returns:   1 when BOT_TOKEN is empty, otherwise curl's exit status
#            (non-zero on transport errors and on HTTP error responses).
#######################################
send_telegram() {
  local text="$1"
  local -a form

  if [[ -z "${BOT_TOKEN:-}" ]]; then
    printf 'send_telegram: BOT_TOKEN is empty; message not sent\n' >&2
    return 1
  fi

  form=(-d "chat_id=${CHAT_ID}")
  # An empty message_thread_id is not a valid value, so only send it when set.
  if [[ -n "${TOPIC_ID:-}" ]]; then
    form+=(-d "message_thread_id=${TOPIC_ID}")
  fi

  # --fail turns HTTP error responses (bad token, malformed HTML, oversized
  # text) into a non-zero exit status; -S still prints the reason while -s
  # hides the progress meter; --max-time keeps a stalled request from hanging.
  curl -sS --fail --max-time 30 -X POST \
    "https://api.telegram.org/bot${BOT_TOKEN}/sendMessage" \
    "${form[@]}" \
    -d "parse_mode=HTML" \
    --data-urlencode "text=${text}" > /dev/null
}
# Entry point: generate the health report and post it to Telegram.
main() {
  send_telegram "$(build_report)"
}

main "$@"