openclaw-sync/scripts/server-health-report.sh

128 lines
4.0 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# Builds a host health report (CPU, memory, disk, top processes) and sends it
# as an HTML-formatted message to a Telegram chat topic.
#
# Environment:
#   OPENCLAW_TELEGRAM_BOT_TOKEN  Telegram bot token; when unset or empty the
#                                token is empty and nothing can be sent.
set -Eeuo pipefail

# --- Configuration ---
# Constants are read-only so no function can overwrite them by accident.
readonly CHAT_ID="-1003834524994"                      # target chat id
readonly TOPIC_ID="4"                                  # message_thread_id inside the chat
readonly BOT_TOKEN="${OPENCLAW_TELEGRAM_BOT_TOKEN:-}"  # empty when the env var is unset
readonly TOP_N=5                                       # number of processes per "Top" list
# ---------------------
# Succeeds when the shell can resolve "$1" as a command (external program,
# builtin, function or alias); prints nothing either way.
have() {
  command -v "$1" &>/dev/null
}
#######################################
# Renders a 10-cell text progress bar.
# Arguments: $1 - percentage; a fractional part is dropped, and a missing or
#                 non-numeric value is treated as 0
# Outputs:   the bar (filled cells "■" followed by empty cells "□") on stdout
#######################################
draw_bar() {
  local percent=${1:-0}
  local -r width=10
  local filled empty i
  local bar=""

  # Arithmetic expansion only understands integers: drop any fraction and
  # fall back to 0 for anything that is still not a plain integer.
  percent=${percent%%.*}
  [[ $percent =~ ^-?[0-9]+$ ]] || percent=0

  filled=$(( percent * width / 100 ))
  # Clamp both ends so the bar is always exactly $width cells wide.
  if (( filled < 0 )); then
    filled=0
  elif (( filled > width )); then
    filled=$width
  fi
  empty=$(( width - filled ))

  # i is local so the loops do not overwrite a caller's variable.
  for (( i = 0; i < filled; i++ )); do bar+="■"; done
  for (( i = 0; i < empty; i++ )); do bar+="□"; done
  printf '%s\n' "$bar"
}
#######################################
# Shortens a string for display.
# Arguments: $1 - text to print
#            $2 - maximum length in characters (default 36)
# Outputs:   the text unchanged when it fits; otherwise its first (max - 1)
#            characters followed by "…". No trailing newline is written.
#######################################
trim_cmd() {
  local text=$1
  local limit=${2:-36}

  if (( ${#text} > limit )); then
    # One character of the budget is spent on the ellipsis.
    printf '%s…' "${text:0:limit-1}"
    return
  fi
  printf '%s' "$text"
}
# --- Data collection functions ---
#######################################
# Measures overall CPU utilisation from two samples of the first line of
# /proc/stat.
# Arguments: $1 - optional sampling interval passed to sleep (default 1)
# Outputs:   integer percentage in the range 0..100 on stdout
#######################################
get_cpu_p() {
  local interval=${1:-1}
  # Counters after the leading label: user nice system idle iowait irq
  # softirq steal. Declared local so they do not leak into the caller.
  local u1 n1 s1 i1 w1 irq1 sirq1 st1
  local u2 n2 s2 i2 w2 irq2 sirq2 st2
  local total idle

  read -r _ u1 n1 s1 i1 w1 irq1 sirq1 st1 _ < /proc/stat
  sleep "$interval"
  read -r _ u2 n2 s2 i2 w2 irq2 sirq2 st2 _ < /proc/stat

  # A line that carries fewer counters leaves st1/st2 empty; count it as 0.
  st1=${st1:-0}
  st2=${st2:-0}

  total=$(( (u2 + n2 + s2 + i2 + w2 + irq2 + sirq2 + st2) - (u1 + n1 + s1 + i1 + w1 + irq1 + sirq1 + st1) ))
  # idle + iowait is the time the CPU was not doing work.
  idle=$(( (i2 + w2) - (i1 + w1) ))

  awk -v t="$total" -v i="$idle" 'BEGIN {
    if (t <= 0) { print "0"; exit }   # no ticks elapsed between the samples
    p = (t - i) * 100 / t
    if (p < 0) p = 0
    if (p > 100) p = 100
    printf "%.0f\n", p
  }'
}
#######################################
# Reports memory and swap usage parsed from `free -m`.
# Outputs: one newline-terminated line on stdout:
#            "<mem_used> <mem_total> <mem_pct> <swap_used> <swap_total> <swap_pct>"
#          Values are integers (as reported by `free -m`); percentages are
#          truncated. Swap fields are 0 when there is no Swap line or its
#          total is 0.
# Returns: non-zero when no "Mem:" line was found.
#######################################
get_mem_data() {
  # LC_ALL=C pins the "Mem:" / "Swap:" row labels, which free translates under
  # other locales. The trailing newline lets a caller's `read` succeed.
  LC_ALL=C free -m | awk '
    /^Mem:/ {
      mem_used = $3; mem_total = $2
      mem_pct = (mem_total + 0 > 0 ? mem_used * 100 / mem_total : 0)
      have_mem = 1
    }
    /^Swap:/ {
      swap_used = $3; swap_total = $2
      swap_pct = (swap_total + 0 > 0 ? swap_used * 100 / swap_total : 0)
    }
    END {
      if (!have_mem) exit 1
      printf "%d %d %d %d %d %d\n", mem_used, mem_total, mem_pct, swap_used, swap_total, swap_pct
    }'
}
#######################################
# Reports usage of the filesystem mounted at /.
# Outputs: "<used> <size> <use-percent>" on stdout, taken from columns 3, 2
#          and 5 of the second line of `df -h /`, with every "%" removed.
#######################################
get_root_disk() {
  df -h / |
    awk 'NR == 2 {
      summary = $3 " " $2 " " $5
      gsub(/%/, "", summary)   # callers want a bare number for the percentage
      print summary
    }'
}
#######################################
# Reports inode usage of the filesystem mounted at /.
# Outputs: "<used> / <total> (<use-percent>)" on stdout without a trailing
#          newline, taken from columns 3, 2 and 5 of the second line of
#          `df -i /`.
#######################################
get_inode() {
  df -i / |
    awk 'NR == 2 {
      inodes_used = $3
      inodes_total = $2
      inodes_pct = $5
      printf "%s / %s (%s)", inodes_used, inodes_total, inodes_pct
    }'
}
#######################################
# Lists the three fullest filesystems reported by `df -h`.
# Lines that start with "tmpfs" or contain "cdrom", "loop" or "udev" are
# dropped, as are the mount points /boot/efi and /sys/firmware/efi/efivars.
# Outputs: up to three lines "<use%> <mount point> (<used>/<size>)", sorted
#          by use percentage in descending order.
#######################################
get_top_partitions() {
  df -h |
    awk '
      /^tmpfs|cdrom|loop|udev/ { next }
      ++kept == 1 { next }   # first surviving line is the df header
      $6 == "/boot/efi" || $6 == "/sys/firmware/efi/efivars" { next }
      { print $5, $6, $3, $2 }
    ' |
    sort -rn |
    head -n 3 |
    awk '{ printf "%-4s %-12s (%s/%s)\n", $1, $2, $3, $4 }'
}
#######################################
# Lists the TOP_N processes ranked by a ps sort key, ready to be embedded in
# an HTML-formatted Telegram message.
# Globals:   TOP_N (read)
# Arguments: $1 - ps sort key, e.g. %cpu or %mem (sorted descending)
# Outputs:   one line per process: PID, USER, CPU%, MEM%, elapsed time and
#            the command line shortened to 44 characters by trim_cmd.
#            "&", "<" and ">" are emitted as HTML entities.
#######################################
get_top_procs() {
  local sort_type=$1
  local pid user cpu mem etime rest display

  # head keeps the header plus TOP_N rows; tail then drops the header.
  ps -eo pid,user,%cpu,%mem,etime,args --sort=-"$sort_type" |
    head -n $((TOP_N + 1)) |
    tail -n +2 |
    while read -r pid user cpu mem etime rest; do
      display=$(trim_cmd "$rest" 44)
      printf '%-6s %-7s %-5s %-5s %-10s %s\n' \
        "$pid" "$user" "${cpu}%" "${mem}%" "$etime" "$display"
    done |
    # Command lines are arbitrary text; raw "&", "<" or ">" make Telegram's
    # HTML parser reject the message. Escaping runs after truncation so an
    # entity is never cut in half, and "&" goes first so the entities added
    # for "<" and ">" are not escaped again.
    sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
}
# --- Report generation ---
# Prints "$1" as a base-10 integer, or 0 when it is empty or not a plain
# non-negative integer.
_report_int() {
  if [[ ${1:-} =~ ^[0-9]+$ ]]; then
    printf '%d' "$(( 10#$1 ))"
  else
    printf '0'
  fi
}

# Prints the first temperature reading from `sensors`, else the value of
# thermal_zone0, else "N/A".
_report_temperature() {
  local reading=""

  if command -v sensors >/dev/null 2>&1; then
    # Take the first field that looks like "+45.0°C", not a fixed column:
    # labels such as "Package id 0:" contain spaces. awk reads all input
    # (no early exit) so sensors is never killed by a closed pipe.
    reading=$(
      sensors 2>/dev/null | awk '
        !found {
          for (f = 1; f <= NF; f++) {
            if ($f ~ /^[+-]?[0-9]+(\.[0-9]+)?°C$/) {
              value = $f
              sub(/^[+]/, "", value)
              print value
              found = 1
              break
            }
          }
        }'
    ) || true
  fi

  if [[ -z $reading && -r /sys/class/thermal/thermal_zone0/temp ]]; then
    # The file is divided by 1000 to obtain degrees.
    reading=$(awk 'NR == 1 { printf "%.1f°C", $1 / 1000 }' \
      /sys/class/thermal/thermal_zone0/temp 2>/dev/null) || true
  fi

  printf '%s' "${reading:-N/A}"
}

#######################################
# Collects all metrics and prints the HTML-formatted report.
# Globals: TOP_N (read)
# Calls:   get_cpu_p, get_mem_data, get_root_disk, get_inode,
#          get_top_partitions, get_top_procs, draw_bar
# Outputs: the report on stdout. The status is "Critical" when CPU usage is
#          above 85% or memory usage is above 90%, otherwise "Healthy".
#######################################
build_report() {
  local cpu_p m_usd m_tot m_p s_usd s_tot s_p d_usd d_tot d_p
  local host up load temp icon ip status

  # A collector that fails must not take the whole report down: read still
  # assigns (possibly empty) values, which are normalised below.
  cpu_p=$(get_cpu_p) || true
  read -r m_usd m_tot m_p s_usd s_tot s_p < <(get_mem_data) || true
  read -r d_usd d_tot d_p < <(get_root_disk) || true

  # These three feed integer comparisons, printf %d and draw_bar.
  cpu_p=$(_report_int "$cpu_p")
  m_p=$(_report_int "$m_p")
  d_p=$(_report_int "$d_p")
  s_usd=${s_usd:-0}
  s_tot=${s_tot:-0}
  s_p=${s_p:-0}

  host=$(hostname 2>/dev/null) || host="unknown"
  ip=$(hostname -I 2>/dev/null | awk 'NR == 1 { print $1 }') || ip=""
  ip=${ip:-N/A}
  up=$(uptime -p 2>/dev/null | sed 's/^up //') || up=""
  up=${up:-N/A}
  load=$(awk '{print $1" / "$2" / "$3}' /proc/loadavg 2>/dev/null) || load=""
  load=${load:-N/A}
  temp=$(_report_temperature)

  if (( cpu_p > 85 || m_p > 90 )); then
    icon="🔴 Critical"
    status="⚠️ 资源占用过高"
  else
    icon="🟢 Healthy"
    status="✅ 未见明显异常"
  fi

  cat <<EOF
<b>🖥 Server: ${host} (${ip})</b>
状态: <b>${icon}</b>
${status}
━━━━━━━━━━━━━━━━━━
<b>核心指标:</b>
<b>CPU </b> <code>$(draw_bar "$cpu_p")</code> <code>$(printf "%3d" "$cpu_p")%</code>
<b>RAM </b> <code>$(draw_bar "$m_p")</code> <code>$(printf "%3d" "$m_p")%</code>
<b>DSK </b> <code>$(draw_bar "$d_p")</code> <code>$(printf "%3d" "$d_p")%</code>
<b>📊 详细概览:</b>
• <b>运行时间:</b> <code>$up</code>
• <b>平均负载:</b> <code>$load</code>
• <b>内存详情:</b> <code>${m_usd}MB / ${m_tot}MB</code>
• <b>交换分区:</b> <code>${s_usd}MB / ${s_tot}MB (${s_p}%)</code>
• <b>根分区:</b> <code>${d_usd} / ${d_tot}</code>
• <b>Inode:</b> <code>$(get_inode)</code>
• <b>系统温度:</b> <code>$temp</code>
<b>🗄 繁忙分区 Top 3:</b>
<pre>$(get_top_partitions)</pre>
<b>🔥 CPU Top ${TOP_N} (PID/USER/CPU/MEM/TIME/CMD):</b>
<pre>$(get_top_procs %cpu)</pre>
<b>🧠 MEM Top ${TOP_N} (PID/USER/CPU/MEM/TIME/CMD):</b>
<pre>$(get_top_procs %mem)</pre>
🕒 <i>$(date '+%Y-%m-%d %H:%M:%S')</i>
EOF
}
#######################################
# Sends an HTML-formatted message through the Telegram Bot API sendMessage
# method.
# Globals:   BOT_TOKEN, CHAT_ID, TOPIC_ID (read)
# Arguments: $1 - message text (Telegram HTML markup)
# Outputs:   diagnostics on stderr only; the API response body is discarded
# Returns:   1 when BOT_TOKEN is empty, otherwise curl's exit status
#            (non-zero on transport errors, timeouts and HTTP errors)
#######################################
send_telegram() {
  local text="$1"

  if [[ -z "$BOT_TOKEN" ]]; then
    printf '%s\n' "send_telegram: BOT_TOKEN is empty (set OPENCLAW_TELEGRAM_BOT_TOKEN); message not sent" >&2
    return 1
  fi

  # --fail turns an HTTP error response into a non-zero exit status (without
  #   it a rejected message looks like success); -S keeps the error message
  #   visible even though -s hides the progress meter.
  # The timeouts keep a stalled connection from hanging the script forever.
  curl -sS --fail --connect-timeout 10 --max-time 30 \
    -X POST "https://api.telegram.org/bot${BOT_TOKEN}/sendMessage" \
    -d "chat_id=${CHAT_ID}" \
    -d "message_thread_id=${TOPIC_ID}" \
    -d "parse_mode=HTML" \
    --data-urlencode "text=${text}" > /dev/null
}
# Entry point: build the report and send it to Telegram.
main() {
  send_telegram "$(build_report)"
}

main "$@"