Improve server health Telegram report
This commit is contained in:
parent
5ce6d074ca
commit
7bfc87993d
|
|
@@ -1,23 +1,43 @@
|
||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
set -Eeuo pipefail
|
set -Eeuo pipefail
|
||||||
|
|
||||||
|
# --- 配置区 ---
|
||||||
CHAT_ID="-1003834524994"
|
CHAT_ID="-1003834524994"
|
||||||
TOPIC_ID="4"
|
TOPIC_ID="4"
|
||||||
BOT_TOKEN="${OPENCLAW_TELEGRAM_BOT_TOKEN:-}"
|
BOT_TOKEN="${OPENCLAW_TELEGRAM_BOT_TOKEN:-}"
|
||||||
MAX_LEN=3800
|
TOP_N="${TOP_N:-5}"
|
||||||
|
# --------------
|
||||||
|
|
||||||
have() { command -v "$1" >/dev/null 2>&1; }
|
have() { command -v "$1" >/dev/null 2>&1; }
|
||||||
|
|
||||||
human_uptime() {
|
escape_html() {
|
||||||
uptime -p 2>/dev/null || true
|
sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
|
||||||
}
|
}
|
||||||
|
|
||||||
load_avg() {
|
trim_text() {
|
||||||
awk '{print $1, $2, $3}' /proc/loadavg 2>/dev/null || echo "N/A"
|
local max="$1"
|
||||||
|
local text="$2"
|
||||||
|
if (( ${#text} > max )); then
|
||||||
|
printf '%s…' "${text:0:max-1}"
|
||||||
|
else
|
||||||
|
printf '%s' "$text"
|
||||||
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
cpu_usage_percent() {
|
draw_bar() {
|
||||||
local a b idle_a total_a idle_b total_b usage
|
local percent="$1"
|
||||||
|
local width=10
|
||||||
|
local filled=$(( percent * width / 100 ))
|
||||||
|
(( filled > width )) && filled=$width
|
||||||
|
local empty=$(( width - filled ))
|
||||||
|
local bar=""
|
||||||
|
for ((i=0; i<filled; i++)); do bar+="■"; done
|
||||||
|
for ((i=0; i<empty; i++)); do bar+="□"; done
|
||||||
|
printf '%s' "$bar"
|
||||||
|
}
|
||||||
|
|
||||||
|
get_cpu_usage() {
|
||||||
|
local total_a idle_a total_b idle_b usage
|
||||||
read -r _ u1 n1 s1 i1 w1 irq1 sirq1 st1 _ < /proc/stat
|
read -r _ u1 n1 s1 i1 w1 irq1 sirq1 st1 _ < /proc/stat
|
||||||
total_a=$((u1+n1+s1+i1+w1+irq1+sirq1+st1))
|
total_a=$((u1+n1+s1+i1+w1+irq1+sirq1+st1))
|
||||||
idle_a=$((i1+w1))
|
idle_a=$((i1+w1))
|
||||||
|
|
@@ -25,132 +45,173 @@ cpu_usage_percent() {
|
||||||
read -r _ u2 n2 s2 i2 w2 irq2 sirq2 st2 _ < /proc/stat
|
read -r _ u2 n2 s2 i2 w2 irq2 sirq2 st2 _ < /proc/stat
|
||||||
total_b=$((u2+n2+s2+i2+w2+irq2+sirq2+st2))
|
total_b=$((u2+n2+s2+i2+w2+irq2+sirq2+st2))
|
||||||
idle_b=$((i2+w2))
|
idle_b=$((i2+w2))
|
||||||
usage=$(awk -v ta="$total_a" -v tb="$total_b" -v ia="$idle_a" -v ib="$idle_b" 'BEGIN { dt=tb-ta; di=ib-ia; if (dt<=0) print "0.0"; else printf "%.1f", (dt-di)*100/dt }')
|
usage=$(awk -v ta="$total_a" -v tb="$total_b" -v ia="$idle_a" -v ib="$idle_b" 'BEGIN { dt=tb-ta; di=ib-ia; if (dt<=0) print "0"; else printf "%.0f", (dt-di)*100/dt }')
|
||||||
echo "$usage"
|
printf '%s' "$usage"
|
||||||
}
|
}
|
||||||
|
|
||||||
per_core_overview() {
|
get_mem_info() {
|
||||||
awk '/^cpu[0-9]+ / {print $1}' /proc/stat 2>/dev/null | paste -sd ', ' - | sed 's/^/核心: /' || echo "核心: N/A"
|
free -m | awk '/^Mem:/ {printf "%d %d %d\n", $3, $2, ($2>0 ? $3*100/$2 : 0)}'
|
||||||
}
|
}
|
||||||
|
|
||||||
mem_summary() {
|
get_swap_info() {
|
||||||
free -m 2>/dev/null | awk '
|
free -m | awk '/^Swap:/ {printf "%d %d %d\n", $3, $2, ($2>0 ? $3*100/$2 : 0)}'
|
||||||
/^Mem:/ {printf "内存: %d/%d MB (%.0f%%)\n", $3, $2, ($3*100/$2)}
|
|
||||||
/^Swap:/ {if ($2==0) printf "Swap: 0/0 MB (未启用)\n"; else printf "Swap: %d/%d MB (%.0f%%)\n", $3, $2, ($3*100/$2)}
|
|
||||||
'
|
|
||||||
}
|
}
|
||||||
|
|
||||||
disk_summary() {
|
get_root_disk_info() {
|
||||||
df -h / 2>/dev/null | awk 'NR==2 {printf "磁盘 /: %s/%s (%s), 可用 %s\n", $3, $2, $5, $4}'
|
df -h / | awk 'NR==2 {gsub(/%/,"",$5); printf "%s %s %s\n", $3, $2, $5}'
|
||||||
df -ih / 2>/dev/null | awk 'NR==2 {printf "Inode /: %s/%s (%s), 可用 %s\n", $3, $2, $5, $4}'
|
|
||||||
}
|
}
|
||||||
|
|
||||||
hostname_line() {
|
get_root_inode_info() {
|
||||||
printf "主机: %s\n" "$(hostname 2>/dev/null || echo unknown)"
|
df -i / | awk 'NR==2 {gsub(/%/,"",$5); printf "%s %s %s\n", $3, $2, $5}'
|
||||||
}
|
}
|
||||||
|
|
||||||
date_line() {
|
get_loadavg() {
|
||||||
printf "时间: %s\n" "$(date '+%F %T %Z' 2>/dev/null || echo unknown)"
|
awk '{print $1" / "$2" / "$3}' /proc/loadavg
|
||||||
}
|
}
|
||||||
|
|
||||||
top_cpu() {
|
get_uptime() {
|
||||||
ps -eo pid,user,%cpu,%mem,comm --sort=-%cpu 2>/dev/null | head -n 11 | tail -n +2 | awk '{printf "- PID=%s USER=%s CPU=%s%% MEM=%s%% CMD=%s\n", $1,$2,$3,$4,$5}'
|
uptime -p | sed 's/^up //'
|
||||||
}
|
}
|
||||||
|
|
||||||
top_mem() {
|
get_temp() {
|
||||||
ps -eo pid,user,%cpu,%mem,comm --sort=-%mem 2>/dev/null | head -n 11 | tail -n +2 | awk '{printf "- PID=%s USER=%s CPU=%s%% MEM=%s%% CMD=%s\n", $1,$2,$3,$4,$5}'
|
|
||||||
}
|
|
||||||
|
|
||||||
temps_block() {
|
|
||||||
local out=""
|
|
||||||
if have sensors; then
|
if have sensors; then
|
||||||
out=$(sensors 2>/dev/null | sed '/^$/d' | head -n 20 || true)
|
sensors 2>/dev/null | awk '/Package id 0:|Tctl:|Tdie:|Composite:/ {gsub(/^\+/,"",$2); print $1" "$2; count++; if (count>=3) exit}' | paste -sd '; ' -
|
||||||
fi
|
elif [[ -r /sys/class/thermal/thermal_zone0/temp ]]; then
|
||||||
if [[ -z "$out" ]]; then
|
awk '{printf "thermal_zone0 %.1f°C", $1/1000}' /sys/class/thermal/thermal_zone0/temp
|
||||||
out=$(for z in /sys/class/thermal/thermal_zone*; do
|
|
||||||
[[ -r "$z/temp" ]] || continue
|
|
||||||
t=$(cat "$z/type" 2>/dev/null || echo "unknown")
|
|
||||||
v=$(cat "$z/temp" 2>/dev/null || true)
|
|
||||||
[[ -n "$v" ]] || continue
|
|
||||||
awk -v type="$t" -v raw="$v" 'BEGIN { printf "- %s: %.1f°C\n", type, raw/1000 }'
|
|
||||||
done 2>/dev/null | head -n 10)
|
|
||||||
fi
|
|
||||||
if [[ -n "$out" ]]; then
|
|
||||||
printf "%s\n" "$out"
|
|
||||||
else
|
else
|
||||||
echo "- 未获取到温度/传感器数据"
|
printf 'N/A'
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
fans_block() {
|
get_top_partitions() {
|
||||||
local found=0
|
df -hP -x tmpfs -x devtmpfs | awk 'NR==1 {next} {gsub(/%/,"",$5); print $5"\t"$6"\t"$3"/"$2}' | sort -rn | head -n 5 | while IFS=$'\t' read -r usep mount usedtotal; do
|
||||||
for f in /sys/class/hwmon/hwmon*; do
|
printf '%s%% %s (%s)\n' "$usep" "$mount" "$usedtotal"
|
||||||
[[ -d "$f" ]] || continue
|
|
||||||
local name
|
|
||||||
name=$(cat "$f/name" 2>/dev/null || echo "hwmon")
|
|
||||||
for i in "$f"/fan*_input; do
|
|
||||||
[[ -r "$i" ]] || continue
|
|
||||||
found=1
|
|
||||||
printf -- "- %s %s=%s RPM\n" "$name" "$(basename "$i" .*)" "$(cat "$i" 2>/dev/null)"
|
|
||||||
done
|
done
|
||||||
done
|
|
||||||
[[ $found -eq 1 ]] || echo "- 未获取到风扇转速数据"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
risk_block() {
|
get_top_cpu_processes() {
|
||||||
local cpu mem load suspicious="无明显异常"
|
ps -eo pid,user,%cpu,%mem,etime,args --sort=-%cpu | awk 'NR>1 && NR<='"$((5+1))"' {
|
||||||
cpu=$(cpu_usage_percent)
|
cmd="";
|
||||||
mem=$(free -m 2>/dev/null | awk '/^Mem:/ {printf "%.0f", ($3*100/$2)}' || echo 0)
|
for (i=6; i<=NF; i++) cmd=cmd $i (i<NF?" ":"");
|
||||||
load=$(awk '{print $1}' /proc/loadavg 2>/dev/null || echo 0)
|
printf "%s\t%s\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$5,cmd
|
||||||
|
}'
|
||||||
|
}
|
||||||
|
|
||||||
if awk -v v="$cpu" 'BEGIN{exit !(v>=85)}'; then suspicious="CPU 占用偏高,需关注高占用进程"; fi
|
get_top_mem_processes() {
|
||||||
if awk -v v="$mem" 'BEGIN{exit !(v>=85)}'; then suspicious="内存占用偏高,需关注内存泄漏或异常进程"; fi
|
ps -eo pid,user,%cpu,%mem,etime,args --sort=-%mem | awk 'NR>1 && NR<='"$((5+1))"' {
|
||||||
|
cmd="";
|
||||||
|
for (i=6; i<=NF; i++) cmd=cmd $i (i<NF?" ":"");
|
||||||
|
printf "%s\t%s\t%s\t%s\t%s\t%s\n", $1,$2,$3,$4,$5,cmd
|
||||||
|
}'
|
||||||
|
}
|
||||||
|
|
||||||
echo "- 当前判断: $suspicious"
|
format_proc_lines() {
|
||||||
echo "- 建议关注: 持续高 CPU / 高内存、陌生命令名、异常常驻进程、占用异常升高的解释不清进程"
|
local rows="$1"
|
||||||
|
while IFS=$'\t' read -r pid user pcpu pmem etime cmd; do
|
||||||
|
[[ -z "${pid:-}" ]] && continue
|
||||||
|
cmd=$(trim_text 42 "$cmd")
|
||||||
|
printf '%s\n' "$(printf '%-6s %-8s %5s%% %5s%% %-10s %s' "$pid" "$user" "$pcpu" "$pmem" "$etime" "$cmd")"
|
||||||
|
done <<< "$rows"
|
||||||
|
}
|
||||||
|
|
||||||
|
build_alerts() {
|
||||||
|
local cpu_p="$1" mem_p="$2" disk_p="$3" swap_p="$4" load1="$5" cores="$6"
|
||||||
|
local alerts=()
|
||||||
|
|
||||||
|
(( cpu_p >= 85 )) && alerts+=("CPU ${cpu_p}% 偏高")
|
||||||
|
(( mem_p >= 85 )) && alerts+=("内存 ${mem_p}% 偏高")
|
||||||
|
(( disk_p >= 90 )) && alerts+=("根分区 ${disk_p}% 偏高")
|
||||||
|
(( swap_p >= 30 )) && alerts+=("Swap ${swap_p}% 已使用")
|
||||||
|
awk -v l1="$load1" -v c="$cores" 'BEGIN {exit !(l1>c)}' && alerts+=("1 分钟负载 ${load1} 高于核心数 ${cores}")
|
||||||
|
|
||||||
|
if (( ${#alerts[@]} == 0 )); then
|
||||||
|
printf '✅ 未见明显异常'
|
||||||
|
else
|
||||||
|
printf '⚠️ %s' "$(IFS=';'; echo "${alerts[*]}")"
|
||||||
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
build_report() {
|
build_report() {
|
||||||
|
local cpu_p mem_used mem_total mem_p swap_used swap_total swap_p
|
||||||
|
local disk_used disk_total disk_p inode_used inode_total inode_p
|
||||||
|
local host up load temp status alert_summary cores load1
|
||||||
|
local top_cpu top_mem top_parts
|
||||||
|
|
||||||
|
cpu_p=$(get_cpu_usage)
|
||||||
|
read -r mem_used mem_total mem_p < <(get_mem_info)
|
||||||
|
read -r swap_used swap_total swap_p < <(get_swap_info)
|
||||||
|
read -r disk_used disk_total disk_p < <(get_root_disk_info)
|
||||||
|
read -r inode_used inode_total inode_p < <(get_root_inode_info)
|
||||||
|
load=$(get_loadavg)
|
||||||
|
load1=$(awk '{print $1}' /proc/loadavg)
|
||||||
|
up=$(get_uptime)
|
||||||
|
host=$(hostname)
|
||||||
|
temp=$(get_temp)
|
||||||
|
cores=$(nproc)
|
||||||
|
top_cpu=$(format_proc_lines "$(get_top_cpu_processes)")
|
||||||
|
top_mem=$(format_proc_lines "$(get_top_mem_processes)")
|
||||||
|
top_parts=$(get_top_partitions)
|
||||||
|
alert_summary=$(build_alerts "$cpu_p" "$mem_p" "$disk_p" "$swap_p" "$load1" "$cores")
|
||||||
|
|
||||||
|
if [[ "$alert_summary" == ✅* ]]; then
|
||||||
|
status='🟢 正常'
|
||||||
|
elif (( cpu_p >= 95 || mem_p >= 95 || disk_p >= 95 )); then
|
||||||
|
status='🔴 告警'
|
||||||
|
else
|
||||||
|
status='🟡 关注'
|
||||||
|
fi
|
||||||
|
|
||||||
|
temp=$(printf '%s' "$temp" | escape_html)
|
||||||
|
top_cpu=$(printf '%s' "$top_cpu" | escape_html)
|
||||||
|
top_mem=$(printf '%s' "$top_mem" | escape_html)
|
||||||
|
top_parts=$(printf '%s' "$top_parts" | escape_html)
|
||||||
|
alert_summary=$(printf '%s' "$alert_summary" | escape_html)
|
||||||
|
|
||||||
cat <<EOF
|
cat <<EOF
|
||||||
🖥️ 服务器监控报告
|
<b>🖥 服务器巡检 · ${host}</b>
|
||||||
$(hostname_line)$(date_line)
|
状态:<b>${status}</b>
|
||||||
【系统概览】
|
${alert_summary}
|
||||||
- Uptime: $(human_uptime)
|
━━━━━━━━━━━━━━━━━━
|
||||||
- CPU 总使用率: $(cpu_usage_percent)%
|
<b>核心指标</b>
|
||||||
- Load Average: $(load_avg)
|
CPU <code>$(draw_bar "$cpu_p")</code> ${cpu_p}%
|
||||||
- $(per_core_overview)
|
RAM <code>$(draw_bar "$mem_p")</code> ${mem_p}%
|
||||||
$(mem_summary)$(disk_summary)
|
DSK <code>$(draw_bar "$disk_p")</code> ${disk_p}%
|
||||||
【CPU 占用 Top 10】
|
|
||||||
$(top_cpu)
|
<b>概览</b>
|
||||||
【内存占用 Top 10】
|
• 运行时间:<code>${up}</code>
|
||||||
$(top_mem)
|
• 负载(1/5/15):<code>${load}</code>
|
||||||
【温度 / 传感器】
|
• 内存:<code>${mem_used}MB / ${mem_total}MB</code>
|
||||||
$(temps_block)
|
• Swap:<code>${swap_used}MB / ${swap_total}MB (${swap_p}%)</code>
|
||||||
【风扇】
|
• 根分区:<code>${disk_used} / ${disk_total}</code>
|
||||||
$(fans_block)
|
• Inode:<code>${inode_used} / ${inode_total} (${inode_p}%)</code>
|
||||||
【巡检结论】
|
• 温度:<code>${temp}</code>
|
||||||
$(risk_block)
|
|
||||||
|
<b>最忙分区 Top 5</b>
|
||||||
|
<pre>${top_parts}</pre>
|
||||||
|
<b>CPU Top ${TOP_N}</b>
|
||||||
|
<pre>PID USER CPU MEM ELAPSED CMD
|
||||||
|
${top_cpu}</pre>
|
||||||
|
<b>MEM Top ${TOP_N}</b>
|
||||||
|
<pre>PID USER CPU MEM ELAPSED CMD
|
||||||
|
${top_mem}</pre>
|
||||||
|
🕒 <i>$(date '+%Y-%m-%d %H:%M:%S')</i>
|
||||||
EOF
|
EOF
|
||||||
}
|
}
|
||||||
|
|
||||||
send_telegram() {
|
send_telegram() {
|
||||||
local text="$1"
|
local text="$1"
|
||||||
[[ -n "$BOT_TOKEN" ]] || { echo "OPENCLAW_TELEGRAM_BOT_TOKEN 未设置" >&2; return 1; }
|
if [[ -z "$BOT_TOKEN" ]]; then
|
||||||
|
echo "Error: BOT_TOKEN is empty." >&2
|
||||||
|
return 1
|
||||||
|
fi
|
||||||
|
|
||||||
curl -fsS -X POST "https://api.telegram.org/bot${BOT_TOKEN}/sendMessage" \
|
curl -fsS -X POST "https://api.telegram.org/bot${BOT_TOKEN}/sendMessage" \
|
||||||
-d "chat_id=${CHAT_ID}" \
|
-d "chat_id=${CHAT_ID}" \
|
||||||
-d "message_thread_id=${TOPIC_ID}" \
|
-d "message_thread_id=${TOPIC_ID}" \
|
||||||
--data-urlencode "text=${text}" \
|
-d "parse_mode=HTML" \
|
||||||
-d "disable_web_page_preview=true"
|
--data-urlencode "text=${text}" >/dev/null
|
||||||
}
|
}
|
||||||
|
|
||||||
main() {
|
|
||||||
local report
|
|
||||||
report=$(build_report)
|
report=$(build_report)
|
||||||
printf '%s\n' "$report" > /home/sinlee/.openclaw/workspace/tmp/server-health-latest.txt
|
|
||||||
if ((${#report} > MAX_LEN)); then
|
|
||||||
report="${report:0:MAX_LEN}\n...\n(内容过长,已截断;完整内容见本机 tmp/server-health-latest.txt)"
|
|
||||||
fi
|
|
||||||
send_telegram "$report"
|
send_telegram "$report"
|
||||||
}
|
printf '%s\n' "$report"
|
||||||
|
|
||||||
main "$@"
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue