Filter EFI partitions from health report

This commit is contained in:
Lee 2026-03-27 17:21:52 +08:00
parent 7bfc87993d
commit b4267b2f53
1 changed files with 62 additions and 162 deletions

224
scripts/server-health-report.sh Executable file → Normal file
View File

@ -5,213 +5,113 @@ set -Eeuo pipefail
# Telegram destination and report settings.
# BOT_TOKEN is taken from the environment (OPENCLAW_TELEGRAM_BOT_TOKEN) and is
# empty when that variable is unset. TOP_N (rows in the process tables) may be
# overridden from the environment and defaults to 5.
CHAT_ID="-1003834524994"
TOPIC_ID="4"
BOT_TOKEN="${OPENCLAW_TELEGRAM_BOT_TOKEN:-}"
TOP_N="${TOP_N:-5}"
# --------------
# Succeed when the shell can resolve the given command name.
# Arguments: $1 - command name to look up
# Returns:   0 if found, non-zero otherwise (nothing is printed)
have() {
  command -v "$1" >/dev/null 2>&1
}
# Filter: read text on stdin and write it to stdout with the characters
# &, < and > replaced by their HTML entities. The ampersand rule runs first
# so the entities produced by the other two rules are not escaped again.
escape_html() {
  sed 's/&/\&amp;/g; s/</\&lt;/g; s/>/\&gt;/g'
}
# Print a string shortened to at most a given number of characters.
# Arguments: $1 - maximum length (integer)
#            $2 - text to print
# Outputs:   the text unchanged when it fits; otherwise its first max-1
#            characters followed by an ellipsis. Nothing when max < 1.
trim_text() {
  local max="$1"
  local text="$2"
  # A limit below 1 would turn max-1 into a negative substring length, which
  # bash reads as "count from the end" and returns more than asked for.
  if (( max < 1 )); then
    return 0
  fi
  if (( ${#text} > max )); then
    printf '%s…' "${text:0:max-1}"
  else
    printf '%s' "$text"
  fi
}
# Render a 10-cell usage bar.
# Arguments: $1 - percentage (integer); non-integer input is treated as 0
# Outputs:   one line of filled (■) cells followed by empty (□) cells,
#            always 10 cells wide
draw_bar() {
  local percent="${1:-0}"
  local width=10
  local filled empty i
  local bar=""

  # Anything that is not a plain integer would break the arithmetic below.
  [[ "$percent" =~ ^-?[0-9]+$ ]] || percent=0

  filled=$(( percent * width / 100 ))
  # Clamp both ends so the bar keeps its width for values outside 0..100.
  if (( filled > width )); then filled=$width; fi
  if (( filled < 0 )); then filled=0; fi
  empty=$(( width - filled ))

  for (( i = 0; i < filled; i++ )); do bar+="■"; done
  for (( i = 0; i < empty; i++ )); do bar+="□"; done
  printf '%s\n' "$bar"
}
# --- Data collection helpers ---

# Measure overall CPU utilisation from two samples of the first line of
# /proc/stat.
# Arguments: $1 - seconds between the two samples (optional, default 1)
# Outputs:   integer percentage 0..100, without a trailing newline
# Returns:   non-zero when /proc/stat cannot be read
get_cpu_p() {
  local interval="${1:-1}"
  local u1 n1 s1 i1 w1 irq1 sirq1 st1
  local u2 n2 s2 i2 w2 irq2 sirq2 st2
  local total idle

  read -r _ u1 n1 s1 i1 w1 irq1 sirq1 st1 _ < /proc/stat || return 1
  sleep "$interval"
  read -r _ u2 n2 s2 i2 w2 irq2 sirq2 st2 _ < /proc/stat || return 1

  # Deltas between the samples; idle time is the idle + iowait columns.
  total=$(( (u2+n2+s2+i2+w2+irq2+sirq2+st2) - (u1+n1+s1+i1+w1+irq1+sirq1+st1) ))
  idle=$(( (i2+w2) - (i1+w1) ))

  awk -v t="$total" -v i="$idle" 'BEGIN {
    if (t <= 0) { printf "0"; exit }
    p = (t - i) * 100 / t
    if (p < 0) p = 0
    if (p > 100) p = 100
    printf "%.0f", p
  }'
}

# Pre-commit name of the same measurement; forwards to get_cpu_p.
get_cpu_usage() {
  get_cpu_p "$@"
}
# Print RAM and swap usage parsed from `free -m`.
# Outputs: one newline-terminated line
#          "mem_used mem_total mem_pct swap_used swap_total swap_pct"
#          (MB and integer percent). A row that is missing, or whose total
#          is 0, yields zeros.
get_mem_data() {
  # LC_ALL=C keeps the "Mem:"/"Swap:" row labels untranslated. The trailing
  # newline matters: `read` reports failure on input that ends without one.
  LC_ALL=C free -m | awk '
    /^Mem:/  { mu = $3; mt = $2 }
    /^Swap:/ { su = $3; st = $2 }
    END {
      printf "%d %d %d %d %d %d\n",
        mu, mt, (mt > 0 ? mu * 100 / mt : 0),
        su, st, (st > 0 ? su * 100 / st : 0)
    }'
}

# Print RAM usage only, as "used total pct" (MB and integer percent).
get_mem_info() {
  LC_ALL=C free -m \
    | awk '/^Mem:/ { printf "%d %d %d\n", $3, $2, ($2 > 0 ? $3 * 100 / $2 : 0) }'
}
# Print root filesystem usage as "used total pct" (human-readable sizes,
# percent sign removed), parsed from the second line of `df -hP /`.
get_root_disk() {
  # -P forces one line per filesystem so the fields stay on line 2.
  LC_ALL=C df -hP / \
    | awk 'NR == 2 { sub(/%/, "", $5); print $3, $2, $5 }'
}

# Print swap usage as "used total pct" (MB and integer percent); the
# percentage is 0 when no swap is configured.
get_swap_info() {
  LC_ALL=C free -m \
    | awk '/^Swap:/ { printf "%d %d %d\n", $3, $2, ($2 > 0 ? $3 * 100 / $2 : 0) }'
}
# Print root filesystem inode usage as "used / total (pct%)", without a
# trailing newline, parsed from the second line of `df -iP /`.
get_inode() {
  LC_ALL=C df -iP / \
    | awk 'NR == 2 { printf "%s / %s (%s)", $3, $2, $5 }'
}

# Print root filesystem usage as "used total pct" (human-readable sizes,
# percent sign removed).
get_root_disk_info() {
  LC_ALL=C df -hP / \
    | awk 'NR == 2 { sub(/%/, "", $5); printf "%s %s %s\n", $3, $2, $5 }'
}
# Print root filesystem inode usage as "used total pct" (percent sign
# removed), parsed from the second line of `df -iP /`.
get_root_inode_info() {
  # -P forces one line per filesystem so the fields stay on line 2;
  # LC_ALL=C pins the output format regardless of the caller's locale.
  LC_ALL=C df -iP / \
    | awk 'NR == 2 { sub(/%/, "", $5); printf "%s %s %s\n", $3, $2, $5 }'
}
# Print the first three fields of /proc/loadavg joined by " / ".
get_loadavg() {
  local one five fifteen
  read -r one five fifteen _ < /proc/loadavg
  printf '%s / %s / %s\n' "$one" "$five" "$fifteen"
}
# Print the output of `uptime -p` with its leading "up " removed.
get_uptime() {
  local pretty
  pretty=$(uptime -p)
  printf '%s\n' "${pretty#up }"
}
# Print a short temperature summary, without a trailing newline.
# Sources, in order:
#   1. `sensors` rows labelled "Package id 0", "Tctl", "Tdie" or "Composite"
#      (at most three), printed as "label: value" and joined with "; "
#   2. /sys/class/thermal/thermal_zone0/temp, divided by 1000
#   3. the literal "N/A"
get_temp() {
  local readings=""
  local zone=/sys/class/thermal/thermal_zone0/temp

  if have sensors; then
    # Split on the colon so multi-word labels such as "Package id 0" stay
    # intact. `|| true` keeps a failing sensors call from aborting callers
    # that run with errexit/pipefail.
    readings=$(
      { sensors 2>/dev/null || true; } | awk -F': *' '
        /^(Package id 0|Tctl|Tdie|Composite):/ {
          value = $2
          sub(/^\+/, "", value)        # drop the leading plus sign
          sub(/[ \t].*$/, "", value)   # drop trailing "(high = ...)" details
          out = out (n++ ? "; " : "") $1 ": " value
          if (n >= 3) exit
        }
        END { printf "%s", out }'
    ) || readings=""
  fi

  # Fall back to the thermal zone when sensors is absent or matched nothing.
  if [[ -z "$readings" && -r "$zone" ]]; then
    readings=$(awk '{printf "thermal_zone0 %.1f°C", $1/1000}' "$zone") || readings=""
  fi

  printf '%s' "${readings:-N/A}"
}
# List the fullest mounted filesystems, highest usage first.
# Skips rows that start with "tmpfs" or mention cdrom/loop/udev, and the EFI
# mounts /boot/efi and /sys/firmware/efi/efivars.
# Arguments: $1 - number of rows to print (optional, default 3)
# Outputs:   one line per filesystem: "pct% mountpoint (used/size)"
get_top_partitions() {
  local limit="${1:-3}"

  # Filtering and the row limit are done in awk so that no pipeline stage
  # exits early or returns non-zero on "no match", which pipefail would
  # otherwise turn into a failure.
  LC_ALL=C df -hP \
    | awk '
        NR == 1 { next }
        /^tmpfs|cdrom|loop|udev/ { next }
        $6 == "/boot/efi" || $6 == "/sys/firmware/efi/efivars" { next }
        {
          pct = $5
          sub(/%/, "", pct)
          print pct "\t" $5 "\t" $6 "\t" $3 "\t" $2
        }' \
    | sort -t "$(printf '\t')" -k1,1nr \
    | awk -F'\t' -v limit="$limit" \
        'NR <= limit { printf "%-4s %-12s (%s/%s)\n", $2, $3, $4, $5 }'
}
# Print the busiest processes in compact form.
# Arguments: $1 - ps sort column, %cpu or %mem (optional, default %cpu)
# Globals:   TOP_N (read) - number of rows, 5 when unset
# Outputs:   one line per process: PID, CPU%, MEM%, command name
get_top_procs() {
  local sort_type="${1:-%cpu}"

  # ps columns: PID, USER, CPU, MEM, TIME, COMMAND.
  # awk applies the row limit while still reading all of ps's output, so ps
  # is never cut off by a closed pipe.
  ps -eo pid,user,%cpu,%mem,etime,comm --sort=-"$sort_type" \
    | awk -v limit="${TOP_N:-5}" '
        NR == 1 || NR > limit + 1 { next }
        {
          cmd = $6
          for (i = 7; i <= NF; i++) cmd = cmd " " $i   # names with spaces
          printf "%-6s %-4s %-4s %s\n", $1, $3 "%", $4 "%", cmd
        }'
}

# Print the top CPU consumers as tab-separated
# "pid user %cpu %mem etime command-line" rows.
# Globals:   TOP_N (read) - number of rows, 5 when unset
get_top_cpu_processes() {
  ps -eo pid,user,%cpu,%mem,etime,args --sort=-%cpu \
    | awk -v limit="${TOP_N:-5}" '
        NR == 1 || NR > limit + 1 { next }
        {
          cmd = $6
          for (i = 7; i <= NF; i++) cmd = cmd " " $i
          printf "%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, $4, $5, cmd
        }'
}
# Print the top memory consumers as tab-separated
# "pid user %cpu %mem etime command-line" rows.
# Globals:   TOP_N (read) - number of rows, 5 when unset
get_top_mem_processes() {
  # The limit is passed with -v rather than spliced into the awk program text.
  ps -eo pid,user,%cpu,%mem,etime,args --sort=-%mem \
    | awk -v limit="${TOP_N:-5}" '
        NR == 1 || NR > limit + 1 { next }
        {
          cmd = $6
          for (i = 7; i <= NF; i++) cmd = cmd " " $i   # rejoin the arguments
          printf "%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, $4, $5, cmd
        }'
}
# Turn tab-separated process rows into fixed-width table lines.
# Arguments: $1 - newline-separated rows of
#                 "pid<TAB>user<TAB>%cpu<TAB>%mem<TAB>etime<TAB>command"
# Outputs:   one aligned line per row; the command is passed through
#            trim_text with a limit of 42. Rows with an empty pid are skipped.
format_proc_lines() {
  local table="$1"
  local p_id p_user p_cpu p_mem p_age p_cmd

  while IFS=$'\t' read -r p_id p_user p_cpu p_mem p_age p_cmd; do
    if [[ -n "${p_id:-}" ]]; then
      p_cmd=$(trim_text 42 "$p_cmd")
      printf '%-6s %-8s %5s%% %5s%% %-10s %s\n' \
        "$p_id" "$p_user" "$p_cpu" "$p_mem" "$p_age" "$p_cmd"
    fi
  done <<< "$table"
}
# Summarise threshold breaches as a single line of text.
# Arguments: $1 - CPU %, $2 - memory %, $3 - root disk %, $4 - swap %
#            (integers), $5 - 1-minute load average, $6 - CPU core count
# Outputs:   "✅ 未见明显异常" when nothing is over its threshold, otherwise
#            "⚠️ " followed by the alerts joined with "; "
build_alerts() {
  local cpu_p="$1" mem_p="$2" disk_p="$3" swap_p="$4" load1="$5" cores="$6"
  local alerts=()
  local joined="" item

  if (( cpu_p >= 85 )); then alerts+=("CPU ${cpu_p}% 偏高"); fi
  if (( mem_p >= 85 )); then alerts+=("内存 ${mem_p}% 偏高"); fi
  if (( disk_p >= 90 )); then alerts+=("根分区 ${disk_p}% 偏高"); fi
  if (( swap_p >= 30 )); then alerts+=("Swap ${swap_p}% 已使用"); fi
  # The load average is fractional, so the comparison is delegated to awk.
  if awk -v l1="$load1" -v c="$cores" 'BEGIN {exit !(l1>c)}'; then
    alerts+=("1 分钟负载 ${load1} 高于核心数 ${cores}")
  fi

  if (( ${#alerts[@]} == 0 )); then
    printf '✅ 未见明显异常'
    return 0
  fi

  # Join with an explicit separator; an empty IFS would run the messages
  # together with nothing between them.
  for item in "${alerts[@]}"; do
    joined+="${joined:+; }${item}"
  done
  printf '⚠️ %s' "$joined"
}
# --- Report generation ---

# Private helper: print $1 with &, < and > replaced by HTML entities.
_report_html_escape() {
  printf '%s' "$1" | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
}

# Collect all metrics and print the report as Telegram HTML on stdout.
# Globals: TOP_N (read) - used in the process-table headings, 5 when unset
# Calls:   get_cpu_p, get_mem_data, get_root_disk, get_inode,
#          get_top_partitions, get_top_procs, draw_bar
build_report() {
  local cpu_p m_usd m_tot m_p s_usd s_tot s_p d_usd d_tot d_p
  local host ip up load temp inode parts top_cpu top_mem icon status
  local top_n="${TOP_N:-5}"
  local zone=/sys/class/thermal/thermal_zone0/temp

  # --- Collect ---
  cpu_p=$(get_cpu_p)
  # `read` reports failure when its input lacks a final newline even though
  # the values were assigned; `|| true` keeps errexit from aborting here.
  read -r m_usd m_tot m_p s_usd s_tot s_p < <(get_mem_data) || true
  read -r d_usd d_tot d_p < <(get_root_disk) || true

  # The percentages feed arithmetic and %d below, so force them to integers.
  [[ "${cpu_p:-}" =~ ^[0-9]+$ ]] || cpu_p=0
  [[ "${m_p:-}" =~ ^[0-9]+$ ]] || m_p=0
  [[ "${d_p:-}" =~ ^[0-9]+$ ]] || d_p=0

  host=$(hostname)
  ip=$(hostname -I 2>/dev/null | awk 'NR == 1 {print $1}') || ip=""
  up=$(uptime -p 2>/dev/null | sed 's/^up //') || up=""
  load=$(awk '{print $1" / "$2" / "$3}' /proc/loadavg 2>/dev/null) || load=""

  # Temperature: first field containing "°C" in the sensors output, else the
  # kernel thermal zone, else N/A. Each source is tried only when the
  # previous one produced nothing, so two readings are never concatenated.
  temp=$(
    { sensors 2>/dev/null || true; } | awk '
      /°C/ {
        for (i = 1; i <= NF; i++) {
          if ($i ~ /°C/) { v = $i; sub(/^\+/, "", v); print v; exit }
        }
      }'
  ) || temp=""
  if [[ -z "$temp" && -r "$zone" ]]; then
    temp=$(awk '{printf "%.1f°C", $1/1000}' "$zone" 2>/dev/null) || temp=""
  fi

  inode=$(get_inode)
  parts=$(get_top_partitions)
  top_cpu=$(get_top_procs %cpu)
  top_mem=$(get_top_procs %mem)

  # --- Status ---
  if (( cpu_p > 85 || m_p > 90 || d_p >= 90 )); then
    icon="🔴 Critical"
    status="⚠️ 资源占用过高"
  else
    icon="🟢 Healthy"
    status="✅ 未见明显异常"
  fi

  # --- Escape ---
  # The message is sent with parse_mode=HTML, so text coming from the system
  # (process names, mount points, host name) must not contain raw <, > or &.
  host=$(_report_html_escape "$host")
  ip=$(_report_html_escape "${ip:-N/A}")
  up=$(_report_html_escape "${up:-N/A}")
  load=$(_report_html_escape "${load:-N/A}")
  temp=$(_report_html_escape "${temp:-N/A}")
  inode=$(_report_html_escape "$inode")
  parts=$(_report_html_escape "$parts")
  top_cpu=$(_report_html_escape "$top_cpu")
  top_mem=$(_report_html_escape "$top_mem")

  cat <<EOF
<b>🖥 Server: ${host} (${ip})</b>
状态: <b>${icon}</b>
${status}
━━━━━━━━━━━━━━━━━━
<b>核心指标:</b>
<b>CPU </b> <code>$(draw_bar "$cpu_p")</code> <code>$(printf '%3d' "$cpu_p")%</code>
<b>RAM </b> <code>$(draw_bar "$m_p")</code> <code>$(printf '%3d' "$m_p")%</code>
<b>DSK </b> <code>$(draw_bar "$d_p")</code> <code>$(printf '%3d' "$d_p")%</code>
<b>📊 详细概览:</b>
• <b>运行时间:</b> <code>${up}</code>
• <b>平均负载:</b> <code>${load}</code>
• <b>内存详情:</b> <code>${m_usd}MB / ${m_tot}MB</code>
• <b>交换分区:</b> <code>${s_usd}MB / ${s_tot}MB (${s_p}%)</code>
• <b>根分区:</b> <code>${d_usd} / ${d_tot}</code>
• <b>Inode:</b> <code>${inode}</code>
• <b>系统温度:</b> <code>${temp}</code>
<b>🗄 繁忙分区 Top 3:</b>
<pre>${parts}</pre>
<b>🔥 CPU Top ${top_n} (PID/CPU/MEM/CMD):</b>
<pre>${top_cpu}</pre>
<b>🧠 MEM Top ${top_n} (PID/CPU/MEM/CMD):</b>
<pre>${top_mem}</pre>
🕒 <i>$(date '+%Y-%m-%d %H:%M:%S')</i>
EOF
}
# Post a message to the configured Telegram chat topic.
# Arguments: $1 - message text (sent with parse_mode=HTML)
# Globals:   BOT_TOKEN, CHAT_ID, TOPIC_ID (read)
# Returns:   1 with a message on stderr when BOT_TOKEN is empty; otherwise
#            curl's exit status
send_telegram() {
  local text="$1"

  if [[ -z "${BOT_TOKEN:-}" ]]; then
    echo "Error: BOT_TOKEN is empty." >&2
    return 1
  fi

  # -f turns an HTTP error status into a failing exit code, -sS hides the
  # progress meter but still prints errors, and --max-time bounds a hung
  # connection.
  curl -fsS --max-time 30 -X POST \
    "https://api.telegram.org/bot${BOT_TOKEN}/sendMessage" \
    -d "chat_id=${CHAT_ID}" \
    -d "message_thread_id=${TOPIC_ID}" \
    -d "parse_mode=HTML" \
    --data-urlencode "text=${text}" >/dev/null
}
# Build the report first, then send it. With the call nested directly in the
# argument list, a failing build_report would go unnoticed and an empty
# message would be posted instead.
report=$(build_report)
send_telegram "$report"