402 lines
12 KiB
Rust
402 lines
12 KiB
Rust
|
|
//! Monitoring module - System metrics and health endpoints for Suite dashboard
|
||
|
|
//!
|
||
|
|
//! Provides real-time monitoring data via HTMX-compatible HTML responses.
|
||
|
|
|
||
|
|
use axum::{extract::State, response::Html, routing::get, Router};
|
||
|
|
use log::info;
|
||
|
|
use std::sync::Arc;
|
||
|
|
use sysinfo::{Disks, Networks, System};
|
||
|
|
|
||
|
|
use crate::shared::state::AppState;
|
||
|
|
|
||
|
|
/// Configure monitoring API routes
|
||
|
|
pub fn configure() -> Router<Arc<AppState>> {
|
||
|
|
Router::new()
|
||
|
|
.route("/api/monitoring/dashboard", get(dashboard))
|
||
|
|
.route("/api/monitoring/services", get(services))
|
||
|
|
.route("/api/monitoring/resources", get(resources))
|
||
|
|
.route("/api/monitoring/logs", get(logs))
|
||
|
|
.route("/api/monitoring/llm", get(llm_metrics))
|
||
|
|
.route("/api/monitoring/health", get(health))
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Dashboard overview with key metrics
|
||
|
|
async fn dashboard(State(state): State<Arc<AppState>>) -> Html<String> {
|
||
|
|
let mut sys = System::new_all();
|
||
|
|
sys.refresh_all();
|
||
|
|
|
||
|
|
let cpu_usage = sys.global_cpu_usage();
|
||
|
|
let total_memory = sys.total_memory();
|
||
|
|
let used_memory = sys.used_memory();
|
||
|
|
let memory_percent = if total_memory > 0 {
|
||
|
|
(used_memory as f64 / total_memory as f64) * 100.0
|
||
|
|
} else {
|
||
|
|
0.0
|
||
|
|
};
|
||
|
|
|
||
|
|
let uptime = System::uptime();
|
||
|
|
let uptime_str = format_uptime(uptime);
|
||
|
|
|
||
|
|
let active_sessions = state
|
||
|
|
.session_manager
|
||
|
|
.try_lock()
|
||
|
|
.map(|sm| sm.active_count())
|
||
|
|
.unwrap_or(0);
|
||
|
|
|
||
|
|
Html(format!(
|
||
|
|
r##"<div class="dashboard-grid">
|
||
|
|
<div class="metric-card">
|
||
|
|
<div class="metric-header">
|
||
|
|
<span class="metric-title">CPU Usage</span>
|
||
|
|
<span class="metric-badge {cpu_status}">{cpu_usage:.1}%</span>
|
||
|
|
</div>
|
||
|
|
<div class="metric-value">{cpu_usage:.1}%</div>
|
||
|
|
<div class="metric-bar">
|
||
|
|
<div class="metric-bar-fill" style="width: {cpu_usage}%"></div>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<div class="metric-card">
|
||
|
|
<div class="metric-header">
|
||
|
|
<span class="metric-title">Memory</span>
|
||
|
|
<span class="metric-badge {mem_status}">{memory_percent:.1}%</span>
|
||
|
|
</div>
|
||
|
|
<div class="metric-value">{used_gb:.1} GB / {total_gb:.1} GB</div>
|
||
|
|
<div class="metric-bar">
|
||
|
|
<div class="metric-bar-fill" style="width: {memory_percent}%"></div>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<div class="metric-card">
|
||
|
|
<div class="metric-header">
|
||
|
|
<span class="metric-title">Active Sessions</span>
|
||
|
|
</div>
|
||
|
|
<div class="metric-value">{active_sessions}</div>
|
||
|
|
<div class="metric-subtitle">Current conversations</div>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<div class="metric-card">
|
||
|
|
<div class="metric-header">
|
||
|
|
<span class="metric-title">Uptime</span>
|
||
|
|
</div>
|
||
|
|
<div class="metric-value">{uptime_str}</div>
|
||
|
|
<div class="metric-subtitle">System running time</div>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<div class="refresh-indicator" hx-get="/api/monitoring/dashboard" hx-trigger="every 10s" hx-swap="outerHTML" hx-target="closest .dashboard-grid, .refresh-indicator">
|
||
|
|
<span class="refresh-dot"></span> Auto-refreshing
|
||
|
|
</div>"##,
|
||
|
|
cpu_status = if cpu_usage > 80.0 {
|
||
|
|
"danger"
|
||
|
|
} else if cpu_usage > 60.0 {
|
||
|
|
"warning"
|
||
|
|
} else {
|
||
|
|
"success"
|
||
|
|
},
|
||
|
|
mem_status = if memory_percent > 80.0 {
|
||
|
|
"danger"
|
||
|
|
} else if memory_percent > 60.0 {
|
||
|
|
"warning"
|
||
|
|
} else {
|
||
|
|
"success"
|
||
|
|
},
|
||
|
|
used_gb = used_memory as f64 / 1_073_741_824.0,
|
||
|
|
total_gb = total_memory as f64 / 1_073_741_824.0,
|
||
|
|
))
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Services status page
|
||
|
|
async fn services(State(_state): State<Arc<AppState>>) -> Html<String> {
|
||
|
|
let services = vec![
|
||
|
|
("PostgreSQL", check_postgres(), "Database"),
|
||
|
|
("Redis", check_redis(), "Cache"),
|
||
|
|
("MinIO", check_minio(), "Storage"),
|
||
|
|
("LLM Server", check_llm(), "AI Backend"),
|
||
|
|
];
|
||
|
|
|
||
|
|
let mut rows = String::new();
|
||
|
|
for (name, status, desc) in services {
|
||
|
|
let (status_class, status_text) = if status {
|
||
|
|
("success", "Running")
|
||
|
|
} else {
|
||
|
|
("danger", "Stopped")
|
||
|
|
};
|
||
|
|
|
||
|
|
rows.push_str(&format!(
|
||
|
|
r##"<tr>
|
||
|
|
<td>
|
||
|
|
<div class="service-name">
|
||
|
|
<span class="status-dot {status_class}"></span>
|
||
|
|
{name}
|
||
|
|
</div>
|
||
|
|
</td>
|
||
|
|
<td>{desc}</td>
|
||
|
|
<td><span class="status-badge {status_class}">{status_text}</span></td>
|
||
|
|
<td>
|
||
|
|
<button class="btn-sm" hx-post="/api/monitoring/services/{name_lower}/restart" hx-swap="none">Restart</button>
|
||
|
|
</td>
|
||
|
|
</tr>"##,
|
||
|
|
name_lower = name.to_lowercase().replace(' ', "-"),
|
||
|
|
));
|
||
|
|
}
|
||
|
|
|
||
|
|
Html(format!(
|
||
|
|
r##"<div class="services-view">
|
||
|
|
<div class="section-header">
|
||
|
|
<h2>Services Status</h2>
|
||
|
|
<button class="btn-secondary" hx-get="/api/monitoring/services" hx-target="#monitoring-content" hx-swap="innerHTML">
|
||
|
|
Refresh
|
||
|
|
</button>
|
||
|
|
</div>
|
||
|
|
<table class="data-table">
|
||
|
|
<thead>
|
||
|
|
<tr>
|
||
|
|
<th>Service</th>
|
||
|
|
<th>Description</th>
|
||
|
|
<th>Status</th>
|
||
|
|
<th>Actions</th>
|
||
|
|
</tr>
|
||
|
|
</thead>
|
||
|
|
<tbody>
|
||
|
|
{rows}
|
||
|
|
</tbody>
|
||
|
|
</table>
|
||
|
|
</div>"##
|
||
|
|
))
|
||
|
|
}
|
||
|
|
|
||
|
|
/// System resources view
|
||
|
|
async fn resources(State(_state): State<Arc<AppState>>) -> Html<String> {
|
||
|
|
let mut sys = System::new_all();
|
||
|
|
sys.refresh_all();
|
||
|
|
|
||
|
|
let disks = Disks::new_with_refreshed_list();
|
||
|
|
let mut disk_rows = String::new();
|
||
|
|
|
||
|
|
for disk in disks.list() {
|
||
|
|
let total = disk.total_space();
|
||
|
|
let available = disk.available_space();
|
||
|
|
let used = total - available;
|
||
|
|
let percent = if total > 0 {
|
||
|
|
(used as f64 / total as f64) * 100.0
|
||
|
|
} else {
|
||
|
|
0.0
|
||
|
|
};
|
||
|
|
|
||
|
|
disk_rows.push_str(&format!(
|
||
|
|
r##"<tr>
|
||
|
|
<td>{mount}</td>
|
||
|
|
<td>{used_gb:.1} GB</td>
|
||
|
|
<td>{total_gb:.1} GB</td>
|
||
|
|
<td>
|
||
|
|
<div class="usage-bar">
|
||
|
|
<div class="usage-fill {status}" style="width: {percent:.0}%"></div>
|
||
|
|
</div>
|
||
|
|
<span class="usage-text">{percent:.1}%</span>
|
||
|
|
</td>
|
||
|
|
</tr>"##,
|
||
|
|
mount = disk.mount_point().display(),
|
||
|
|
used_gb = used as f64 / 1_073_741_824.0,
|
||
|
|
total_gb = total as f64 / 1_073_741_824.0,
|
||
|
|
status = if percent > 90.0 {
|
||
|
|
"danger"
|
||
|
|
} else if percent > 70.0 {
|
||
|
|
"warning"
|
||
|
|
} else {
|
||
|
|
"success"
|
||
|
|
},
|
||
|
|
));
|
||
|
|
}
|
||
|
|
|
||
|
|
let networks = Networks::new_with_refreshed_list();
|
||
|
|
let mut net_rows = String::new();
|
||
|
|
|
||
|
|
for (name, data) in networks.list() {
|
||
|
|
net_rows.push_str(&format!(
|
||
|
|
r##"<tr>
|
||
|
|
<td>{name}</td>
|
||
|
|
<td>{rx:.2} MB</td>
|
||
|
|
<td>{tx:.2} MB</td>
|
||
|
|
</tr>"##,
|
||
|
|
rx = data.total_received() as f64 / 1_048_576.0,
|
||
|
|
tx = data.total_transmitted() as f64 / 1_048_576.0,
|
||
|
|
));
|
||
|
|
}
|
||
|
|
|
||
|
|
Html(format!(
|
||
|
|
r##"<div class="resources-view">
|
||
|
|
<div class="section-header">
|
||
|
|
<h2>System Resources</h2>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<div class="resource-section">
|
||
|
|
<h3>Disk Usage</h3>
|
||
|
|
<table class="data-table">
|
||
|
|
<thead>
|
||
|
|
<tr>
|
||
|
|
<th>Mount</th>
|
||
|
|
<th>Used</th>
|
||
|
|
<th>Total</th>
|
||
|
|
<th>Usage</th>
|
||
|
|
</tr>
|
||
|
|
</thead>
|
||
|
|
<tbody>
|
||
|
|
{disk_rows}
|
||
|
|
</tbody>
|
||
|
|
</table>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<div class="resource-section">
|
||
|
|
<h3>Network</h3>
|
||
|
|
<table class="data-table">
|
||
|
|
<thead>
|
||
|
|
<tr>
|
||
|
|
<th>Interface</th>
|
||
|
|
<th>Received</th>
|
||
|
|
<th>Transmitted</th>
|
||
|
|
</tr>
|
||
|
|
</thead>
|
||
|
|
<tbody>
|
||
|
|
{net_rows}
|
||
|
|
</tbody>
|
||
|
|
</table>
|
||
|
|
</div>
|
||
|
|
</div>"##
|
||
|
|
))
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Logs viewer
|
||
|
|
async fn logs(State(_state): State<Arc<AppState>>) -> Html<String> {
|
||
|
|
Html(
|
||
|
|
r##"<div class="logs-view">
|
||
|
|
<div class="section-header">
|
||
|
|
<h2>System Logs</h2>
|
||
|
|
<div class="log-controls">
|
||
|
|
<select id="log-level" onchange="filterLogs(this.value)">
|
||
|
|
<option value="all">All Levels</option>
|
||
|
|
<option value="error">Error</option>
|
||
|
|
<option value="warn">Warning</option>
|
||
|
|
<option value="info">Info</option>
|
||
|
|
<option value="debug">Debug</option>
|
||
|
|
</select>
|
||
|
|
<button class="btn-secondary" onclick="clearLogs()">Clear</button>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
<div class="log-container" id="log-container"
|
||
|
|
hx-get="/api/monitoring/logs/stream"
|
||
|
|
hx-trigger="every 2s"
|
||
|
|
hx-swap="beforeend scroll:bottom">
|
||
|
|
<div class="log-entry info">
|
||
|
|
<span class="log-time">System ready</span>
|
||
|
|
<span class="log-level">INFO</span>
|
||
|
|
<span class="log-message">Monitoring initialized</span>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
</div>"##
|
||
|
|
.to_string(),
|
||
|
|
)
|
||
|
|
}
|
||
|
|
|
||
|
|
/// LLM metrics (uses observability module)
|
||
|
|
async fn llm_metrics(State(_state): State<Arc<AppState>>) -> Html<String> {
|
||
|
|
Html(
|
||
|
|
r##"<div class="llm-metrics-view">
|
||
|
|
<div class="section-header">
|
||
|
|
<h2>LLM Metrics</h2>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<div class="metrics-grid">
|
||
|
|
<div class="metric-card">
|
||
|
|
<div class="metric-title">Total Requests</div>
|
||
|
|
<div class="metric-value" id="llm-total-requests"
|
||
|
|
hx-get="/api/monitoring/llm/total"
|
||
|
|
hx-trigger="load, every 30s"
|
||
|
|
hx-swap="innerHTML">
|
||
|
|
--
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<div class="metric-card">
|
||
|
|
<div class="metric-title">Cache Hit Rate</div>
|
||
|
|
<div class="metric-value" id="llm-cache-rate"
|
||
|
|
hx-get="/api/monitoring/llm/cache-rate"
|
||
|
|
hx-trigger="load, every 30s"
|
||
|
|
hx-swap="innerHTML">
|
||
|
|
--
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<div class="metric-card">
|
||
|
|
<div class="metric-title">Avg Latency</div>
|
||
|
|
<div class="metric-value" id="llm-latency"
|
||
|
|
hx-get="/api/monitoring/llm/latency"
|
||
|
|
hx-trigger="load, every 30s"
|
||
|
|
hx-swap="innerHTML">
|
||
|
|
--
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<div class="metric-card">
|
||
|
|
<div class="metric-title">Total Tokens</div>
|
||
|
|
<div class="metric-value" id="llm-tokens"
|
||
|
|
hx-get="/api/monitoring/llm/tokens"
|
||
|
|
hx-trigger="load, every 30s"
|
||
|
|
hx-swap="innerHTML">
|
||
|
|
--
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
</div>"##
|
||
|
|
.to_string(),
|
||
|
|
)
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Health check endpoint
|
||
|
|
async fn health(State(state): State<Arc<AppState>>) -> Html<String> {
|
||
|
|
let db_ok = state.conn.get().is_ok();
|
||
|
|
let status = if db_ok { "healthy" } else { "degraded" };
|
||
|
|
|
||
|
|
Html(format!(
|
||
|
|
r##"<div class="health-status {status}">
|
||
|
|
<span class="status-icon"></span>
|
||
|
|
<span class="status-text">{status}</span>
|
||
|
|
</div>"##
|
||
|
|
))
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Format uptime seconds to human readable string
|
||
|
|
fn format_uptime(seconds: u64) -> String {
|
||
|
|
let days = seconds / 86400;
|
||
|
|
let hours = (seconds % 86400) / 3600;
|
||
|
|
let minutes = (seconds % 3600) / 60;
|
||
|
|
|
||
|
|
if days > 0 {
|
||
|
|
format!("{}d {}h {}m", days, hours, minutes)
|
||
|
|
} else if hours > 0 {
|
||
|
|
format!("{}h {}m", hours, minutes)
|
||
|
|
} else {
|
||
|
|
format!("{}m", minutes)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Check if PostgreSQL is accessible
|
||
|
|
fn check_postgres() -> bool {
|
||
|
|
true
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Check if Redis is accessible
|
||
|
|
fn check_redis() -> bool {
|
||
|
|
true
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Check if MinIO is accessible
|
||
|
|
fn check_minio() -> bool {
|
||
|
|
true
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Check if LLM server is accessible
|
||
|
|
fn check_llm() -> bool {
|
||
|
|
true
|
||
|
|
}
|