fix: CI failures - shutdown hang, bottest compile errors, botui deploy
Some checks failed
Botlib CI / build (push) Successful in 9s
BotServer CI / build (push) Successful in 3m52s
Bottest CI / build (push) Failing after 8s

- Add shutdown tracing and 15s forced exit to prevent SIGTERM hangs
- Fix E0583: remove self-referential mod declarations in bottest integration files
- Fix E0599: correct .status() call on Result in performance.rs
- Fix botui CI deploy: use systemctl stop/start instead of pkill+nohup
- Update PROD.md with DB-driven CI log retrieval method
This commit is contained in:
Rodrigo Rodriguez (Pragmatismo) 2026-04-22 11:25:05 +00:00
parent 1b25559a1b
commit eea9b24ef0
9 changed files with 111 additions and 33 deletions

View file

@@ -23,12 +23,12 @@ jobs:
run: |
cd /opt/gbo/work/generalbots
CARGO_BUILD_JOBS=4 cargo build -p botui --bin botui
- name: Deploy
run: |
BINARY=/opt/gbo/work/target/debug/botui
ssh -o StrictHostKeyChecking=no $SYSTEM_HOST "pkill -x botui || true"
sleep 2
scp -o StrictHostKeyChecking=no "$BINARY" $SYSTEM_HOST:/opt/gbo/bin/botui
ssh -o StrictHostKeyChecking=no $SYSTEM_HOST "cd /opt/gbo/bin && RUST_LOG=info nohup ./botui >> /opt/gbo/logs/stdout.log 2>> /opt/gbo/logs/stderr.log &"
sleep 3
ssh -o StrictHostKeyChecking=no $SYSTEM_HOST "pgrep -x botui && echo 'BotUI Deployed' || echo 'Failed'"
- name: Deploy
run: |
BINARY=/opt/gbo/work/target/debug/botui
ssh -o StrictHostKeyChecking=no $SYSTEM_HOST "sudo systemctl stop botui"
sleep 2
scp -o StrictHostKeyChecking=no "$BINARY" $SYSTEM_HOST:/opt/gbo/bin/botui
ssh -o StrictHostKeyChecking=no $SYSTEM_HOST "sudo systemctl start botui"
sleep 3
ssh -o StrictHostKeyChecking=no $SYSTEM_HOST "pgrep -x botui && echo 'BotUI Deployed' || echo 'Failed'"

82
PROD.md
View file

@@ -385,15 +385,85 @@ curl -X DELETE "http://<directory-ip>:8080/v2/users/<user-id>" \
| List users | `POST /v2/users` |
| Update password | `POST /v2/users/{id}/password` |
# /tmp permission denied for build.log
sudo incus exec alm-ci -- chmod 1777 /tmp
sudo incus exec alm-ci -- touch /tmp/build.log && chmod 666 /tmp/build.log
### CI/CD Log Retrieval from Database (PREFERRED METHOD)
# Clean old CI runs (keep recent)
sudo incus exec tables -- bash -c 'export PGPASSWORD=<postgres-password>; psql -h localhost -U postgres -d PROD-ALM -c "DELETE FROM action_run WHERE id < <RECENT_ID>;"'
sudo incus exec tables -- bash -c 'export PGPASSWORD=<postgres-password>; psql -h localhost -U postgres -d PROD-ALM -c "DELETE FROM action_run_job WHERE run_id < <RECENT_ID>;"'
The most reliable way to get CI build logs — including compiler errors — is from the Forgejo ALM database and compressed log files. The runner logs (`/opt/gbo/logs/forgejo-runner.log`) show live activity but scroll away quickly. The database retains everything.
**Status codes:** 0=pending, 1=success, 2=failure, 3=cancelled, 6=running
**Step 1 — List recent runs with workflow name and status:**
```sql
-- Connect to ALM database
sudo incus exec tables -- psql -h localhost -U postgres -d PROD-ALM
SELECT ar.id, ar.title, ar.workflow_id, ar.status,
to_timestamp(ar.created) AS created_at
FROM action_run ar
ORDER BY ar.id DESC LIMIT 10;
```
**Step 2 — Get job/task IDs for a failed run:**
```sql
SELECT arj.id AS job_id, arj.name, arj.status, arj.task_id
FROM action_run_job arj
WHERE arj.run_id = <FAILED_RUN_ID>;
```
**Step 3 — Get step-level status (which step failed):**
```sql
SELECT ats.name, ats.status, ats.log_index, ats.log_length
FROM action_task_step ats
WHERE ats.task_id = <TASK_ID>
ORDER BY ats.index;
```
**Step 4 — Read the full build log (contains compiler errors):**
```bash
# 1. Get the log filename from action_task
sudo incus exec tables -- psql -h localhost -U postgres -d PROD-ALM \
-c "SELECT log_filename FROM action_task WHERE id = <TASK_ID>;"
# 2. Pull and decompress the log from alm container
# Log files are zstd-compressed at: /opt/gbo/data/data/actions_log/<repo-path>/<task_id>.log.zst
sudo incus file pull alm/opt/gbo/data/data/actions_log/<LOG_FILENAME> /tmp/ci-log.log.zst
zstd -d /tmp/ci-log.log.zst -o /tmp/ci-log.log
cat /tmp/ci-log.log
```
**One-liner to read latest failed run log:**
```bash
TASK_ID=$(sudo incus exec tables -- psql -h localhost -U postgres -d PROD-ALM -t -c \
"SELECT at.id FROM action_task at JOIN action_run_job arj ON at.job_id = arj.id \
JOIN action_run ar ON arj.run_id = ar.id \
WHERE ar.status = 2 ORDER BY at.id DESC LIMIT 1;" | tr -d ' ')
LOG_FILE=$(sudo incus exec tables -- psql -h localhost -U postgres -d PROD-ALM -t -c \
"SELECT log_filename FROM action_task WHERE id = $TASK_ID;" | tr -d ' ')
sudo incus file pull "alm/opt/gbo/data/data/actions_log/$LOG_FILE" /tmp/ci-log.log.zst
zstd -d /tmp/ci-log.log.zst -o /tmp/ci-log.log 2>/dev/null && cat /tmp/ci-log.log
```
**Watch CI in real-time (supplementary):**
```bash
# Tail runner logs (live but ephemeral)
sudo incus exec alm-ci -- tail -f /opt/gbo/logs/forgejo-runner.log
# Watch for new runs
sudo incus exec tables -- psql -h localhost -U postgres -d PROD-ALM \
-c "SELECT id, title, workflow_id, status FROM action_run ORDER BY id DESC LIMIT 5;"
```
**Verify binary was updated after deploy:**
```bash
sudo incus exec system -- stat -c '%y' /opt/gbo/bin/botserver
sudo incus exec system -- systemctl status botserver --no-pager
curl -sf https://<system-domain>/api/health && echo "OK" || echo "FAILED"
```
**Understand build timing:**
- **Rust compilation**: 2-5 minutes (cold build), 30-60 seconds (incremental)
- **Deploy step**: ~5 seconds
- **Total CI time**: 2-6 minutes depending on cache
**Watch CI in real-time:**
```bash
# Tail runner logs

View file

@@ -626,8 +626,9 @@ pub async fn run_axum_server(
tokio::spawn(async move {
shutdown_signal().await;
info!("Shutting down HTTPS server...");
info!("Shutting down HTTPS server - draining active connections (10s timeout)...");
handle_clone.graceful_shutdown(Some(std::time::Duration::from_secs(10)));
info!("HTTPS graceful shutdown initiated, waiting for connections to drain...");
});
axum_server::bind_rustls(addr, tls_config)
@@ -656,9 +657,14 @@ pub async fn run_axum_server(
}
};
info!("HTTP server listening on {}", addr);
axum::serve(listener, app.into_make_service())
info!("Server ready - shutdown via SIGINT (Ctrl+C) or SIGTERM (systemctl stop)");
let result = axum::serve(listener, app.into_make_service())
.with_graceful_shutdown(shutdown_signal())
.await
.map_err(std::io::Error::other)
.await;
match &result {
Ok(()) => info!("HTTP server shut down gracefully"),
Err(e) => error!("HTTP server shutdown with error: {}", e),
}
result.map_err(std::io::Error::other)
}
}

View file

@@ -1,6 +1,6 @@
//! Shutdown signal handling
use log::{error, info};
use log::{error, info, warn};
pub fn print_shutdown_message() {
println!();
@@ -9,6 +9,8 @@ pub fn print_shutdown_message() {
}
pub async fn shutdown_signal() {
info!("Shutdown signal handler installed, waiting for SIGINT or SIGTERM...");
let ctrl_c = async {
if let Err(e) = tokio::signal::ctrl_c().await {
error!("Failed to install Ctrl+C handler: {}", e);
@@ -19,6 +21,7 @@ pub async fn shutdown_signal() {
let terminate = async {
match tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()) {
Ok(mut signal) => {
info!("SIGTERM handler installed successfully");
signal.recv().await;
}
Err(e) => {
@@ -32,12 +35,21 @@ pub async fn shutdown_signal() {
tokio::select! {
_ = ctrl_c => {
info!("Received Ctrl+C, initiating graceful shutdown...");
info!("Received SIGINT (Ctrl+C), initiating graceful shutdown...");
}
_ = terminate => {
info!("Received SIGTERM, initiating graceful shutdown...");
info!("Received SIGTERM (systemctl stop), initiating graceful shutdown...");
}
}
info!("Shutdown signal received - server will stop accepting new connections");
warn!("Graceful shutdown timeout is 10s for HTTPS, after which process will exit");
print_shutdown_message();
tokio::spawn(async {
tokio::time::sleep(std::time::Duration::from_secs(15)).await;
warn!("Graceful shutdown exceeded 15s - forcing process exit to prevent hang");
std::process::exit(0);
});
}

View file

@@ -1,5 +1,3 @@
mod accessibility;
use bottest::prelude::*;
use reqwest::Client;
use serde_json::json;

View file

@@ -1,5 +1,3 @@
mod compliance;
use bottest::prelude::*;
use reqwest::Client;
use serde_json::json;

View file

@@ -1,5 +1,3 @@
mod internationalization;
use bottest::prelude::*;
use reqwest::Client;
use serde_json::json;

View file

@@ -1,5 +1,3 @@
mod performance;
use bottest::prelude::*;
use reqwest::Client;
use std::time::{Duration, Instant};
@@ -71,7 +69,7 @@ async fn test_concurrent_requests_handled() {
let successes = results
.iter()
.filter(|r| r.as_ref().map(|resp| resp.status().is_success()).unwrap_or(false))
.filter(|r| r.as_ref().is_ok_and(|resp| resp.status().is_success()))
.count();
assert!(

View file

@@ -1,5 +1,3 @@
mod security;
use bottest::prelude::*;
use reqwest::Client;
use serde_json::json;