From eea9b24ef04ece9c87da9e1ad9d005d09c60ca2d Mon Sep 17 00:00:00 2001 From: "Rodrigo Rodriguez (Pragmatismo)" Date: Wed, 22 Apr 2026 11:25:05 +0000 Subject: [PATCH] fix: CI failures - shutdown hang, bottest compile errors, botui deploy - Add shutdown tracing and 15s forced exit to prevent SIGTERM hangs - Fix E0583: remove self-referential mod declarations in bottest integration files - Fix E0599: correct .status() call on Result in performance.rs - Fix botui CI deploy: use systemctl stop/start instead of pkill+nohup - Update PROD.md with DB-driven CI log retrieval method --- .forgejo/workflows/botui.yaml | 18 ++-- PROD.md | 82 +++++++++++++++++-- botserver/src/main_module/server.rs | 14 +++- botserver/src/main_module/shutdown.rs | 18 +++- bottest/tests/integration/accessibility.rs | 2 - bottest/tests/integration/compliance.rs | 2 - .../tests/integration/internationalization.rs | 2 - bottest/tests/integration/performance.rs | 4 +- bottest/tests/integration/security.rs | 2 - 9 files changed, 111 insertions(+), 33 deletions(-) diff --git a/.forgejo/workflows/botui.yaml b/.forgejo/workflows/botui.yaml index 703efa72..c3908a13 100644 --- a/.forgejo/workflows/botui.yaml +++ b/.forgejo/workflows/botui.yaml @@ -23,12 +23,12 @@ jobs: run: | cd /opt/gbo/work/generalbots CARGO_BUILD_JOBS=4 cargo build -p botui --bin botui - - name: Deploy - run: | - BINARY=/opt/gbo/work/target/debug/botui - ssh -o StrictHostKeyChecking=no $SYSTEM_HOST "pkill -x botui || true" - sleep 2 - scp -o StrictHostKeyChecking=no "$BINARY" $SYSTEM_HOST:/opt/gbo/bin/botui - ssh -o StrictHostKeyChecking=no $SYSTEM_HOST "cd /opt/gbo/bin && RUST_LOG=info nohup ./botui >> /opt/gbo/logs/stdout.log 2>> /opt/gbo/logs/stderr.log &" - sleep 3 - ssh -o StrictHostKeyChecking=no $SYSTEM_HOST "pgrep -x botui && echo 'BotUI Deployed' || echo 'Failed'" + - name: Deploy + run: | + BINARY=/opt/gbo/work/target/debug/botui + ssh -o StrictHostKeyChecking=no $SYSTEM_HOST "sudo systemctl stop botui" + sleep 2 + scp -o 
StrictHostKeyChecking=no "$BINARY" $SYSTEM_HOST:/opt/gbo/bin/botui + ssh -o StrictHostKeyChecking=no $SYSTEM_HOST "sudo systemctl start botui" + sleep 3 + ssh -o StrictHostKeyChecking=no $SYSTEM_HOST "pgrep -x botui && echo 'BotUI Deployed' || echo 'Failed'" diff --git a/PROD.md b/PROD.md index 2d5e9782..6cf8a651 100644 --- a/PROD.md +++ b/PROD.md @@ -385,15 +385,85 @@ curl -X DELETE "http://:8080/v2/users/" \ | List users | `POST /v2/users` | | Update password | `POST /v2/users/{id}/password` | -# /tmp permission denied for build.log -sudo incus exec alm-ci -- chmod 1777 /tmp -sudo incus exec alm-ci -- touch /tmp/build.log && chmod 666 /tmp/build.log +### CI/CD Log Retrieval from Database (PREFERRED METHOD) -# Clean old CI runs (keep recent) -sudo incus exec tables -- bash -c 'export PGPASSWORD=; psql -h localhost -U postgres -d PROD-ALM -c "DELETE FROM action_run WHERE id < ;"' -sudo incus exec tables -- bash -c 'export PGPASSWORD=; psql -h localhost -U postgres -d PROD-ALM -c "DELETE FROM action_run_job WHERE run_id < ;"' +The most reliable way to get CI build logs — including compiler errors — is from the Forgejo ALM database and compressed log files. The runner logs (`/opt/gbo/logs/forgejo-runner.log`) show live activity but scroll away quickly. The database retains everything. 
+ +**Status codes:** 0=pending, 1=success, 2=failure, 3=cancelled, 6=running + +**Step 1 — List recent runs with workflow name and status:** +```sql +-- Connect to ALM database +sudo incus exec tables -- psql -h localhost -U postgres -d PROD-ALM + +SELECT ar.id, ar.title, ar.workflow_id, ar.status, + to_timestamp(ar.created) AS created_at +FROM action_run ar +ORDER BY ar.id DESC LIMIT 10; ``` +**Step 2 — Get job/task IDs for a failed run:** +```sql +SELECT arj.id AS job_id, arj.name, arj.status, arj.task_id +FROM action_run_job arj +WHERE arj.run_id = <RUN_ID>; +``` + +**Step 3 — Get step-level status (which step failed):** +```sql +SELECT ats.name, ats.status, ats.log_index, ats.log_length +FROM action_task_step ats +WHERE ats.task_id = <TASK_ID> +ORDER BY ats.index; +``` + +**Step 4 — Read the full build log (contains compiler errors):** +```bash +# 1. Get the log filename from action_task +sudo incus exec tables -- psql -h localhost -U postgres -d PROD-ALM \ + -c "SELECT log_filename FROM action_task WHERE id = <TASK_ID>;" + +# 2. 
Pull and decompress the log from alm container +# Log files are zstd-compressed at: /opt/gbo/data/data/actions_log/<subdir>/<name>.log.zst +sudo incus file pull alm/opt/gbo/data/data/actions_log/<log_filename> /tmp/ci-log.log.zst +zstd -d /tmp/ci-log.log.zst -o /tmp/ci-log.log +cat /tmp/ci-log.log +``` + +**One-liner to read latest failed run log:** +```bash +TASK_ID=$(sudo incus exec tables -- psql -h localhost -U postgres -d PROD-ALM -t -c \ + "SELECT at.id FROM action_task at JOIN action_run_job arj ON at.job_id = arj.id \ + JOIN action_run ar ON arj.run_id = ar.id \ + WHERE ar.status = 2 ORDER BY at.id DESC LIMIT 1;" | tr -d ' ') +LOG_FILE=$(sudo incus exec tables -- psql -h localhost -U postgres -d PROD-ALM -t -c \ + "SELECT log_filename FROM action_task WHERE id = $TASK_ID;" | tr -d ' ') +sudo incus file pull "alm/opt/gbo/data/data/actions_log/$LOG_FILE" /tmp/ci-log.log.zst +zstd -d /tmp/ci-log.log.zst -o /tmp/ci-log.log 2>/dev/null && cat /tmp/ci-log.log +``` + +**Watch CI in real-time (supplementary):** +```bash +# Tail runner logs (live but ephemeral) +sudo incus exec alm-ci -- tail -f /opt/gbo/logs/forgejo-runner.log + +# Watch for new runs +sudo incus exec tables -- psql -h localhost -U postgres -d PROD-ALM \ + -c "SELECT id, title, workflow_id, status FROM action_run ORDER BY id DESC LIMIT 5;" +``` + +**Verify binary was updated after deploy:** +```bash +sudo incus exec system -- stat -c '%y' /opt/gbo/bin/botserver +sudo incus exec system -- systemctl status botserver --no-pager +curl -sf https://<host>/api/health && echo "OK" || echo "FAILED" +``` + +**Understand build timing:** +- **Rust compilation**: 2-5 minutes (cold build), 30-60 seconds (incremental) +- **Deploy step**: ~5 seconds +- **Total CI time**: 2-6 minutes depending on cache + **Watch CI in real-time:** ```bash # Tail runner logs diff --git a/botserver/src/main_module/server.rs b/botserver/src/main_module/server.rs index 7430289a..de8660d6 100644 --- a/botserver/src/main_module/server.rs +++ 
b/botserver/src/main_module/server.rs @@ -626,8 +626,9 @@ pub async fn run_axum_server( tokio::spawn(async move { shutdown_signal().await; - info!("Shutting down HTTPS server..."); + info!("Shutting down HTTPS server - draining active connections (10s timeout)..."); handle_clone.graceful_shutdown(Some(std::time::Duration::from_secs(10))); + info!("HTTPS graceful shutdown initiated, waiting for connections to drain..."); }); axum_server::bind_rustls(addr, tls_config) @@ -656,9 +657,14 @@ pub async fn run_axum_server( } }; info!("HTTP server listening on {}", addr); - axum::serve(listener, app.into_make_service()) + info!("Server ready - shutdown via SIGINT (Ctrl+C) or SIGTERM (systemctl stop)"); + let result = axum::serve(listener, app.into_make_service()) .with_graceful_shutdown(shutdown_signal()) - .await - .map_err(std::io::Error::other) + .await; + match &result { + Ok(()) => info!("HTTP server shut down gracefully"), + Err(e) => error!("HTTP server shutdown with error: {}", e), + } + result.map_err(std::io::Error::other) } } diff --git a/botserver/src/main_module/shutdown.rs b/botserver/src/main_module/shutdown.rs index e1e296d1..6b9266df 100644 --- a/botserver/src/main_module/shutdown.rs +++ b/botserver/src/main_module/shutdown.rs @@ -1,6 +1,6 @@ //! 
Shutdown signal handling -use log::{error, info}; +use log::{error, info, warn}; pub fn print_shutdown_message() { println!(); @@ -9,6 +9,8 @@ pub fn print_shutdown_message() { } pub async fn shutdown_signal() { + info!("Shutdown signal handler installed, waiting for SIGINT or SIGTERM..."); + let ctrl_c = async { if let Err(e) = tokio::signal::ctrl_c().await { error!("Failed to install Ctrl+C handler: {}", e); @@ -19,6 +21,7 @@ pub async fn shutdown_signal() { let terminate = async { match tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()) { Ok(mut signal) => { + info!("SIGTERM handler installed successfully"); signal.recv().await; } Err(e) => { @@ -32,12 +35,21 @@ pub async fn shutdown_signal() { tokio::select! { _ = ctrl_c => { - info!("Received Ctrl+C, initiating graceful shutdown..."); + info!("Received SIGINT (Ctrl+C), initiating graceful shutdown..."); } _ = terminate => { - info!("Received SIGTERM, initiating graceful shutdown..."); + info!("Received SIGTERM (systemctl stop), initiating graceful shutdown..."); } } + info!("Shutdown signal received - server will stop accepting new connections"); + warn!("Graceful shutdown timeout is 10s for HTTPS, after which process will exit"); + print_shutdown_message(); + + tokio::spawn(async { + tokio::time::sleep(std::time::Duration::from_secs(15)).await; + warn!("Graceful shutdown exceeded 15s - forcing process exit to prevent hang"); + std::process::exit(0); + }); } diff --git a/bottest/tests/integration/accessibility.rs b/bottest/tests/integration/accessibility.rs index d07c969a..698e057b 100644 --- a/bottest/tests/integration/accessibility.rs +++ b/bottest/tests/integration/accessibility.rs @@ -1,5 +1,3 @@ -mod accessibility; - use bottest::prelude::*; use reqwest::Client; use serde_json::json; diff --git a/bottest/tests/integration/compliance.rs b/bottest/tests/integration/compliance.rs index f23eca37..9d5614db 100644 --- a/bottest/tests/integration/compliance.rs +++ 
b/bottest/tests/integration/compliance.rs @@ -1,5 +1,3 @@ -mod compliance; - use bottest::prelude::*; use reqwest::Client; use serde_json::json; diff --git a/bottest/tests/integration/internationalization.rs b/bottest/tests/integration/internationalization.rs index fad044da..010252eb 100644 --- a/bottest/tests/integration/internationalization.rs +++ b/bottest/tests/integration/internationalization.rs @@ -1,5 +1,3 @@ -mod internationalization; - use bottest::prelude::*; use reqwest::Client; use serde_json::json; diff --git a/bottest/tests/integration/performance.rs b/bottest/tests/integration/performance.rs index f8db88a7..a82c09eb 100644 --- a/bottest/tests/integration/performance.rs +++ b/bottest/tests/integration/performance.rs @@ -1,5 +1,3 @@ -mod performance; - use bottest::prelude::*; use reqwest::Client; use std::time::{Duration, Instant}; @@ -71,7 +69,7 @@ async fn test_concurrent_requests_handled() { let successes = results .iter() - .filter(|r| r.as_ref().map(|resp| resp.status().is_success()).unwrap_or(false)) + .filter(|r| r.as_ref().is_ok_and(|resp| resp.status().is_success())) .count(); assert!( diff --git a/bottest/tests/integration/security.rs b/bottest/tests/integration/security.rs index 01deb04f..ce640d29 100644 --- a/bottest/tests/integration/security.rs +++ b/bottest/tests/integration/security.rs @@ -1,5 +1,3 @@ -mod security; - use bottest::prelude::*; use reqwest::Client; use serde_json::json;