fix(bootstrap): Improve Vault startup diagnostics and error handling

- Fix create_conn/establish_pg_connection to return Result instead of panicking
- Fix AppConfig::from_env to not require database access (circular dependency)
- Add #[cfg(test)] to AppState Default impl to prevent accidental panic
- Add extensive debug logging for Vault startup troubleshooting
- Remove Stdio::null() from start() to allow shell redirections to work
- Add direct vault start test in bootstrap for debugging
- Make Vault setup failure fatal (was silently continuing)
This commit is contained in:
Rodrigo Rodriguez (Pragmatismo) 2025-12-10 08:30:49 -03:00
parent 0b9ad6c80d
commit 2da5f0ccdf
7 changed files with 289 additions and 50 deletions

View file

@ -1,7 +1,7 @@
{
"base_url": "http://localhost:8080",
"default_org": {
"id": "350284283375517710",
"id": "350425878548709390",
"name": "default",
"domain": "default.localhost"
},
@ -13,8 +13,8 @@
"first_name": "Admin",
"last_name": "User"
},
"admin_token": "mp1N0PI5mP7VNbj-g-d1e-LFFxV22l6pHuCdPvcbQtS0U35e_jLFIY1GsgREaMOqvrtAu3E",
"admin_token": "lRXJsd9yeOw9Jm70e_cDPoumcueHnZzNusVisZdXgZN4lNLnmz61UdvdPOYYVb2G2gPuT1o",
"project_id": "",
"client_id": "350284283929231374",
"client_secret": "sBttWgX1v1ENGDyqBxtPRLItMf8Y4oQHk2hAoBStW6BMPuYQIY6xV6dkaSxsjSoe"
"client_id": "350425879152754702",
"client_secret": "VKkjSPTwsneXzr7z5bxpruC60BV4k8PZH5aSX0XyCdTZrhaAykdQOdbaxRfTjGs5"
}

View file

@ -13,6 +13,7 @@ use rcgen::{
BasicConstraints, CertificateParams, DistinguishedName, DnType, IsCa, Issuer, KeyPair,
};
use std::fs;
use std::io::Write;
#[cfg(unix)]
use std::os::unix::fs::PermissionsExt;
use std::path::{Path, PathBuf};
@ -648,7 +649,27 @@ impl BootstrapManager {
.status();
std::thread::sleep(std::time::Duration::from_millis(200));
}
_ = pm.install(component).await;
eprintln!("[DEBUG] Installing component: {}", component);
let _ = std::io::stderr().flush();
info!("Installing component: {}", component);
let install_result = pm.install(component).await;
eprintln!(
"[DEBUG] Install result for {}: {:?}",
component,
install_result.is_ok()
);
let _ = std::io::stderr().flush();
if let Err(e) = install_result {
eprintln!("[DEBUG] Failed to install component {}: {}", component, e);
error!("Failed to install component {}: {}", component, e);
if component == "vault" {
return Err(anyhow::anyhow!("Failed to install Vault: {}", e));
}
}
eprintln!("[DEBUG] Component {} installed successfully", component);
let _ = std::io::stderr().flush();
info!("Component {} installed successfully", component);
// After tables is installed, START PostgreSQL and create Zitadel config files before installing directory
if component == "tables" {
@ -698,19 +719,124 @@ impl BootstrapManager {
// After Vault is installed, START the server then initialize it
if component == "vault" {
eprintln!("[VAULT DEBUG] === VAULT SETUP BLOCK ENTERED ===");
eprintln!(
"[VAULT DEBUG] Current working directory: {:?}",
std::env::current_dir()
);
eprintln!("[VAULT DEBUG] base_path: {:?}", pm.base_path);
let _ = std::io::stderr().flush();
info!("=== VAULT SETUP BLOCK ENTERED ===");
info!("Starting Vault server...");
match pm.start("vault") {
Ok(_) => {
info!("Vault server started");
// Give Vault time to start
tokio::time::sleep(tokio::time::Duration::from_secs(3)).await;
}
Err(e) => {
warn!("Failed to start Vault server: {}", e);
// Verify vault binary exists and is executable
let vault_bin = PathBuf::from("./botserver-stack/bin/vault/vault");
if !vault_bin.exists() {
eprintln!("[VAULT DEBUG] Vault binary not found at {:?}", vault_bin);
let _ = std::io::stderr().flush();
error!("Vault binary not found at {:?}", vault_bin);
return Err(anyhow::anyhow!("Vault binary not found after installation"));
}
eprintln!("[VAULT DEBUG] Vault binary exists at {:?}", vault_bin);
let _ = std::io::stderr().flush();
info!("Vault binary exists at {:?}", vault_bin);
// Ensure logs directory exists
let vault_log_path = PathBuf::from("./botserver-stack/logs/vault/vault.log");
if let Some(parent) = vault_log_path.parent() {
if let Err(e) = fs::create_dir_all(parent) {
eprintln!("[VAULT DEBUG] Failed to create vault logs directory: {}", e);
error!("Failed to create vault logs directory: {}", e);
}
}
// Ensure data directory exists
let vault_data_path = PathBuf::from("./botserver-stack/data/vault");
if let Err(e) = fs::create_dir_all(&vault_data_path) {
eprintln!("[VAULT DEBUG] Failed to create vault data directory: {}", e);
error!("Failed to create vault data directory: {}", e);
}
eprintln!("[VAULT DEBUG] Starting Vault server...");
let _ = std::io::stderr().flush();
info!("Starting Vault server...");
// Try starting vault directly first to see if it works
eprintln!("[VAULT DEBUG] Testing direct vault start...");
let direct_test = std::process::Command::new("sh")
.arg("-c")
.arg("cd ./botserver-stack/bin/vault && nohup ./vault server -config=../../conf/vault/config.hcl > ../../logs/vault/vault.log 2>&1 &")
.status();
eprintln!("[VAULT DEBUG] Direct test result: {:?}", direct_test);
std::thread::sleep(std::time::Duration::from_secs(2));
// Check if it's running now
let check = std::process::Command::new("pgrep")
.args(["-f", "vault server"])
.output();
if let Ok(output) = &check {
let pids = String::from_utf8_lossy(&output.stdout);
eprintln!(
"[VAULT DEBUG] After direct start, pgrep result: '{}'",
pids.trim()
);
if !pids.trim().is_empty() {
eprintln!("[VAULT DEBUG] Vault started via direct command!");
// Skip pm.start since vault is already running
info!("Vault server started");
tokio::time::sleep(tokio::time::Duration::from_secs(3)).await;
} else {
eprintln!("[VAULT DEBUG] Direct start failed, trying pm.start...");
match pm.start("vault") {
Ok(_) => {
eprintln!("[VAULT DEBUG] pm.start returned Ok");
info!("Vault server started");
tokio::time::sleep(tokio::time::Duration::from_secs(5)).await;
}
Err(e) => {
eprintln!("[VAULT DEBUG] pm.start failed: {}", e);
error!("Failed to start Vault server: {}", e);
return Err(anyhow::anyhow!(
"Failed to start Vault server: {}",
e
));
}
}
}
}
// Check log file
eprintln!(
"[VAULT DEBUG] Checking if vault.log exists: {}",
vault_log_path.exists()
);
if vault_log_path.exists() {
    if let Ok(content) = fs::read_to_string(&vault_log_path) {
        // Cut the preview on a character boundary: slicing a &str at a fixed byte
        // offset panics when that offset falls inside a multi-byte UTF-8 sequence,
        // and a diagnostic print must never be able to abort the bootstrap.
        let preview_end = content
            .char_indices()
            .nth(500)
            .map_or(content.len(), |(idx, _)| idx);
        eprintln!(
            "[VAULT DEBUG] vault.log content (first 500 chars): {}",
            &content[..preview_end]
        );
    }
}
// Final verification: whichever start path ran above (direct command or pm.start),
// confirm that a Vault process actually exists before trying to initialize it
let final_check = std::process::Command::new("pgrep")
.args(["-f", "vault server"])
.output();
if let Ok(output) = final_check {
let pids = String::from_utf8_lossy(&output.stdout);
if pids.trim().is_empty() {
eprintln!(
"[VAULT DEBUG] CRITICAL: Vault is not running after all attempts!"
);
return Err(anyhow::anyhow!("Failed to start Vault server"));
} else {
eprintln!("[VAULT DEBUG] Vault is running with PIDs: {}", pids.trim());
}
}
eprintln!("[VAULT DEBUG] Initializing Vault with secrets...");
let _ = std::io::stderr().flush();
info!("Initializing Vault with secrets...");
if let Err(e) = self
.setup_vault(
@ -722,6 +848,19 @@ impl BootstrapManager {
.await
{
error!("Failed to setup Vault: {}", e);
// Check vault.log for more details
if vault_log_path.exists() {
if let Ok(log_content) = fs::read_to_string(&vault_log_path) {
let last_lines: Vec<&str> =
log_content.lines().rev().take(20).collect();
error!("Vault log (last 20 lines):");
for line in last_lines.iter().rev() {
error!(" {}", line);
}
}
}
// Vault is critical - fail the bootstrap
return Err(anyhow::anyhow!("Vault setup failed: {}. Check ./botserver-stack/logs/vault/vault.log for details.", e));
}
// Initialize the global SecretsManager so other components can use Vault
@ -1182,6 +1321,31 @@ meet IN A 127.0.0.1
let max_attempts = 30;
while attempts < max_attempts {
// First check if the Vault process is running.
// pgrep is invoked directly instead of through `sh -c "pgrep ... || echo ..."`:
// with -f the pattern is matched against full command lines, and the wrapping
// shell's own command line contains the pattern, so the shell would always be
// reported as a match and the "not running" branch could never trigger.
let ps_check = std::process::Command::new("pgrep")
    .args(["-f", "vault server"])
    .output();
if let Ok(ps_output) = ps_check {
    let pids = String::from_utf8_lossy(&ps_output.stdout);
    // pgrep prints one PID per matching process; empty output means no match.
    if pids.trim().is_empty() {
        warn!("Vault process is not running (attempt {})", attempts + 1);
        // Check vault.log for crash info
        let vault_log_path = PathBuf::from("./botserver-stack/logs/vault/vault.log");
        if vault_log_path.exists() {
            if let Ok(log_content) = fs::read_to_string(&vault_log_path) {
                // Take the tail from the end, then restore chronological order.
                let last_lines: Vec<&str> = log_content.lines().rev().take(10).collect();
                warn!("Vault log (last 10 lines):");
                for line in last_lines.iter().rev() {
                    warn!("  {}", line);
                }
            }
        }
    }
}
let health_check = std::process::Command::new("curl")
.args(["-f", "-s", "http://localhost:8200/v1/sys/health?standbyok=true&uninitcode=200&sealedcode=200"])
.output();
@ -1190,7 +1354,15 @@ meet IN A 127.0.0.1
if output.status.success() {
info!("Vault is responding");
break;
} else {
// Log the HTTP response for debugging
let stderr = String::from_utf8_lossy(&output.stderr);
if !stderr.is_empty() && attempts % 5 == 0 {
debug!("Vault health check attempt {}: {}", attempts + 1, stderr);
}
}
} else if attempts % 5 == 0 {
warn!("Vault health check curl failed (attempt {})", attempts + 1);
}
attempts += 1;
@ -1198,9 +1370,25 @@ meet IN A 127.0.0.1
}
if attempts >= max_attempts {
warn!("Vault health check timed out");
warn!(
"Vault health check timed out after {} attempts",
max_attempts
);
// Final check of vault.log
let vault_log_path = PathBuf::from("./botserver-stack/logs/vault/vault.log");
if vault_log_path.exists() {
if let Ok(log_content) = fs::read_to_string(&vault_log_path) {
let last_lines: Vec<&str> = log_content.lines().rev().take(20).collect();
error!("Vault log (last 20 lines):");
for line in last_lines.iter().rev() {
error!(" {}", line);
}
}
} else {
error!("Vault log file does not exist at {:?}", vault_log_path);
}
return Err(anyhow::anyhow!(
"Vault not ready after {} seconds",
"Vault not ready after {} seconds. Check ./botserver-stack/logs/vault/vault.log for details.",
max_attempts
));
}

View file

@ -341,14 +341,9 @@ impl AppConfig {
port: 8080,
base_url: "http://localhost:8080".to_string(),
},
site_path: {
let pool = create_conn()?;
ConfigManager::new(pool).get_config(
&Uuid::nil(),
"SITES_ROOT",
Some("./botserver-stack/sites"),
)?
},
// Use default site_path - no database access needed for env-based config
// This allows from_env() to work during bootstrap before Vault/DB are ready
site_path: "./botserver-stack/sites".to_string(),
data_dir: "./botserver-stack/data".to_string(),
})
}
@ -449,7 +444,3 @@ impl ConfigManager {
Ok(updated)
}
}
fn create_conn() -> Result<DbPool, anyhow::Error> {
crate::shared::utils::create_conn()
.map_err(|e| anyhow::anyhow!("Failed to create database pool: {}", e))
}

View file

@ -482,6 +482,30 @@ impl PackageManager {
String::from_utf8_lossy(&output.stderr)
));
}
// Make extracted binaries and shell scripts executable (especially important for
// binaries like Vault). This is best-effort: a failure here is logged, not fatal.
#[cfg(unix)]
{
    use std::os::unix::fs::PermissionsExt;
    if let Ok(entries) = std::fs::read_dir(bin_path) {
        for entry in entries.flatten() {
            let path = entry.path();
            if !path.is_file() {
                continue;
            }
            // Only touch files that look runnable: no extension (typical for
            // binaries) or a shell-script extension (.sh / .bash).
            let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
            if !(ext.is_empty() || ext == "sh" || ext == "bash") {
                continue;
            }
            if let Ok(metadata) = std::fs::metadata(&path) {
                let mut perms = metadata.permissions();
                perms.set_mode(0o755);
                // Report the real outcome instead of claiming success unconditionally.
                match std::fs::set_permissions(&path, perms) {
                    Ok(()) => trace!("Made executable: {:?}", path),
                    Err(e) => trace!("Could not make {:?} executable: {}", path, e),
                }
            }
        }
    }
}
// Only delete if NOT in the cache directory (botserver-installers)
// Cached files should be preserved for offline installation
if !temp_file.to_string_lossy().contains("botserver-installers") {

View file

@ -2,7 +2,7 @@ use crate::package_manager::component::ComponentConfig;
use crate::package_manager::os::detect_os;
use crate::package_manager::{InstallMode, OsType};
use anyhow::Result;
use log::trace;
use log::{info, trace};
use std::collections::HashMap;
use std::path::PathBuf;
@ -880,10 +880,21 @@ impl PackageManager {
.replace("{{CONF_PATH}}", &conf_path.to_string_lossy())
.replace("{{LOGS_PATH}}", &logs_path.to_string_lossy());
trace!(
eprintln!(
"[START DEBUG] Starting component {} with command: {}",
component.name, rendered_cmd
);
eprintln!(
"[START DEBUG] Working directory: {:?}, logs_path: {:?}",
bin_path, logs_path
);
info!(
"Starting component {} with command: {}",
component.name,
rendered_cmd
component.name, rendered_cmd
);
info!(
"Working directory: {:?}, logs_path: {:?}",
bin_path, logs_path
);
// Fetch credentials from Vault for special placeholders
@ -906,20 +917,43 @@ impl PackageManager {
}
}
// Don't redirect stdout/stderr to null - let the shell command handle its own redirections
// This is important for commands like "nohup ... > file 2>&1 &" which need to redirect
// their own output to files
eprintln!("[START DEBUG] About to spawn shell command...");
let child = std::process::Command::new("sh")
.current_dir(&bin_path)
.arg("-c")
.arg(&rendered_cmd)
.envs(&evaluated_envs)
.stdout(std::process::Stdio::null())
.stderr(std::process::Stdio::null())
.spawn();
eprintln!("[START DEBUG] Spawn result: {:?}", child.is_ok());
std::thread::sleep(std::time::Duration::from_secs(2));
// Check if the process is actually running after sleep
eprintln!("[START DEBUG] Checking if vault process exists after 2s sleep...");
let check_proc = std::process::Command::new("pgrep")
.args(["-f", "vault server"])
.output();
if let Ok(output) = check_proc {
let pids = String::from_utf8_lossy(&output.stdout);
eprintln!("[START DEBUG] pgrep vault server result: '{}'", pids.trim());
}
// Check if log file was created
eprintln!(
"[START DEBUG] Log file exists: {}",
logs_path.join("vault.log").exists()
);
match child {
Ok(c) => Ok(c),
Ok(c) => {
eprintln!("[START DEBUG] Returning Ok from start()");
Ok(c)
}
Err(e) => {
eprintln!("[START DEBUG] Spawn failed with error: {}", e);
let err_msg = e.to_string();
if err_msg.contains("already running")
|| err_msg.contains("be running")
@ -932,8 +966,6 @@ impl PackageManager {
Ok(std::process::Command::new("sh")
.arg("-c")
.arg("true")
.stdout(std::process::Stdio::null())
.stderr(std::process::Stdio::null())
.spawn()?)
} else {
Err(e.into())

View file

@ -12,7 +12,9 @@ use crate::shared::utils::DbPool;
use crate::tasks::{TaskEngine, TaskScheduler};
#[cfg(feature = "drive")]
use aws_sdk_s3::Client as S3Client;
#[cfg(test)]
use diesel::r2d2::{ConnectionManager, Pool};
#[cfg(test)]
use diesel::PgConnection;
#[cfg(feature = "cache")]
use redis::Client as RedisClient;
@ -259,12 +261,14 @@ fn create_mock_auth_service() -> AuthService {
rt.expect("Failed to create mock AuthService")
}
/// Default implementation for AppState - ONLY FOR TESTS
/// This will panic if Vault is not configured, so it must only be used in test contexts.
#[cfg(test)]
impl Default for AppState {
fn default() -> Self {
// NO LEGACY FALLBACK - Vault is mandatory
// This default is only for tests. In production, use the full initialization.
let database_url = crate::shared::utils::get_database_url_sync()
.expect("Vault not configured. Set VAULT_ADDR and VAULT_TOKEN in .env");
.expect("AppState::default() requires Vault to be configured. This should only be used in tests.");
let manager = ConnectionManager::<PgConnection>::new(&database_url);
let pool = Pool::builder()

View file

@ -254,28 +254,28 @@ pub fn estimate_token_count(text: &str) -> usize {
}
pub fn establish_pg_connection() -> Result<PgConnection> {
let database_url = get_database_url_sync()
.expect("Vault not configured. Set VAULT_ADDR and VAULT_TOKEN in .env");
let database_url = get_database_url_sync()?;
PgConnection::establish(&database_url)
.with_context(|| format!("Failed to connect to database at {}", database_url))
}
pub type DbPool = Pool<ConnectionManager<PgConnection>>;
pub fn create_conn() -> Result<DbPool, diesel::r2d2::PoolError> {
let database_url = get_database_url_sync()
.expect("Vault not configured. Set VAULT_ADDR and VAULT_TOKEN in .env");
pub fn create_conn() -> Result<DbPool, anyhow::Error> {
let database_url = get_database_url_sync()?;
let manager = ConnectionManager::<PgConnection>::new(database_url);
Pool::builder().build(manager)
Pool::builder()
.build(manager)
.map_err(|e| anyhow::anyhow!("Failed to create database pool: {}", e))
}
/// Create database connection pool using SecretsManager (async version)
pub async fn create_conn_async() -> Result<DbPool, diesel::r2d2::PoolError> {
let database_url = get_database_url()
.await
.expect("Vault not configured. Set VAULT_ADDR and VAULT_TOKEN in .env");
pub async fn create_conn_async() -> Result<DbPool, anyhow::Error> {
let database_url = get_database_url().await?;
let manager = ConnectionManager::<PgConnection>::new(database_url);
Pool::builder().build(manager)
Pool::builder()
.build(manager)
.map_err(|e| anyhow::anyhow!("Failed to create database pool: {}", e))
}
pub fn parse_database_url(url: &str) -> (String, String, String, u32, String) {