Add smart sleep based on fail_count to prevent excessive monitoring cycles
All checks were successful
BotServer CI/CD / build (push) Successful in 3m9s

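The sleep interval is chosen from the highest fail_count across all tracked file states (first matching rule wins):
- fail_count >= 3: sleep 1 hour
- fail_count >= 2: sleep 15 min
- fail_count >= 1: sleep 5 min
- fail_count = 0, or no tracked files: sleep 10 sec (default)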
This commit is contained in:
Rodrigo Rodriguez (Pragmatismo) 2026-04-12 09:20:17 -03:00
parent 789789e313
commit 256d55fc93

View file

@@ -291,8 +291,31 @@ impl DriveMonitor {
{
debug!("[DRIVE_MONITOR] Inside monitoring loop for bot {}", self_clone.bot_id);
debug!("[DRIVE_MONITOR] Periodic check starting for bot {}", self_clone.bot_id);
// Use fixed 10 second interval instead of backoff calculation
tokio::time::sleep(Duration::from_secs(10)).await;
// Smart sleep based on fail_count - prevent excessive retries
{
let states = self_clone.file_states.read().await;
let max_fail_count = states.values()
.map(|s| s.fail_count)
.max()
.unwrap_or(0);
let base_sleep = if max_fail_count >= 3 {
3600 // 1 hour for fail_count >= 3
} else if max_fail_count >= 2 {
900 // 15 min for fail_count >= 2
} else if max_fail_count >= 1 {
300 // 5 min for fail_count >= 1
} else {
10 // 10 sec default
};
if base_sleep > 10 {
debug!("[DRIVE_MONITOR] Sleep {}s based on fail_count={}", base_sleep, max_fail_count);
}
tokio::time::sleep(Duration::from_secs(base_sleep)).await;
}
debug!("[DRIVE_MONITOR] Checking drive health for bot {}", self_clone.bot_id);
// Skip drive health check - just proceed with monitoring