Add smart sleep based on fail_count to prevent excessive monitoring cycles

- fail_count >= 3: sleep 1 hour
- fail_count >= 2: sleep 15 min
- fail_count >= 1: sleep 5 min
- fail_count = 0: sleep 10 sec (default)
2026-04-12 09:20:17 -03:00 · 2026-04-12 09:20:17 -03:00 · 256d55fc93
commit 256d55fc93
parent 789789e313
1 changed files with 25 additions and 2 deletions
--- a/src/drive/drive_monitor/mod.rs
+++ b/src/drive/drive_monitor/mod.rs
@ -291,8 +291,31 @@ impl DriveMonitor {
            {
                debug!("[DRIVE_MONITOR] Inside monitoring loop for bot {}", self_clone.bot_id);
                debug!("[DRIVE_MONITOR] Periodic check starting for bot {}", self_clone.bot_id);
-                // Use fixed 10 second interval instead of backoff calculation
-                tokio::time::sleep(Duration::from_secs(10)).await;
+                
+                // Smart sleep based on fail_count - prevent excessive retries
+                {
+                    let states = self_clone.file_states.read().await;
+                    let max_fail_count = states.values()
+                        .map(|s| s.fail_count)
+                        .max()
+                        .unwrap_or(0);
+                    
+                    let base_sleep = if max_fail_count >= 3 {
+                        3600 // 1 hour for fail_count >= 3
+                    } else if max_fail_count >= 2 {
+                        900  // 15 min for fail_count >= 2
+                    } else if max_fail_count >= 1 {
+                        300  // 5 min for fail_count >= 1
+                    } else {
+                        10   // 10 sec default
+                    };
+                    
+                    if base_sleep > 10 {
+                        debug!("[DRIVE_MONITOR] Sleep {}s based on fail_count={}", base_sleep, max_fail_count);
+                    }
+                    
+                    tokio::time::sleep(Duration::from_secs(base_sleep)).await;
+                }

                debug!("[DRIVE_MONITOR] Checking drive health for bot {}", self_clone.bot_id);
                // Skip drive health check - just proceed with monitoring