use std::sync::{Arc, Mutex}; use super::event_store::EventStore; use super::models::*; use super::state_machine::{StateError, StateMachine}; /// Retry logic for failed/agent_lost tasks. pub struct RetryPolicy { sm: Arc, store: Arc>, } impl RetryPolicy { pub fn new(sm: Arc, store: Arc>) -> Self { Self { sm, store } } /// M5: Handle a failed task with a single atomic DB transaction. /// Reads the task, checks retry limit, increments retry_count, and transitions /// to Assigned — all under one lock + transaction to prevent TOCTOU races. pub async fn handle_failure( &self, task_id: &str, _agent_id: Option<&str>, reason: &str, ) -> Result { let task_id = task_id.to_string(); let reason = reason.to_string(); let store = self.store.clone(); let task_id_log = task_id.clone(); let retry_result = tokio::task::spawn_blocking(move || -> Result { let mut store = store.lock().map_err(|e| StateError::Poisoned(e.to_string()))?; let now = chrono::Utc::now(); let event = TaskEvent { event_id: uuid::Uuid::new_v4().to_string(), task_id: task_id.clone(), event_type: "task.assigned".into(), agent_id: None, timestamp: now, payload: serde_json::json!({ "from_status": "failed", "to_status": "assigned", "reason": format!("retry: {reason}"), }), }; let result = store.retry_and_transition( &task_id, TaskStatus::Assigned.as_str(), None, Some(now.to_rfc3339()), None, None, &event, )?; match result { Some((original, _updated)) => { let attempt = original.retry_count + 1; Ok(RetryDecision::Retried { attempt, max: original.max_retries, }) } None => Ok(RetryDecision::Exhausted), } }) .await .map_err(StateError::Join)??; if matches!(retry_result, RetryDecision::Exhausted) { tracing::warn!(task_id = task_id_log, "max retries exceeded"); } Ok(retry_result) } } #[derive(Debug, Clone, PartialEq, Eq)] pub enum RetryDecision { Retried { attempt: u32, max: u32 }, Exhausted, }