agent-fleet/src/core/retry.rs
Zer4tul 2658a74730 fix: resolve 3 CRITICAL + 5 MAJOR issues from Codex review
C1: Arc<Mutex<EventStore>> changed from tokio::sync to std::sync + spawn_blocking
C2: StateMachine::transition merged into single lock scope
C3: Transaction boundaries (BEGIN/COMMIT) on all composite writes
M4: retry_count no longer overwritten by update_task_status
M5: RetryPolicy::handle_failure now atomic (single lock + transaction)
M6: Per-task timeout_seconds used in SQL instead of global config
M7: Explicit Priority::order() method instead of relying on variant order
M8: dequeue_and_assign uses CAS-style WHERE status='created' for atomicity
2026-05-11 19:08:18 +08:00

85 lines
2.7 KiB
Rust

use std::sync::{Arc, Mutex};
use super::event_store::EventStore;
use super::models::*;
use super::state_machine::{StateError, StateMachine};
/// Retry logic for failed/agent_lost tasks.
///
/// Holds shared handles to the state machine and the event store; see
/// `handle_failure` for the single-transaction retry path (fix M5).
pub struct RetryPolicy {
    /// Shared task state machine.
    /// NOTE(review): not read anywhere in this chunk — presumably used by
    /// other RetryPolicy methods; confirm before removing.
    sm: Arc<StateMachine>,
    /// Event store guarded by a `std::sync::Mutex` (fix C1); it is only
    /// locked inside `spawn_blocking`, so the guard is never held across
    /// an `.await`.
    store: Arc<Mutex<EventStore>>,
}
impl RetryPolicy {
    /// Creates a retry policy backed by the given state machine and event store.
    pub fn new(sm: Arc<StateMachine>, store: Arc<Mutex<EventStore>>) -> Self {
        Self { sm, store }
    }

    /// M5: Handle a failed task with a single atomic DB transaction.
    ///
    /// Reads the task, checks the retry limit, increments `retry_count`, and
    /// transitions the task to `Assigned` — all inside one lock + transaction
    /// (delegated to `EventStore::retry_and_transition`) to prevent TOCTOU
    /// races. The blocking DB work runs on `spawn_blocking` so the
    /// `std::sync::Mutex` guard is never held across an `.await`.
    ///
    /// Returns `RetryDecision::Retried` with the new attempt number, or
    /// `RetryDecision::Exhausted` when the store's guarded update matched no
    /// row (retry limit already reached).
    ///
    /// # Errors
    /// `StateError::Poisoned` if the store mutex is poisoned;
    /// `StateError::Join` if the blocking task panics or is cancelled;
    /// any error propagated from `retry_and_transition`.
    pub async fn handle_failure(
        &self,
        task_id: &str,
        _agent_id: Option<&str>, // kept for interface compatibility; unused for now
        reason: &str,
    ) -> Result<RetryDecision, StateError> {
        let task_id = task_id.to_string();
        let reason = reason.to_string();
        let store = self.store.clone();
        // Clone only what the closure must own; `task_id` itself stays
        // available afterwards for logging.
        let owned_task_id = task_id.clone();
        let decision =
            tokio::task::spawn_blocking(move || -> Result<RetryDecision, StateError> {
                let mut store = store.lock().map_err(|e| StateError::Poisoned(e.to_string()))?;
                let now = chrono::Utc::now();
                // NOTE(review): `from_status` is hardcoded to "failed", but this
                // policy is documented to also handle agent_lost tasks — confirm
                // retry_and_transition corrects or ignores this payload field.
                let event = TaskEvent {
                    event_id: uuid::Uuid::new_v4().to_string(),
                    task_id: owned_task_id.clone(),
                    event_type: "task.assigned".into(),
                    agent_id: None,
                    timestamp: now,
                    payload: serde_json::json!({
                        "from_status": "failed",
                        "to_status": "assigned",
                        "reason": format!("retry: {reason}"),
                    }),
                };
                // `None` means the CAS-style guarded update matched no row:
                // the retry budget is spent.
                match store.retry_and_transition(
                    &owned_task_id,
                    TaskStatus::Assigned.as_str(),
                    None,
                    Some(now.to_rfc3339()),
                    None,
                    None,
                    &event,
                )? {
                    Some((original, _updated)) => Ok(RetryDecision::Retried {
                        // `retry_count` is the pre-increment value read in the
                        // same transaction, so +1 is the attempt just scheduled.
                        attempt: original.retry_count + 1,
                        max: original.max_retries,
                    }),
                    None => Ok(RetryDecision::Exhausted),
                }
            })
            .await
            .map_err(StateError::Join)??;
        if matches!(decision, RetryDecision::Exhausted) {
            // `%` records the field via Display: an owned `String` is not a
            // `tracing::Value` on common tracing-core versions, so the bare
            // `task_id = task_id_log` form is fragile.
            tracing::warn!(task_id = %task_id, "max retries exceeded");
        }
        Ok(decision)
    }
}
/// Outcome of `RetryPolicy::handle_failure`.
///
/// `Copy` is derived (all fields are `u32`) so callers can pass decisions
/// by value without cloning; fully backward-compatible with the prior
/// `Clone`-only derive set.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RetryDecision {
    /// The task was re-queued for another attempt. `attempt` is the 1-based
    /// number of the retry just scheduled; `max` is the task's retry limit.
    Retried { attempt: u32, max: u32 },
    /// The retry limit was already reached; no retry was performed.
    Exhausted,
}