fix: resolve 3 CRITICAL + 5 MAJOR issues from Codex review
C1: Arc<Mutex<EventStore>> changed from tokio::sync to std::sync + spawn_blocking C2: StateMachine::transition merged into single lock scope C3: Transaction boundaries (BEGIN/COMMIT) on all composite writes M4: retry_count no longer overwritten by update_task_status M5: RetryPolicy::handle_failure now atomic (single lock + transaction) M6: Per-task timeout_seconds used in SQL instead of global config M7: Explicit Priority::order() method instead of relying on variant order M8: dequeue_and_assign uses CAS-style WHERE status='created' for atomicity
This commit is contained in:
parent
b1a4d66c13
commit
2658a74730
7 changed files with 434 additions and 235 deletions
|
|
@ -1,59 +1,80 @@
|
|||
use std::sync::Arc;
|
||||
use tokio::sync::Mutex;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use super::event_store::EventStore;
|
||||
use super::models::*;
|
||||
use super::state_machine::{StateError, StateMachine};
|
||||
use super::task_queue::TaskQueue;
|
||||
|
||||
/// Retry logic for failed/agent_lost tasks.
|
||||
pub struct RetryPolicy {
|
||||
sm: Arc<StateMachine>,
|
||||
_queue: Arc<TaskQueue>,
|
||||
store: Arc<Mutex<EventStore>>,
|
||||
}
|
||||
|
||||
impl RetryPolicy {
|
||||
pub fn new(
|
||||
sm: Arc<StateMachine>,
|
||||
queue: Arc<TaskQueue>,
|
||||
store: Arc<Mutex<EventStore>>,
|
||||
) -> Self {
|
||||
Self { sm, _queue: queue, store }
|
||||
pub fn new(sm: Arc<StateMachine>, store: Arc<Mutex<EventStore>>) -> Self {
|
||||
Self { sm, store }
|
||||
}
|
||||
|
||||
/// Handle a failed task: retry if under limit, otherwise mark permanently failed.
|
||||
/// M5: Handle a failed task with a single atomic DB transaction.
|
||||
/// Reads the task, checks retry limit, increments retry_count, and transitions
|
||||
/// to Assigned — all under one lock + transaction to prevent TOCTOU races.
|
||||
pub async fn handle_failure(
|
||||
&self,
|
||||
task_id: &str,
|
||||
_agent_id: Option<&str>,
|
||||
reason: &str,
|
||||
) -> Result<RetryDecision, StateError> {
|
||||
let task = {
|
||||
let store = self.store.lock().await;
|
||||
store.read_task(task_id)?.ok_or(StateError::TaskNotFound(task_id.to_string()))?
|
||||
};
|
||||
let task_id = task_id.to_string();
|
||||
let reason = reason.to_string();
|
||||
let store = self.store.clone();
|
||||
|
||||
if task.retry_count < task.max_retries {
|
||||
// Increment retry count
|
||||
{
|
||||
let store = self.store.lock().await;
|
||||
store.increment_retry_count(task_id)?;
|
||||
let task_id_log = task_id.clone();
|
||||
let retry_result = tokio::task::spawn_blocking(move || -> Result<RetryDecision, StateError> {
|
||||
let mut store = store.lock().map_err(|e| StateError::Poisoned(e.to_string()))?;
|
||||
|
||||
let now = chrono::Utc::now();
|
||||
let event = TaskEvent {
|
||||
event_id: uuid::Uuid::new_v4().to_string(),
|
||||
task_id: task_id.clone(),
|
||||
event_type: "task.assigned".into(),
|
||||
agent_id: None,
|
||||
timestamp: now,
|
||||
payload: serde_json::json!({
|
||||
"from_status": "failed",
|
||||
"to_status": "assigned",
|
||||
"reason": format!("retry: {reason}"),
|
||||
}),
|
||||
};
|
||||
|
||||
let result = store.retry_and_transition(
|
||||
&task_id,
|
||||
TaskStatus::Assigned.as_str(),
|
||||
None,
|
||||
Some(now.to_rfc3339()),
|
||||
None,
|
||||
None,
|
||||
&event,
|
||||
)?;
|
||||
|
||||
match result {
|
||||
Some((original, _updated)) => {
|
||||
let attempt = original.retry_count + 1;
|
||||
Ok(RetryDecision::Retried {
|
||||
attempt,
|
||||
max: original.max_retries,
|
||||
})
|
||||
}
|
||||
None => Ok(RetryDecision::Exhausted),
|
||||
}
|
||||
})
|
||||
.await
|
||||
.map_err(StateError::Join)??;
|
||||
|
||||
// Transition back to assigned
|
||||
self.sm
|
||||
.transition(task_id, TaskStatus::Assigned, None, &format!("retry: {reason}"))
|
||||
.await?;
|
||||
|
||||
Ok(RetryDecision::Retried {
|
||||
attempt: task.retry_count + 1,
|
||||
max: task.max_retries,
|
||||
})
|
||||
} else {
|
||||
tracing::warn!(task_id = task_id, retries = task.retry_count, "max retries exceeded");
|
||||
Ok(RetryDecision::Exhausted)
|
||||
if matches!(retry_result, RetryDecision::Exhausted) {
|
||||
tracing::warn!(task_id = task_id_log, "max retries exceeded");
|
||||
}
|
||||
|
||||
Ok(retry_result)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue