fix: resolve 3 CRITICAL + 5 MAJOR issues from Codex review

C1: Arc<Mutex<EventStore>> now uses std::sync::Mutex instead of tokio::sync::Mutex; store access runs inside spawn_blocking
C2: StateMachine::transition merged into single lock scope
C3: Transaction boundaries (BEGIN/COMMIT) on all composite writes
M4: retry_count no longer overwritten by update_task_status
M5: RetryPolicy::handle_failure now atomic (single lock + transaction)
M6: Per-task timeout_seconds used in SQL instead of global config
M7: Explicit Priority::order() method instead of relying on variant order
M8: dequeue_and_assign uses CAS-style WHERE status='created' for atomicity
This commit is contained in:
Zer4tul 2026-05-11 19:08:18 +08:00
parent b1a4d66c13
commit 2658a74730
7 changed files with 434 additions and 235 deletions

View file

@ -1,5 +1,4 @@
use std::sync::Arc;
use tokio::sync::Mutex;
use std::sync::{Arc, Mutex};
use super::event_store::EventStore;
use super::models::*;
@ -21,35 +20,48 @@ impl TaskQueue {
self.sm.create_task(&task).await
}
/// Dequeue the highest-priority task matching the given capabilities.
/// M8: Dequeue the highest-priority task matching capabilities.
/// Atomically transitions to `Assigned` inside a single DB transaction
/// via `dequeue_and_assign`, preventing concurrent dequeue of the same task.
pub async fn dequeue(
&self,
required_capabilities: &[String],
agent_id: Option<&str>,
) -> Result<Option<Task>, StateError> {
let tasks = {
let store = self.store.lock().await;
store.query_queued_tasks()?
};
let caps = required_capabilities.to_vec();
let agent_id_owned = agent_id.map(String::from);
let store = self.store.clone();
if required_capabilities.is_empty() {
return Ok(tasks.into_iter().next());
}
tokio::task::spawn_blocking(move || -> Result<Option<Task>, StateError> {
let mut store = store.lock().map_err(|e| StateError::Poisoned(e.to_string()))?;
let now = chrono::Utc::now();
for task in tasks {
let all_match = required_capabilities
.iter()
.all(|cap| {
task.labels.iter().any(|l| l == cap) || &task.task_type == cap
});
if all_match {
return Ok(Some(task));
}
}
let event = TaskEvent {
event_id: uuid::Uuid::new_v4().to_string(),
// task_id filled inside dequeue_and_assign
task_id: String::new(),
event_type: "task.assigned".into(),
agent_id: agent_id_owned.clone(),
timestamp: now,
payload: serde_json::json!({
"from_status": "created",
"to_status": "assigned",
"reason": "dequeued",
}),
};
Ok(None)
Ok(store.dequeue_and_assign(
&caps,
agent_id_owned.as_deref(),
now.to_rfc3339(),
&event,
)?)
})
.await
.map_err(StateError::Join)?
}
/// Re-queue a failed/agent_lost task (increment retry_count).
/// Re-queue a failed/agent_lost task (delegates to state machine transition).
pub async fn requeue(&self, task_id: &str) -> Result<Task, StateError> {
self.sm
.transition(task_id, TaskStatus::Assigned, None, "re-queued after failure")