agent-fleet/src/core/state_machine.rs
Zer4tul 2658a74730 fix: resolve 3 CRITICAL + 5 MAJOR issues from Codex review
C1: Arc<Mutex<EventStore>> changed from tokio::sync to std::sync + spawn_blocking
C2: StateMachine::transition merged into single lock scope
C3: Transaction boundaries (BEGIN/COMMIT) on all composite writes
M4: retry_count no longer overwritten by update_task_status
M5: RetryPolicy::handle_failure now atomic (single lock + transaction)
M6: Per-task timeout_seconds used in SQL instead of global config
M7: Explicit Priority::order() method instead of relying on variant order
M8: dequeue_and_assign uses CAS-style WHERE status='created' for atomicity
2026-05-11 19:08:18 +08:00

158 lines
5.1 KiB
Rust

use chrono::Utc;
use std::sync::{Arc, Mutex};
use super::event_store::EventStore;
use super::models::*;
/// Task lifecycle state machine: validates status changes and writes them,
/// together with their events, through a shared [`EventStore`].
pub struct StateMachine {
/// Shared handle to the event store, guarded by a `std::sync::Mutex`.
store: Arc<Mutex<EventStore>>,
}
impl StateMachine {
pub fn new(store: Arc<Mutex<EventStore>>) -> Self {
Self { store }
}
/// C1 + C2: Single lock scope, spawn_blocking, transactional transition.
pub async fn transition(
&self,
task_id: &str,
new_status: TaskStatus,
agent_id: Option<&str>,
reason: &str,
) -> Result<Task, StateError> {
let task_id = task_id.to_string();
let reason = reason.to_string();
let agent_id_owned = agent_id.map(String::from);
let store = self.store.clone();
tokio::task::spawn_blocking(move || -> Result<Task, StateError> {
let mut store = store.lock().map_err(|e| StateError::Poisoned(e.to_string()))?;
let task = store
.read_task(&task_id)?
.ok_or_else(|| StateError::TaskNotFound(task_id.clone()))?;
Self::validate_transition(&task.status, &new_status)?;
let now = Utc::now();
let event = TaskEvent {
event_id: uuid::Uuid::new_v4().to_string(),
task_id: task_id.clone(),
event_type: format!("task.{}", new_status.as_str()),
agent_id: agent_id_owned.clone(),
timestamp: now,
payload: serde_json::json!({
"from_status": task.status.as_str(),
"to_status": new_status.as_str(),
"reason": reason,
}),
};
Ok(store.transition_task(
&task_id,
new_status.as_str(),
agent_id_owned.as_deref(),
if new_status == TaskStatus::Assigned {
Some(now.to_rfc3339())
} else {
None
},
if new_status == TaskStatus::Running {
Some(now.to_rfc3339())
} else {
None
},
if matches!(
new_status,
TaskStatus::Completed | TaskStatus::Failed | TaskStatus::Cancelled
) {
Some(now.to_rfc3339())
} else {
None
},
&event,
)?)
})
.await
.map_err(StateError::Join)?
}
pub async fn create_task(&self, task: &Task) -> Result<Task, StateError> {
let task = task.clone();
let store = self.store.clone();
tokio::task::spawn_blocking(move || -> Result<Task, StateError> {
let store = store.lock().map_err(|e| StateError::Poisoned(e.to_string()))?;
store.insert_task(&task)?;
let event = TaskEvent {
event_id: uuid::Uuid::new_v4().to_string(),
task_id: task.task_id.clone(),
event_type: "task.created".into(),
agent_id: None,
timestamp: Utc::now(),
payload: serde_json::json!({ "source": task.source }),
};
store.append_event_direct(&event)?;
Ok(task)
})
.await
.map_err(StateError::Join)?
}
fn validate_transition(from: &TaskStatus, to: &TaskStatus) -> Result<(), StateError> {
let valid = match from {
TaskStatus::Created => matches!(to, TaskStatus::Assigned | TaskStatus::Cancelled),
TaskStatus::Assigned => matches!(to, TaskStatus::Running | TaskStatus::Cancelled),
TaskStatus::Running => matches!(
to,
TaskStatus::Completed
| TaskStatus::Failed
| TaskStatus::AgentLost
| TaskStatus::Cancelled
),
TaskStatus::Failed | TaskStatus::AgentLost => {
matches!(to, TaskStatus::Assigned | TaskStatus::Cancelled)
}
TaskStatus::Completed | TaskStatus::Cancelled => false,
};
if !valid {
return Err(StateError::InvalidTransition(
from.as_str().to_string(),
to.as_str().to_string(),
));
}
Ok(())
}
pub fn parse_status(s: &str) -> TaskStatus {
match s {
"created" => TaskStatus::Created,
"assigned" => TaskStatus::Assigned,
"running" => TaskStatus::Running,
"completed" => TaskStatus::Completed,
"failed" => TaskStatus::Failed,
"agent_lost" => TaskStatus::AgentLost,
"cancelled" => TaskStatus::Cancelled,
_ => TaskStatus::Created,
}
}
}
/// Errors produced by [`StateMachine`] operations.
#[derive(Debug, thiserror::Error)]
pub enum StateError {
/// No task was found for the requested id; carries that id.
#[error("task not found: {0}")]
TaskNotFound(String),
/// The requested status change is not allowed; carries the current and the
/// requested status strings, in that order.
#[error("invalid transition: {0} -> {1}")]
InvalidTransition(String, String),
/// A `rusqlite` error, converted automatically via `?`.
#[error("database error: {0}")]
Database(#[from] rusqlite::Error),
/// The tokio task running the blocking work could not be joined.
#[error("task join error: {0}")]
Join(#[from] tokio::task::JoinError),
/// The store mutex was poisoned; carries the lock error rendered as text.
#[error("mutex poisoned: {0}")]
Poisoned(String),
}