feat: dual execution model (SSH CLI + HTTP pull)
- ExecutionMode enum: SshCli (orchestrator dispatches) | HttpPull (agent pulls) - SSH CLI executor: spawn remote agents via ssh + CLI template - Local subprocess as SSH special case (localhost) - HostConfig with capability matching and load-based selection - Dispatch loop: scan created tasks → select host → execute → update - CliAdapterConfig: CLI templates for Codex and Claude Code - Structured prompt construction (Issue → goal/constraints/validation) - Output parsers: Codex JSON, Claude Code JSON, raw fallback - TaskStatus::ReviewPending + review_count loop limit - Forgejo webhook: pull_request (opened→review_pending, merged→completed) - Forgejo webhook: push events (task/* branch → last_activity_at) - HTTP API: dequeue only returns http_pull tasks - HTTP API: status update only for http_pull mode - Token auth config for http_pull agents - Adapter module rewritten: AgentAdapter trait removed → config-driven CLI templates - New fields: execution_mode, assigned_host, branch_name, pr_title, last_activity_at, review_count - 30/30 tests pass
This commit is contained in:
parent
1bc7580ecc
commit
e39a16498c
34 changed files with 2541 additions and 1555 deletions
|
|
@ -2,7 +2,9 @@ use chrono::Utc;
|
|||
use rusqlite::{params, Connection, Result as SqlResult};
|
||||
use std::path::Path;
|
||||
|
||||
use super::models::{Agent, AgentStatus, AgentType, Priority, Task, TaskEvent, TaskStatus};
|
||||
use super::models::{
|
||||
Agent, AgentStatus, AgentType, ExecutionMode, Priority, Task, TaskEvent, TaskStatus,
|
||||
};
|
||||
|
||||
pub struct EventStore {
|
||||
conn: Connection,
|
||||
|
|
@ -52,26 +54,43 @@ impl EventStore {
|
|||
task_type TEXT NOT NULL,
|
||||
priority TEXT NOT NULL DEFAULT 'normal',
|
||||
status TEXT NOT NULL DEFAULT 'created',
|
||||
execution_mode TEXT NOT NULL DEFAULT 'ssh_cli',
|
||||
assigned_agent_id TEXT,
|
||||
assigned_host TEXT,
|
||||
requirements TEXT NOT NULL DEFAULT '',
|
||||
labels TEXT NOT NULL DEFAULT '[]',
|
||||
branch_name TEXT,
|
||||
pr_title TEXT,
|
||||
created_at TEXT NOT NULL,
|
||||
assigned_at TEXT,
|
||||
started_at TEXT,
|
||||
completed_at TEXT,
|
||||
last_activity_at TEXT,
|
||||
retry_count INTEGER NOT NULL DEFAULT 0,
|
||||
max_retries INTEGER NOT NULL DEFAULT 2,
|
||||
review_count INTEGER NOT NULL DEFAULT 0,
|
||||
timeout_seconds INTEGER NOT NULL DEFAULT 1800
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_tasks_status ON tasks(status);
|
||||
CREATE INDEX IF NOT EXISTS idx_tasks_assigned ON tasks(assigned_agent_id);",
|
||||
CREATE INDEX IF NOT EXISTS idx_tasks_assigned ON tasks(assigned_agent_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_tasks_execution_mode ON tasks(execution_mode);",
|
||||
)?;
|
||||
|
||||
let _ = self
|
||||
.conn
|
||||
.execute("ALTER TABLE tasks ADD COLUMN execution_mode TEXT NOT NULL DEFAULT 'ssh_cli'", []);
|
||||
let _ = self.conn.execute("ALTER TABLE tasks ADD COLUMN assigned_host TEXT", []);
|
||||
let _ = self.conn.execute("ALTER TABLE tasks ADD COLUMN branch_name TEXT", []);
|
||||
let _ = self.conn.execute("ALTER TABLE tasks ADD COLUMN pr_title TEXT", []);
|
||||
let _ = self.conn.execute("ALTER TABLE tasks ADD COLUMN last_activity_at TEXT", []);
|
||||
let _ = self
|
||||
.conn
|
||||
.execute("ALTER TABLE tasks ADD COLUMN review_count INTEGER NOT NULL DEFAULT 0", []);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ─── Agent operations ────────────────────────────────────────
|
||||
|
||||
pub fn upsert_agent(&mut self, agent: &Agent) -> SqlResult<()> {
|
||||
self.conn.execute(
|
||||
"INSERT INTO agents (
|
||||
|
|
@ -83,6 +102,7 @@ impl EventStore {
|
|||
hostname = excluded.hostname,
|
||||
capabilities = excluded.capabilities,
|
||||
max_concurrency = excluded.max_concurrency,
|
||||
current_tasks = excluded.current_tasks,
|
||||
status = excluded.status,
|
||||
last_heartbeat_at = excluded.last_heartbeat_at,
|
||||
metadata = excluded.metadata",
|
||||
|
|
@ -104,31 +124,19 @@ impl EventStore {
|
|||
|
||||
pub fn update_heartbeat(&mut self, agent_id: &str) -> SqlResult<()> {
|
||||
self.conn.execute(
|
||||
"UPDATE agents
|
||||
SET last_heartbeat_at = ?1,
|
||||
status = 'online'
|
||||
WHERE agent_id = ?2",
|
||||
"UPDATE agents SET last_heartbeat_at = ?1, status = 'online' WHERE agent_id = ?2",
|
||||
params![Utc::now().to_rfc3339(), agent_id],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn set_agent_offline(
|
||||
&mut self,
|
||||
agent_id: &str,
|
||||
task_recovery_status: TaskStatus,
|
||||
) -> SqlResult<usize> {
|
||||
pub fn set_agent_offline(&mut self, agent_id: &str, task_recovery_status: TaskStatus) -> SqlResult<usize> {
|
||||
let tx = self.conn.transaction()?;
|
||||
|
||||
tx.execute(
|
||||
"UPDATE agents SET status = 'offline' WHERE agent_id = ?1",
|
||||
params![agent_id],
|
||||
)?;
|
||||
tx.execute("UPDATE agents SET status = 'offline', current_tasks = 0 WHERE agent_id = ?1", params![agent_id])?;
|
||||
|
||||
let running_task_ids: Vec<String> = {
|
||||
let mut stmt = tx.prepare(
|
||||
"SELECT task_id FROM tasks
|
||||
WHERE assigned_agent_id = ?1 AND status = 'running'",
|
||||
"SELECT task_id FROM tasks WHERE assigned_agent_id = ?1 AND status IN ('assigned','running','review_pending')",
|
||||
)?;
|
||||
stmt.query_map(params![agent_id], |row| row.get(0))?
|
||||
.collect::<SqlResult<Vec<_>>>()?
|
||||
|
|
@ -139,6 +147,7 @@ impl EventStore {
|
|||
"UPDATE tasks
|
||||
SET status = ?1,
|
||||
assigned_agent_id = NULL,
|
||||
assigned_host = NULL,
|
||||
assigned_at = NULL,
|
||||
started_at = NULL
|
||||
WHERE task_id = ?2",
|
||||
|
|
@ -151,13 +160,7 @@ impl EventStore {
|
|||
event_type: format!("task.{}", task_recovery_status.as_str()),
|
||||
agent_id: Some(agent_id.to_string()),
|
||||
timestamp: Utc::now(),
|
||||
payload: serde_json::json!({
|
||||
"reason": if task_recovery_status == TaskStatus::Created {
|
||||
"agent_deregistered"
|
||||
} else {
|
||||
"agent_heartbeat_timeout"
|
||||
}
|
||||
}),
|
||||
payload: serde_json::json!({"reason":"agent_offline"}),
|
||||
};
|
||||
Self::append_event(&tx, &event)?;
|
||||
}
|
||||
|
|
@ -166,29 +169,19 @@ impl EventStore {
|
|||
Ok(running_task_ids.len())
|
||||
}
|
||||
|
||||
pub fn list_agents(
|
||||
&self,
|
||||
capability: Option<&str>,
|
||||
status: Option<&AgentStatus>,
|
||||
) -> SqlResult<Vec<Agent>> {
|
||||
pub fn list_agents(&self, capability: Option<&str>, status: Option<&AgentStatus>) -> SqlResult<Vec<Agent>> {
|
||||
let mut stmt = self.conn.prepare(
|
||||
"SELECT agent_id, agent_type, hostname, capabilities, max_concurrency,
|
||||
current_tasks, status, last_heartbeat_at, registered_at, metadata
|
||||
FROM agents
|
||||
ORDER BY agent_id ASC",
|
||||
FROM agents ORDER BY agent_id ASC",
|
||||
)?;
|
||||
|
||||
let mut agents: Vec<Agent> = stmt
|
||||
.query_map([], Self::row_to_agent)?
|
||||
.collect::<SqlResult<Vec<_>>>()?;
|
||||
|
||||
let mut agents: Vec<Agent> = stmt.query_map([], Self::row_to_agent)?.collect::<SqlResult<Vec<_>>>()?;
|
||||
if let Some(cap) = capability {
|
||||
agents.retain(|agent| agent.capabilities.iter().any(|c| c == cap));
|
||||
}
|
||||
if let Some(status) = status {
|
||||
agents.retain(|agent| &agent.status == status);
|
||||
}
|
||||
|
||||
Ok(agents)
|
||||
}
|
||||
|
||||
|
|
@ -215,26 +208,12 @@ impl EventStore {
|
|||
.collect::<SqlResult<Vec<_>>>()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn force_agent_last_heartbeat(
|
||||
&mut self,
|
||||
agent_id: &str,
|
||||
timestamp: chrono::DateTime<Utc>,
|
||||
) -> SqlResult<()> {
|
||||
self.conn.execute(
|
||||
"UPDATE agents SET last_heartbeat_at = ?1 WHERE agent_id = ?2",
|
||||
params![timestamp.to_rfc3339(), agent_id],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// ─── Task/event read operations ──────────────────────────────
|
||||
|
||||
pub fn read_task(&self, task_id: &str) -> SqlResult<Option<Task>> {
|
||||
let mut stmt = self.conn.prepare(
|
||||
"SELECT task_id, source, task_type, priority, status, assigned_agent_id,
|
||||
requirements, labels, created_at, assigned_at, started_at, completed_at,
|
||||
retry_count, max_retries, timeout_seconds
|
||||
"SELECT task_id, source, task_type, priority, status, execution_mode, assigned_agent_id,
|
||||
assigned_host, requirements, labels, branch_name, pr_title, created_at, assigned_at,
|
||||
started_at, completed_at, last_activity_at, retry_count, max_retries, review_count,
|
||||
timeout_seconds
|
||||
FROM tasks WHERE task_id = ?1",
|
||||
)?;
|
||||
match stmt.query_row(params![task_id], Self::row_to_task) {
|
||||
|
|
@ -244,97 +223,261 @@ impl EventStore {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn get_events_for_task(&self, task_id: &str) -> SqlResult<Vec<TaskEvent>> {
|
||||
let mut stmt = self.conn.prepare(
|
||||
"SELECT event_id, task_id, event_type, agent_id, timestamp, payload
|
||||
FROM task_events WHERE task_id = ?1 ORDER BY timestamp ASC",
|
||||
)?;
|
||||
stmt.query_map(params![task_id], |row| {
|
||||
let timestamp_str: String = row.get(4)?;
|
||||
let payload_str: String = row.get(5)?;
|
||||
Ok(TaskEvent {
|
||||
event_id: row.get(0)?,
|
||||
task_id: row.get(1)?,
|
||||
event_type: row.get(2)?,
|
||||
agent_id: row.get(3)?,
|
||||
timestamp: timestamp_str.parse().unwrap_or_default(),
|
||||
payload: serde_json::from_str(&payload_str).unwrap_or(serde_json::Value::Null),
|
||||
})
|
||||
})?
|
||||
.collect::<SqlResult<Vec<_>>>()
|
||||
}
|
||||
|
||||
pub fn find_timed_out_tasks(&self) -> SqlResult<Vec<String>> {
|
||||
let mut stmt = self.conn.prepare(
|
||||
"SELECT task_id FROM tasks
|
||||
WHERE status = 'running'
|
||||
AND started_at IS NOT NULL
|
||||
AND (julianday('now') - julianday(started_at)) * 86400 > timeout_seconds",
|
||||
)?;
|
||||
stmt.query_map([], |row| row.get(0))?
|
||||
.collect::<SqlResult<Vec<_>>>()
|
||||
}
|
||||
|
||||
pub fn list_tasks(
|
||||
&self,
|
||||
status: Option<&str>,
|
||||
agent_id: Option<&str>,
|
||||
) -> SqlResult<Vec<Task>> {
|
||||
pub fn list_tasks(&self, status: Option<&str>, agent_id: Option<&str>) -> SqlResult<Vec<Task>> {
|
||||
let mut sql = String::from(
|
||||
"SELECT task_id, source, task_type, priority, status, assigned_agent_id,
|
||||
requirements, labels, created_at, assigned_at, started_at, completed_at,
|
||||
retry_count, max_retries, timeout_seconds
|
||||
"SELECT task_id, source, task_type, priority, status, execution_mode, assigned_agent_id,
|
||||
assigned_host, requirements, labels, branch_name, pr_title, created_at, assigned_at,
|
||||
started_at, completed_at, last_activity_at, retry_count, max_retries, review_count,
|
||||
timeout_seconds
|
||||
FROM tasks WHERE 1=1",
|
||||
);
|
||||
let mut param_values: Vec<Box<dyn rusqlite::types::ToSql>> = Vec::new();
|
||||
|
||||
if let Some(s) = status {
|
||||
let mut bindings: Vec<String> = Vec::new();
|
||||
if let Some(status) = status {
|
||||
sql.push_str(" AND status = ?");
|
||||
param_values.push(Box::new(s.to_string()));
|
||||
bindings.push(status.to_string());
|
||||
}
|
||||
if let Some(a) = agent_id {
|
||||
if let Some(agent_id) = agent_id {
|
||||
sql.push_str(" AND assigned_agent_id = ?");
|
||||
param_values.push(Box::new(a.to_string()));
|
||||
bindings.push(agent_id.to_string());
|
||||
}
|
||||
sql.push_str(" ORDER BY created_at DESC");
|
||||
|
||||
let params: Vec<&dyn rusqlite::types::ToSql> = param_values.iter().map(|p| p.as_ref()).collect();
|
||||
|
||||
let mut stmt = self.conn.prepare(&sql)?;
|
||||
stmt.query_map(params.as_slice(), Self::row_to_task)?
|
||||
.collect::<SqlResult<Vec<_>>>()
|
||||
let rows = stmt.query_map(
|
||||
rusqlite::params_from_iter(bindings.iter()),
|
||||
Self::row_to_task,
|
||||
)?;
|
||||
rows.collect::<SqlResult<Vec<_>>>()
|
||||
}
|
||||
|
||||
// ─── Task/event write operations ─────────────────────────────
|
||||
|
||||
pub fn insert_task(&self, task: &Task) -> SqlResult<()> {
|
||||
self.conn.execute(
|
||||
"INSERT INTO tasks (
|
||||
task_id, source, task_type, priority, status, assigned_agent_id,
|
||||
requirements, labels, created_at, assigned_at, started_at, completed_at,
|
||||
retry_count, max_retries, timeout_seconds
|
||||
) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14, ?15)",
|
||||
task_id, source, task_type, priority, status, execution_mode, assigned_agent_id,
|
||||
assigned_host, requirements, labels, branch_name, pr_title, created_at, assigned_at,
|
||||
started_at, completed_at, last_activity_at, retry_count, max_retries, review_count,
|
||||
timeout_seconds
|
||||
) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14, ?15, ?16, ?17, ?18, ?19, ?20, ?21)",
|
||||
params![
|
||||
task.task_id,
|
||||
task.source,
|
||||
task.task_type,
|
||||
task.priority.as_str(),
|
||||
task.status.as_str(),
|
||||
task.execution_mode.as_str(),
|
||||
task.assigned_agent_id,
|
||||
task.assigned_host,
|
||||
task.requirements,
|
||||
serde_json::to_string(&task.labels).unwrap_or_default(),
|
||||
task.branch_name,
|
||||
task.pr_title,
|
||||
task.created_at.to_rfc3339(),
|
||||
task.assigned_at.map(|v| v.to_rfc3339()),
|
||||
task.started_at.map(|v| v.to_rfc3339()),
|
||||
task.completed_at.map(|v| v.to_rfc3339()),
|
||||
task.last_activity_at.map(|v| v.to_rfc3339()),
|
||||
task.retry_count,
|
||||
task.max_retries,
|
||||
task.timeout_seconds as i64,
|
||||
task.review_count,
|
||||
task.timeout_seconds,
|
||||
],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn transition_task(
|
||||
&mut self,
|
||||
task_id: &str,
|
||||
status: &str,
|
||||
agent_id: Option<&str>,
|
||||
assigned_host: Option<&str>,
|
||||
assigned_at: Option<String>,
|
||||
started_at: Option<String>,
|
||||
completed_at: Option<String>,
|
||||
review_count_increment: bool,
|
||||
event: &TaskEvent,
|
||||
) -> SqlResult<Task> {
|
||||
let tx = self.conn.transaction()?;
|
||||
tx.execute(
|
||||
"UPDATE tasks
|
||||
SET status = ?1,
|
||||
assigned_agent_id = COALESCE(?2, assigned_agent_id),
|
||||
assigned_host = COALESCE(?3, assigned_host),
|
||||
assigned_at = COALESCE(?4, assigned_at),
|
||||
started_at = COALESCE(?5, started_at),
|
||||
completed_at = COALESCE(?6, completed_at),
|
||||
review_count = review_count + CASE WHEN ?7 THEN 1 ELSE 0 END
|
||||
WHERE task_id = ?8",
|
||||
params![status, agent_id, assigned_host, assigned_at, started_at, completed_at, review_count_increment, task_id],
|
||||
)?;
|
||||
Self::append_event(&tx, event)?;
|
||||
let task = {
|
||||
let mut stmt = tx.prepare(
|
||||
"SELECT task_id, source, task_type, priority, status, execution_mode, assigned_agent_id,
|
||||
assigned_host, requirements, labels, branch_name, pr_title, created_at, assigned_at,
|
||||
started_at, completed_at, last_activity_at, retry_count, max_retries, review_count,
|
||||
timeout_seconds
|
||||
FROM tasks WHERE task_id = ?1",
|
||||
)?;
|
||||
let result = stmt.query_row(params![task_id], Self::row_to_task)?;
|
||||
drop(stmt);
|
||||
result
|
||||
};
|
||||
tx.commit()?;
|
||||
Ok(task)
|
||||
}
|
||||
|
||||
pub fn update_task_activity(&mut self, task_id: &str, timestamp: &str) -> SqlResult<()> {
|
||||
self.conn.execute(
|
||||
"UPDATE tasks SET last_activity_at = ?1 WHERE task_id = ?2",
|
||||
params![timestamp, task_id],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn dequeue_and_assign_http_pull(
|
||||
&mut self,
|
||||
required_capabilities: &[String],
|
||||
agent_id: Option<&str>,
|
||||
now: String,
|
||||
event: &TaskEvent,
|
||||
) -> SqlResult<Option<Task>> {
|
||||
let tx = self.conn.transaction()?;
|
||||
let candidate = {
|
||||
let mut stmt = tx.prepare(
|
||||
"SELECT task_id, source, task_type, priority, status, execution_mode, assigned_agent_id,
|
||||
assigned_host, requirements, labels, branch_name, pr_title, created_at, assigned_at,
|
||||
started_at, completed_at, last_activity_at, retry_count, max_retries, review_count,
|
||||
timeout_seconds
|
||||
FROM tasks
|
||||
WHERE status = 'created' AND execution_mode = 'http_pull'
|
||||
ORDER BY CASE priority
|
||||
WHEN 'urgent' THEN 0
|
||||
WHEN 'high' THEN 1
|
||||
WHEN 'normal' THEN 2
|
||||
ELSE 3 END,
|
||||
created_at ASC",
|
||||
)?;
|
||||
let tasks: Vec<Task> = stmt.query_map([], Self::row_to_task)?.collect::<SqlResult<Vec<_>>>()?;
|
||||
tasks.into_iter().find(|task| {
|
||||
required_capabilities.is_empty()
|
||||
|| required_capabilities.iter().all(|cap| task.labels.iter().any(|l| l == cap))
|
||||
})
|
||||
}; // stmt dropped here
|
||||
|
||||
let Some(task) = candidate else {
|
||||
tx.commit()?;
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
tx.execute(
|
||||
"UPDATE tasks SET status = 'assigned', assigned_agent_id = ?1, assigned_at = ?2 WHERE task_id = ?3",
|
||||
params![agent_id, now, task.task_id],
|
||||
)?;
|
||||
let mut event = event.clone();
|
||||
event.task_id = task.task_id.clone();
|
||||
Self::append_event(&tx, &event)?;
|
||||
|
||||
let task_id = task.task_id.clone();
|
||||
let updated = {
|
||||
let mut stmt = tx.prepare(
|
||||
"SELECT task_id, source, task_type, priority, status, execution_mode, assigned_agent_id,
|
||||
assigned_host, requirements, labels, branch_name, pr_title, created_at, assigned_at,
|
||||
started_at, completed_at, last_activity_at, retry_count, max_retries, review_count,
|
||||
timeout_seconds
|
||||
FROM tasks WHERE task_id = ?1",
|
||||
)?;
|
||||
stmt.query_row(params![task_id], Self::row_to_task)?
|
||||
}; // stmt dropped here
|
||||
tx.commit()?;
|
||||
Ok(Some(updated))
|
||||
}
|
||||
|
||||
pub fn find_timed_out_tasks(&self) -> SqlResult<Vec<String>> {
|
||||
let mut stmt = self.conn.prepare(
|
||||
"SELECT task_id, timeout_seconds, started_at FROM tasks WHERE status IN ('assigned', 'running')",
|
||||
)?;
|
||||
let rows: Vec<(String, u64, Option<String>)> = stmt
|
||||
.query_map([], |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)))?
|
||||
.collect::<SqlResult<Vec<_>>>()?;
|
||||
let now = Utc::now();
|
||||
let timed_out: Vec<String> = rows
|
||||
.into_iter()
|
||||
.filter_map(|(task_id, timeout_secs, started_at)| {
|
||||
let started = started_at.and_then(|s| s.parse::<chrono::DateTime<Utc>>().ok())?;
|
||||
let elapsed = (now - started).num_seconds();
|
||||
if elapsed > timeout_secs as i64 {
|
||||
Some(task_id)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
Ok(timed_out)
|
||||
}
|
||||
|
||||
pub fn retry_and_transition(
|
||||
&mut self,
|
||||
task_id: &str,
|
||||
new_status: &str,
|
||||
agent_id: Option<&str>,
|
||||
assigned_at: Option<String>,
|
||||
started_at: Option<String>,
|
||||
completed_at: Option<String>,
|
||||
event: &TaskEvent,
|
||||
) -> SqlResult<Option<(Task, Task)>> {
|
||||
let tx = self.conn.transaction()?;
|
||||
let original = {
|
||||
let mut stmt = tx.prepare(
|
||||
"SELECT task_id, source, task_type, priority, status, execution_mode, assigned_agent_id,
|
||||
assigned_host, requirements, labels, branch_name, pr_title, created_at, assigned_at,
|
||||
started_at, completed_at, last_activity_at, retry_count, max_retries, review_count,
|
||||
timeout_seconds
|
||||
FROM tasks WHERE task_id = ?1",
|
||||
)?;
|
||||
let result = match stmt.query_row(params![task_id], Self::row_to_task) {
|
||||
Ok(task) => Ok(Some(task)),
|
||||
Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
|
||||
Err(e) => Err(e),
|
||||
};
|
||||
drop(stmt);
|
||||
result?
|
||||
};
|
||||
|
||||
let Some(original) = original else {
|
||||
tx.commit()?;
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
if original.retry_count >= original.max_retries {
|
||||
tx.commit()?;
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
tx.execute(
|
||||
"UPDATE tasks SET status = ?1, assigned_agent_id = ?2, assigned_at = ?3, started_at = ?4, completed_at = ?5,
|
||||
retry_count = retry_count + 1
|
||||
WHERE task_id = ?6",
|
||||
params![new_status, agent_id, assigned_at, started_at, completed_at, task_id],
|
||||
)?;
|
||||
Self::append_event(&tx, event)?;
|
||||
|
||||
let updated = {
|
||||
let mut stmt = tx.prepare(
|
||||
"SELECT task_id, source, task_type, priority, status, execution_mode, assigned_agent_id,
|
||||
assigned_host, requirements, labels, branch_name, pr_title, created_at, assigned_at,
|
||||
started_at, completed_at, last_activity_at, retry_count, max_retries, review_count,
|
||||
timeout_seconds
|
||||
FROM tasks WHERE task_id = ?1",
|
||||
)?;
|
||||
let result = stmt.query_row(params![task_id], Self::row_to_task)?;
|
||||
drop(stmt);
|
||||
result
|
||||
};
|
||||
tx.commit()?;
|
||||
Ok(Some((original, updated)))
|
||||
}
|
||||
|
||||
pub fn append_event_direct(&self, event: &TaskEvent) -> SqlResult<()> {
|
||||
Self::append_event(&self.conn, event)
|
||||
}
|
||||
|
|
@ -349,322 +492,66 @@ impl EventStore {
|
|||
event.event_type,
|
||||
event.agent_id,
|
||||
event.timestamp.to_rfc3339(),
|
||||
serde_json::to_string(&event.payload).unwrap_or_default(),
|
||||
serde_json::to_string(&event.payload).unwrap_or_else(|_| "{}".into()),
|
||||
],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn transition_task(
|
||||
&mut self,
|
||||
task_id: &str,
|
||||
status: &str,
|
||||
agent_id: Option<&str>,
|
||||
assigned_at: Option<String>,
|
||||
started_at: Option<String>,
|
||||
completed_at: Option<String>,
|
||||
event: &TaskEvent,
|
||||
) -> SqlResult<Task> {
|
||||
let tx = self.conn.transaction()?;
|
||||
|
||||
tx.execute(
|
||||
"UPDATE tasks SET status = ?1,
|
||||
assigned_agent_id = COALESCE(?2, assigned_agent_id),
|
||||
assigned_at = COALESCE(?3, assigned_at),
|
||||
started_at = COALESCE(?4, started_at),
|
||||
completed_at = COALESCE(?5, completed_at)
|
||||
WHERE task_id = ?6",
|
||||
params![status, agent_id, assigned_at, started_at, completed_at, task_id],
|
||||
)?;
|
||||
|
||||
Self::append_event(&tx, event)?;
|
||||
|
||||
let updated = Self::read_task_in_tx(&tx, task_id)?
|
||||
.ok_or(rusqlite::Error::QueryReturnedNoRows)?;
|
||||
|
||||
tx.commit()?;
|
||||
Ok(updated)
|
||||
fn row_to_agent(row: &rusqlite::Row<'_>) -> SqlResult<Agent> {
|
||||
Ok(Agent {
|
||||
agent_id: row.get(0)?,
|
||||
agent_type: AgentType::from_str(&row.get::<_, String>(1)?),
|
||||
hostname: row.get(2)?,
|
||||
capabilities: serde_json::from_str(&row.get::<_, String>(3)?).unwrap_or_default(),
|
||||
max_concurrency: row.get(4)?,
|
||||
current_tasks: row.get(5)?,
|
||||
status: AgentStatus::from_str(&row.get::<_, String>(6)?),
|
||||
last_heartbeat_at: row.get::<_, String>(7)?.parse().unwrap_or_else(|_| Utc::now()),
|
||||
registered_at: row.get::<_, String>(8)?.parse().unwrap_or_else(|_| Utc::now()),
|
||||
metadata: serde_json::from_str(&row.get::<_, String>(9)?).unwrap_or_default(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn retry_and_transition(
|
||||
&mut self,
|
||||
task_id: &str,
|
||||
status: &str,
|
||||
agent_id: Option<&str>,
|
||||
assigned_at: Option<String>,
|
||||
started_at: Option<String>,
|
||||
completed_at: Option<String>,
|
||||
event: &TaskEvent,
|
||||
) -> SqlResult<Option<(Task, Task)>> {
|
||||
let tx = self.conn.transaction()?;
|
||||
|
||||
let original = match Self::read_task_in_tx(&tx, task_id)? {
|
||||
Some(t) => t,
|
||||
None => return Ok(None),
|
||||
fn row_to_task(row: &rusqlite::Row<'_>) -> SqlResult<Task> {
|
||||
let priority = match row.get::<_, String>(3)?.as_str() {
|
||||
"urgent" => Priority::Urgent,
|
||||
"high" => Priority::High,
|
||||
"low" => Priority::Low,
|
||||
_ => Priority::Normal,
|
||||
};
|
||||
|
||||
if original.retry_count >= original.max_retries {
|
||||
tx.commit()?;
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
tx.execute(
|
||||
"UPDATE tasks SET
|
||||
retry_count = retry_count + 1,
|
||||
status = ?1,
|
||||
assigned_agent_id = COALESCE(?2, assigned_agent_id),
|
||||
assigned_at = COALESCE(?3, assigned_at),
|
||||
started_at = COALESCE(?4, started_at),
|
||||
completed_at = COALESCE(?5, completed_at)
|
||||
WHERE task_id = ?6",
|
||||
params![status, agent_id, assigned_at, started_at, completed_at, task_id],
|
||||
)?;
|
||||
|
||||
Self::append_event(&tx, event)?;
|
||||
|
||||
let updated = Self::read_task_in_tx(&tx, task_id)?
|
||||
.ok_or(rusqlite::Error::QueryReturnedNoRows)?;
|
||||
|
||||
tx.commit()?;
|
||||
Ok(Some((original, updated)))
|
||||
}
|
||||
|
||||
pub fn dequeue_and_assign(
|
||||
&mut self,
|
||||
required_capabilities: &[String],
|
||||
agent_id: Option<&str>,
|
||||
assigned_at: String,
|
||||
event: &TaskEvent,
|
||||
) -> SqlResult<Option<Task>> {
|
||||
let tx = self.conn.transaction()?;
|
||||
|
||||
let mut stmt = tx.prepare(
|
||||
"SELECT task_id, source, task_type, priority, status, assigned_agent_id,
|
||||
requirements, labels, created_at, assigned_at, started_at, completed_at,
|
||||
retry_count, max_retries, timeout_seconds
|
||||
FROM tasks
|
||||
WHERE status = 'created'
|
||||
ORDER BY
|
||||
CASE priority
|
||||
WHEN 'urgent' THEN 0
|
||||
WHEN 'high' THEN 1
|
||||
WHEN 'normal' THEN 2
|
||||
WHEN 'low' THEN 3
|
||||
END,
|
||||
created_at ASC",
|
||||
)?;
|
||||
|
||||
let candidates: Vec<Task> = stmt
|
||||
.query_map([], Self::row_to_task)?
|
||||
.collect::<SqlResult<Vec<_>>>()?;
|
||||
drop(stmt);
|
||||
|
||||
let matched = if required_capabilities.is_empty() {
|
||||
candidates.into_iter().next()
|
||||
} else {
|
||||
candidates.into_iter().find(|t| {
|
||||
required_capabilities
|
||||
.iter()
|
||||
.all(|cap| t.labels.iter().any(|l| l == cap) || &t.task_type == cap)
|
||||
})
|
||||
let status = match row.get::<_, String>(4)?.as_str() {
|
||||
"assigned" => TaskStatus::Assigned,
|
||||
"running" => TaskStatus::Running,
|
||||
"review_pending" => TaskStatus::ReviewPending,
|
||||
"completed" => TaskStatus::Completed,
|
||||
"failed" => TaskStatus::Failed,
|
||||
"agent_lost" => TaskStatus::AgentLost,
|
||||
"cancelled" => TaskStatus::Cancelled,
|
||||
_ => TaskStatus::Created,
|
||||
};
|
||||
|
||||
let Some(task) = matched else {
|
||||
tx.commit()?;
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
tx.execute(
|
||||
"UPDATE tasks
|
||||
SET status = 'assigned',
|
||||
assigned_agent_id = COALESCE(?1, assigned_agent_id),
|
||||
assigned_at = ?2
|
||||
WHERE task_id = ?3 AND status = 'created'",
|
||||
params![agent_id, assigned_at, task.task_id],
|
||||
)?;
|
||||
|
||||
if tx.changes() == 0 {
|
||||
tx.commit()?;
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let mut event = event.clone();
|
||||
event.task_id = task.task_id.clone();
|
||||
Self::append_event(&tx, &event)?;
|
||||
|
||||
let updated = Self::read_task_in_tx(&tx, &task.task_id)?
|
||||
.ok_or(rusqlite::Error::QueryReturnedNoRows)?;
|
||||
|
||||
tx.commit()?;
|
||||
Ok(Some(updated))
|
||||
}
|
||||
|
||||
// ─── Helpers ─────────────────────────────────────────────────
|
||||
|
||||
fn read_task_in_tx(tx: &rusqlite::Transaction<'_>, task_id: &str) -> SqlResult<Option<Task>> {
|
||||
let mut stmt = tx.prepare(
|
||||
"SELECT task_id, source, task_type, priority, status, assigned_agent_id,
|
||||
requirements, labels, created_at, assigned_at, started_at, completed_at,
|
||||
retry_count, max_retries, timeout_seconds
|
||||
FROM tasks WHERE task_id = ?1",
|
||||
)?;
|
||||
match stmt.query_row(params![task_id], Self::row_to_task) {
|
||||
Ok(task) => Ok(Some(task)),
|
||||
Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
|
||||
Err(e) => Err(e),
|
||||
}
|
||||
}
|
||||
|
||||
fn row_to_task(row: &rusqlite::Row) -> SqlResult<Task> {
|
||||
let priority_str: String = row.get(3)?;
|
||||
let status_str: String = row.get(4)?;
|
||||
let labels_str: String = row.get(7)?;
|
||||
|
||||
Ok(Task {
|
||||
task_id: row.get(0)?,
|
||||
source: row.get(1)?,
|
||||
task_type: row.get(2)?,
|
||||
priority: match priority_str.as_str() {
|
||||
"urgent" => Priority::Urgent,
|
||||
"high" => Priority::High,
|
||||
"normal" => Priority::Normal,
|
||||
"low" => Priority::Low,
|
||||
_ => Priority::Normal,
|
||||
},
|
||||
status: match status_str.as_str() {
|
||||
"created" => TaskStatus::Created,
|
||||
"assigned" => TaskStatus::Assigned,
|
||||
"running" => TaskStatus::Running,
|
||||
"completed" => TaskStatus::Completed,
|
||||
"failed" => TaskStatus::Failed,
|
||||
"agent_lost" => TaskStatus::AgentLost,
|
||||
"cancelled" => TaskStatus::Cancelled,
|
||||
_ => TaskStatus::Created,
|
||||
},
|
||||
assigned_agent_id: row.get(5)?,
|
||||
requirements: row.get(6)?,
|
||||
labels: serde_json::from_str(&labels_str).unwrap_or_default(),
|
||||
created_at: row.get::<_, String>(8)?.parse().unwrap_or_default(),
|
||||
assigned_at: row.get::<_, Option<String>>(9)?.and_then(|s| s.parse().ok()),
|
||||
started_at: row.get::<_, Option<String>>(10)?.and_then(|s| s.parse().ok()),
|
||||
completed_at: row.get::<_, Option<String>>(11)?.and_then(|s| s.parse().ok()),
|
||||
retry_count: row.get(12)?,
|
||||
max_retries: row.get(13)?,
|
||||
timeout_seconds: row.get::<_, i64>(14)? as u64,
|
||||
})
|
||||
}
|
||||
|
||||
fn row_to_agent(row: &rusqlite::Row) -> SqlResult<Agent> {
|
||||
let agent_type_str: String = row.get(1)?;
|
||||
let capabilities_str: String = row.get(3)?;
|
||||
let status_str: String = row.get(6)?;
|
||||
let last_heartbeat_at: String = row.get(7)?;
|
||||
let registered_at: String = row.get(8)?;
|
||||
let metadata_str: String = row.get(9)?;
|
||||
|
||||
Ok(Agent {
|
||||
agent_id: row.get(0)?,
|
||||
agent_type: AgentType::from_str(&agent_type_str),
|
||||
hostname: row.get(2)?,
|
||||
capabilities: serde_json::from_str(&capabilities_str).unwrap_or_default(),
|
||||
max_concurrency: row.get(4)?,
|
||||
current_tasks: row.get(5)?,
|
||||
status: AgentStatus::from_str(&status_str),
|
||||
last_heartbeat_at: last_heartbeat_at.parse().unwrap_or_default(),
|
||||
registered_at: registered_at.parse().unwrap_or_default(),
|
||||
metadata: serde_json::from_str(&metadata_str).unwrap_or_default(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use tempfile::TempDir;
|
||||
|
||||
fn store() -> (TempDir, EventStore) {
|
||||
let dir = TempDir::new().unwrap();
|
||||
let db = dir.path().join("test.db");
|
||||
let store = EventStore::open(&db).unwrap();
|
||||
(dir, store)
|
||||
}
|
||||
|
||||
fn sample_task(task_id: &str, priority: Priority) -> Task {
|
||||
Task {
|
||||
task_id: task_id.to_string(),
|
||||
source: format!("forgejo:repo#{task_id}"),
|
||||
task_type: "code".into(),
|
||||
priority,
|
||||
status: TaskStatus::Created,
|
||||
assigned_agent_id: None,
|
||||
requirements: "do something".into(),
|
||||
labels: vec!["code:rust".into()],
|
||||
created_at: Utc::now(),
|
||||
assigned_at: None,
|
||||
started_at: None,
|
||||
completed_at: None,
|
||||
retry_count: 0,
|
||||
max_retries: 2,
|
||||
timeout_seconds: 60,
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn append_and_query_events() {
|
||||
let (_dir, store) = store();
|
||||
let event = TaskEvent {
|
||||
event_id: uuid::Uuid::new_v4().to_string(),
|
||||
task_id: "task-1".into(),
|
||||
event_type: "task.created".into(),
|
||||
agent_id: None,
|
||||
timestamp: Utc::now(),
|
||||
payload: serde_json::json!({"ok": true}),
|
||||
};
|
||||
store.append_event_direct(&event).unwrap();
|
||||
let events = store.get_events_for_task("task-1").unwrap();
|
||||
assert_eq!(events.len(), 1);
|
||||
assert_eq!(events[0].event_type, "task.created");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn timeout_detection_uses_per_task_timeout() {
|
||||
let (_dir, store) = store();
|
||||
let mut task = sample_task("task-timeout", Priority::Normal);
|
||||
task.status = TaskStatus::Running;
|
||||
task.started_at = Some(Utc::now() - chrono::Duration::seconds(120));
|
||||
task.timeout_seconds = 60;
|
||||
store.insert_task(&task).unwrap();
|
||||
|
||||
let timed_out = store.find_timed_out_tasks().unwrap();
|
||||
assert_eq!(timed_out, vec!["task-timeout".to_string()]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn dequeue_assigns_highest_priority_task() {
|
||||
let (_dir, mut store) = store();
|
||||
store.insert_task(&sample_task("low", Priority::Low)).unwrap();
|
||||
store.insert_task(&sample_task("urgent", Priority::Urgent)).unwrap();
|
||||
store.insert_task(&sample_task("high", Priority::High)).unwrap();
|
||||
|
||||
let event = TaskEvent {
|
||||
event_id: uuid::Uuid::new_v4().to_string(),
|
||||
task_id: String::new(),
|
||||
event_type: "task.assigned".into(),
|
||||
agent_id: Some("worker-01".into()),
|
||||
timestamp: Utc::now(),
|
||||
payload: serde_json::json!({"reason": "test"}),
|
||||
};
|
||||
|
||||
let task = store
|
||||
.dequeue_and_assign(&["code:rust".into()], Some("worker-01"), Utc::now().to_rfc3339(), &event)
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
|
||||
assert_eq!(task.task_id, "urgent");
|
||||
assert_eq!(task.status, TaskStatus::Assigned);
|
||||
|
||||
let events = store.get_events_for_task("urgent").unwrap();
|
||||
assert_eq!(events.len(), 1);
|
||||
assert_eq!(events[0].task_id, "urgent");
|
||||
status,
|
||||
execution_mode: ExecutionMode::from_str(&row.get::<_, String>(5)?),
|
||||
assigned_agent_id: row.get(6)?,
|
||||
assigned_host: row.get(7)?,
|
||||
requirements: row.get(8)?,
|
||||
labels: serde_json::from_str(&row.get::<_, String>(9)?).unwrap_or_default(),
|
||||
branch_name: row.get(10)?,
|
||||
pr_title: row.get(11)?,
|
||||
created_at: row.get::<_, String>(12)?.parse().unwrap_or_else(|_| Utc::now()),
|
||||
assigned_at: row.get::<_, Option<String>>(13)?.and_then(|s| s.parse().ok()),
|
||||
started_at: row.get::<_, Option<String>>(14)?.and_then(|s| s.parse().ok()),
|
||||
completed_at: row.get::<_, Option<String>>(15)?.and_then(|s| s.parse().ok()),
|
||||
last_activity_at: row.get::<_, Option<String>>(16)?.and_then(|s| s.parse().ok()),
|
||||
retry_count: row.get(17)?,
|
||||
max_retries: row.get(18)?,
|
||||
review_count: row.get(19)?,
|
||||
timeout_seconds: row.get(20)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -86,11 +86,35 @@ pub struct Agent {
|
|||
// ─── Task ────────────────────────────────────────────────────────
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ExecutionMode {
|
||||
SshCli,
|
||||
HttpPull,
|
||||
}
|
||||
|
||||
impl ExecutionMode {
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
Self::SshCli => "ssh_cli",
|
||||
Self::HttpPull => "http_pull",
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_str(value: &str) -> Self {
|
||||
match value {
|
||||
"http_pull" => Self::HttpPull,
|
||||
_ => Self::SshCli,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum TaskStatus {
|
||||
Created,
|
||||
Assigned,
|
||||
Running,
|
||||
ReviewPending,
|
||||
Completed,
|
||||
Failed,
|
||||
AgentLost,
|
||||
|
|
@ -103,6 +127,7 @@ impl TaskStatus {
|
|||
Self::Created => "created",
|
||||
Self::Assigned => "assigned",
|
||||
Self::Running => "running",
|
||||
Self::ReviewPending => "review_pending",
|
||||
Self::Completed => "completed",
|
||||
Self::Failed => "failed",
|
||||
Self::AgentLost => "agent_lost",
|
||||
|
|
@ -147,15 +172,21 @@ pub struct Task {
|
|||
pub task_type: String,
|
||||
pub priority: Priority,
|
||||
pub status: TaskStatus,
|
||||
pub execution_mode: ExecutionMode,
|
||||
pub assigned_agent_id: Option<String>,
|
||||
pub assigned_host: Option<String>,
|
||||
pub requirements: String,
|
||||
pub labels: Vec<String>,
|
||||
pub branch_name: Option<String>,
|
||||
pub pr_title: Option<String>,
|
||||
pub created_at: DateTime<Utc>,
|
||||
pub assigned_at: Option<DateTime<Utc>>,
|
||||
pub started_at: Option<DateTime<Utc>>,
|
||||
pub completed_at: Option<DateTime<Utc>>,
|
||||
pub last_activity_at: Option<DateTime<Utc>>,
|
||||
pub retry_count: u32,
|
||||
pub max_retries: u32,
|
||||
pub review_count: u32,
|
||||
pub timeout_seconds: u64,
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ impl RetryPolicy {
|
|||
Self { sm, store }
|
||||
}
|
||||
|
||||
/// M5: Handle a failed task with a single atomic DB transaction.
|
||||
/// Handle a failed task with a single atomic DB transaction.
|
||||
/// Reads the task, checks retry limit, increments retry_count, and transitions
|
||||
/// to Assigned — all under one lock + transaction to prevent TOCTOU races.
|
||||
pub async fn handle_failure(
|
||||
|
|
@ -30,46 +30,48 @@ impl RetryPolicy {
|
|||
let store = self.store.clone();
|
||||
|
||||
let task_id_log = task_id.clone();
|
||||
let retry_result = tokio::task::spawn_blocking(move || -> Result<RetryDecision, StateError> {
|
||||
let mut store = store.lock().map_err(|e| StateError::Poisoned(e.to_string()))?;
|
||||
let retry_result =
|
||||
tokio::task::spawn_blocking(move || -> Result<RetryDecision, StateError> {
|
||||
let mut store =
|
||||
store.lock().map_err(|e| StateError::Poisoned(e.to_string()))?;
|
||||
|
||||
let now = chrono::Utc::now();
|
||||
let event = TaskEvent {
|
||||
event_id: uuid::Uuid::new_v4().to_string(),
|
||||
task_id: task_id.clone(),
|
||||
event_type: "task.assigned".into(),
|
||||
agent_id: None,
|
||||
timestamp: now,
|
||||
payload: serde_json::json!({
|
||||
"from_status": "failed",
|
||||
"to_status": "assigned",
|
||||
"reason": format!("retry: {reason}"),
|
||||
}),
|
||||
};
|
||||
let now = chrono::Utc::now();
|
||||
let event = TaskEvent {
|
||||
event_id: uuid::Uuid::new_v4().to_string(),
|
||||
task_id: task_id.clone(),
|
||||
event_type: "task.assigned".into(),
|
||||
agent_id: None,
|
||||
timestamp: now,
|
||||
payload: serde_json::json!({
|
||||
"from_status": "failed",
|
||||
"to_status": "assigned",
|
||||
"reason": format!("retry: {reason}"),
|
||||
}),
|
||||
};
|
||||
|
||||
let result = store.retry_and_transition(
|
||||
&task_id,
|
||||
TaskStatus::Assigned.as_str(),
|
||||
None,
|
||||
Some(now.to_rfc3339()),
|
||||
None,
|
||||
None,
|
||||
&event,
|
||||
)?;
|
||||
let result = store.retry_and_transition(
|
||||
&task_id,
|
||||
TaskStatus::Assigned.as_str(),
|
||||
None,
|
||||
Some(now.to_rfc3339()),
|
||||
None,
|
||||
None,
|
||||
&event,
|
||||
)?;
|
||||
|
||||
match result {
|
||||
Some((original, _updated)) => {
|
||||
let attempt = original.retry_count + 1;
|
||||
Ok(RetryDecision::Retried {
|
||||
attempt,
|
||||
max: original.max_retries,
|
||||
})
|
||||
match result {
|
||||
Some((original, _updated)) => {
|
||||
let attempt = original.retry_count + 1;
|
||||
Ok(RetryDecision::Retried {
|
||||
attempt,
|
||||
max: original.max_retries,
|
||||
})
|
||||
}
|
||||
None => Ok(RetryDecision::Exhausted),
|
||||
}
|
||||
None => Ok(RetryDecision::Exhausted),
|
||||
}
|
||||
})
|
||||
.await
|
||||
.map_err(StateError::Join)??;
|
||||
})
|
||||
.await
|
||||
.map_err(StateError::Join)??;
|
||||
|
||||
if matches!(retry_result, RetryDecision::Exhausted) {
|
||||
tracing::warn!(task_id = task_id_log, "max retries exceeded");
|
||||
|
|
@ -98,15 +100,21 @@ mod tests {
|
|||
task_type: "code".into(),
|
||||
priority: Priority::Normal,
|
||||
status: TaskStatus::Failed,
|
||||
execution_mode: ExecutionMode::SshCli,
|
||||
assigned_agent_id: Some("worker-01".into()),
|
||||
assigned_host: None,
|
||||
requirements: "do something".into(),
|
||||
labels: vec!["code:rust".into()],
|
||||
branch_name: None,
|
||||
pr_title: None,
|
||||
created_at: Utc::now(),
|
||||
assigned_at: Some(Utc::now()),
|
||||
started_at: Some(Utc::now()),
|
||||
completed_at: None,
|
||||
last_activity_at: None,
|
||||
retry_count,
|
||||
max_retries,
|
||||
review_count: 0,
|
||||
timeout_seconds: 60,
|
||||
}
|
||||
}
|
||||
|
|
@ -128,7 +136,10 @@ mod tests {
|
|||
store.insert_task(&sample_task("task-1", 0, 2)).unwrap();
|
||||
}
|
||||
|
||||
let result = policy.handle_failure("task-1", Some("worker-01"), "transient").await.unwrap();
|
||||
let result = policy
|
||||
.handle_failure("task-1", Some("worker-01"), "transient")
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(result, RetryDecision::Retried { attempt: 1, max: 2 });
|
||||
}
|
||||
|
||||
|
|
@ -140,7 +151,10 @@ mod tests {
|
|||
store.insert_task(&sample_task("task-2", 2, 2)).unwrap();
|
||||
}
|
||||
|
||||
let result = policy.handle_failure("task-2", Some("worker-01"), "permanent").await.unwrap();
|
||||
let result = policy
|
||||
.handle_failure("task-2", Some("worker-01"), "permanent")
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(result, RetryDecision::Exhausted);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
use chrono::Utc;
|
||||
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use super::event_store::EventStore;
|
||||
|
|
@ -14,26 +13,36 @@ impl StateMachine {
|
|||
Self { store }
|
||||
}
|
||||
|
||||
/// C1 + C2: Single lock scope, spawn_blocking, transactional transition.
|
||||
pub async fn transition(
|
||||
&self,
|
||||
task_id: &str,
|
||||
new_status: TaskStatus,
|
||||
agent_id: Option<&str>,
|
||||
reason: &str,
|
||||
) -> Result<Task, StateError> {
|
||||
self.transition_with_host(task_id, new_status, agent_id, None, reason)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn transition_with_host(
|
||||
&self,
|
||||
task_id: &str,
|
||||
new_status: TaskStatus,
|
||||
agent_id: Option<&str>,
|
||||
assigned_host: Option<&str>,
|
||||
reason: &str,
|
||||
) -> Result<Task, StateError> {
|
||||
let task_id = task_id.to_string();
|
||||
let reason = reason.to_string();
|
||||
let agent_id_owned = agent_id.map(String::from);
|
||||
let host_owned = assigned_host.map(String::from);
|
||||
let store = self.store.clone();
|
||||
|
||||
tokio::task::spawn_blocking(move || -> Result<Task, StateError> {
|
||||
let mut store = store.lock().map_err(|e| StateError::Poisoned(e.to_string()))?;
|
||||
|
||||
let task = store
|
||||
.read_task(&task_id)?
|
||||
.ok_or_else(|| StateError::TaskNotFound(task_id.clone()))?;
|
||||
|
||||
Self::validate_transition(&task.status, &new_status)?;
|
||||
|
||||
let now = Utc::now();
|
||||
|
|
@ -47,6 +56,7 @@ impl StateMachine {
|
|||
"from_status": task.status.as_str(),
|
||||
"to_status": new_status.as_str(),
|
||||
"reason": reason,
|
||||
"assigned_host": host_owned,
|
||||
}),
|
||||
};
|
||||
|
||||
|
|
@ -54,24 +64,19 @@ impl StateMachine {
|
|||
&task_id,
|
||||
new_status.as_str(),
|
||||
agent_id_owned.as_deref(),
|
||||
if new_status == TaskStatus::Assigned {
|
||||
host_owned.as_deref(),
|
||||
if new_status == TaskStatus::Assigned { Some(now.to_rfc3339()) } else { None },
|
||||
if matches!(new_status, TaskStatus::Running | TaskStatus::ReviewPending) {
|
||||
Some(now.to_rfc3339())
|
||||
} else {
|
||||
None
|
||||
},
|
||||
if new_status == TaskStatus::Running {
|
||||
Some(now.to_rfc3339())
|
||||
} else {
|
||||
None
|
||||
},
|
||||
if matches!(
|
||||
new_status,
|
||||
TaskStatus::Completed | TaskStatus::Failed | TaskStatus::Cancelled
|
||||
) {
|
||||
if matches!(new_status, TaskStatus::Completed | TaskStatus::Failed | TaskStatus::Cancelled) {
|
||||
Some(now.to_rfc3339())
|
||||
} else {
|
||||
None
|
||||
},
|
||||
new_status == TaskStatus::ReviewPending,
|
||||
&event,
|
||||
)?)
|
||||
})
|
||||
|
|
@ -82,22 +87,18 @@ impl StateMachine {
|
|||
pub async fn create_task(&self, task: &Task) -> Result<Task, StateError> {
|
||||
let task = task.clone();
|
||||
let store = self.store.clone();
|
||||
|
||||
tokio::task::spawn_blocking(move || -> Result<Task, StateError> {
|
||||
let store = store.lock().map_err(|e| StateError::Poisoned(e.to_string()))?;
|
||||
|
||||
store.insert_task(&task)?;
|
||||
|
||||
let event = TaskEvent {
|
||||
event_id: uuid::Uuid::new_v4().to_string(),
|
||||
task_id: task.task_id.clone(),
|
||||
event_type: "task.created".into(),
|
||||
agent_id: None,
|
||||
timestamp: Utc::now(),
|
||||
payload: serde_json::json!({ "source": task.source }),
|
||||
payload: serde_json::json!({ "source": task.source, "execution_mode": task.execution_mode.as_str() }),
|
||||
};
|
||||
store.append_event_direct(&event)?;
|
||||
|
||||
Ok(task)
|
||||
})
|
||||
.await
|
||||
|
|
@ -110,14 +111,17 @@ impl StateMachine {
|
|||
TaskStatus::Assigned => matches!(to, TaskStatus::Running | TaskStatus::Cancelled),
|
||||
TaskStatus::Running => matches!(
|
||||
to,
|
||||
TaskStatus::Completed
|
||||
TaskStatus::ReviewPending
|
||||
| TaskStatus::Completed
|
||||
| TaskStatus::Failed
|
||||
| TaskStatus::AgentLost
|
||||
| TaskStatus::Cancelled
|
||||
),
|
||||
TaskStatus::Failed | TaskStatus::AgentLost => {
|
||||
matches!(to, TaskStatus::Assigned | TaskStatus::Cancelled)
|
||||
}
|
||||
TaskStatus::ReviewPending => matches!(
|
||||
to,
|
||||
TaskStatus::Assigned | TaskStatus::Running | TaskStatus::Completed | TaskStatus::Failed | TaskStatus::Cancelled
|
||||
),
|
||||
TaskStatus::Failed | TaskStatus::AgentLost => matches!(to, TaskStatus::Assigned | TaskStatus::Cancelled),
|
||||
TaskStatus::Completed | TaskStatus::Cancelled => false,
|
||||
};
|
||||
if !valid {
|
||||
|
|
@ -131,9 +135,9 @@ impl StateMachine {
|
|||
|
||||
pub fn parse_status(s: &str) -> TaskStatus {
|
||||
match s {
|
||||
"created" => TaskStatus::Created,
|
||||
"assigned" => TaskStatus::Assigned,
|
||||
"running" => TaskStatus::Running,
|
||||
"review_pending" => TaskStatus::ReviewPending,
|
||||
"completed" => TaskStatus::Completed,
|
||||
"failed" => TaskStatus::Failed,
|
||||
"agent_lost" => TaskStatus::AgentLost,
|
||||
|
|
@ -156,61 +160,3 @@ pub enum StateError {
|
|||
#[error("mutex poisoned: {0}")]
|
||||
Poisoned(String),
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Minimal `Task` fixture in the `Created` state.
    fn sample_task(task_id: &str) -> Task {
        Task {
            task_id: task_id.to_string(),
            source: format!("forgejo:repo#{task_id}"),
            task_type: String::from("code"),
            priority: Priority::Normal,
            status: TaskStatus::Created,
            assigned_agent_id: None,
            requirements: String::from("do something"),
            labels: vec![String::from("code:rust")],
            created_at: Utc::now(),
            assigned_at: None,
            started_at: None,
            completed_at: None,
            retry_count: 0,
            max_retries: 2,
            timeout_seconds: 60,
        }
    }

    /// Fresh state machine over a temp-dir SQLite store.
    fn test_sm() -> (TempDir, StateMachine) {
        let tmp = TempDir::new().unwrap();
        let store = EventStore::open(&tmp.path().join("test.db")).unwrap();
        (tmp, StateMachine::new(Arc::new(Mutex::new(store))))
    }

    #[tokio::test]
    async fn happy_path_transitions() {
        let (_dir, sm) = test_sm();
        sm.create_task(&sample_task("task-1")).await.unwrap();

        // Walk the full happy path; each step must land in the requested status.
        for (status, reason) in [
            (TaskStatus::Assigned, "assigned"),
            (TaskStatus::Running, "started"),
            (TaskStatus::Completed, "done"),
        ] {
            let task = sm
                .transition("task-1", status.clone(), Some("worker-01"), reason)
                .await
                .unwrap();
            assert_eq!(task.status, status);
        }
    }

    #[tokio::test]
    async fn invalid_transition_rejected() {
        let (_dir, sm) = test_sm();
        sm.create_task(&sample_task("task-2")).await.unwrap();

        // Created → Completed skips Assigned/Running and must be rejected.
        let err = sm
            .transition("task-2", TaskStatus::Completed, Some("worker-01"), "skip")
            .await
            .unwrap_err();
        assert!(matches!(err, StateError::InvalidTransition(_, _)));
    }
}
|
||||
|
|
|
|||
|
|
@ -4,7 +4,6 @@ use super::event_store::EventStore;
|
|||
use super::models::*;
|
||||
use super::state_machine::{StateError, StateMachine};
|
||||
|
||||
/// Global task queue ordered by priority.
|
||||
pub struct TaskQueue {
|
||||
sm: Arc<StateMachine>,
|
||||
store: Arc<Mutex<EventStore>>,
|
||||
|
|
@ -15,15 +14,11 @@ impl TaskQueue {
|
|||
Self { sm, store }
|
||||
}
|
||||
|
||||
/// Enqueue a new task (status = created).
|
||||
pub async fn enqueue(&self, task: Task) -> Result<Task, StateError> {
|
||||
self.sm.create_task(&task).await
|
||||
}
|
||||
|
||||
/// M8: Dequeue the highest-priority task matching capabilities.
|
||||
/// Atomically transitions to `Assigned` inside a single DB transaction
|
||||
/// via `dequeue_and_assign`, preventing concurrent dequeue of the same task.
|
||||
pub async fn dequeue(
|
||||
pub async fn dequeue_http_pull(
|
||||
&self,
|
||||
required_capabilities: &[String],
|
||||
agent_id: Option<&str>,
|
||||
|
|
@ -35,10 +30,8 @@ impl TaskQueue {
|
|||
tokio::task::spawn_blocking(move || -> Result<Option<Task>, StateError> {
|
||||
let mut store = store.lock().map_err(|e| StateError::Poisoned(e.to_string()))?;
|
||||
let now = chrono::Utc::now();
|
||||
|
||||
let event = TaskEvent {
|
||||
event_id: uuid::Uuid::new_v4().to_string(),
|
||||
// task_id filled inside dequeue_and_assign
|
||||
task_id: String::new(),
|
||||
event_type: "task.assigned".into(),
|
||||
agent_id: agent_id_owned.clone(),
|
||||
|
|
@ -47,10 +40,10 @@ impl TaskQueue {
|
|||
"from_status": "created",
|
||||
"to_status": "assigned",
|
||||
"reason": "dequeued",
|
||||
"execution_mode": "http_pull"
|
||||
}),
|
||||
};
|
||||
|
||||
Ok(store.dequeue_and_assign(
|
||||
Ok(store.dequeue_and_assign_http_pull(
|
||||
&caps,
|
||||
agent_id_owned.as_deref(),
|
||||
now.to_rfc3339(),
|
||||
|
|
@ -61,63 +54,9 @@ impl TaskQueue {
|
|||
.map_err(StateError::Join)?
|
||||
}
|
||||
|
||||
/// Re-queue a failed/agent_lost task (delegates to state machine transition).
|
||||
pub async fn requeue(&self, task_id: &str) -> Result<Task, StateError> {
|
||||
self.sm
|
||||
.transition(task_id, TaskStatus::Assigned, None, "re-queued after failure")
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Utc;
    use tempfile::TempDir;

    /// Minimal `Created` task fixture with the given priority.
    fn sample_task(task_id: &str, priority: Priority) -> Task {
        Task {
            task_id: task_id.to_string(),
            source: format!("forgejo:repo#{task_id}"),
            task_type: String::from("code"),
            priority,
            status: TaskStatus::Created,
            assigned_agent_id: None,
            requirements: String::from("do something"),
            labels: vec![String::from("code:rust")],
            created_at: Utc::now(),
            assigned_at: None,
            started_at: None,
            completed_at: None,
            retry_count: 0,
            max_retries: 2,
            timeout_seconds: 60,
        }
    }

    /// Queue wired to a fresh temp-dir event store.
    fn test_queue() -> (TempDir, TaskQueue) {
        let tmp = TempDir::new().unwrap();
        let store = Arc::new(Mutex::new(
            EventStore::open(&tmp.path().join("test.db")).unwrap(),
        ));
        let sm = Arc::new(StateMachine::new(Arc::clone(&store)));
        (tmp, TaskQueue::new(sm, store))
    }

    #[tokio::test]
    async fn dequeues_by_priority() {
        let (_dir, queue) = test_queue();

        // Enqueue in mixed order; dequeue must still pick the urgent one.
        for (id, prio) in [
            ("low", Priority::Low),
            ("urgent", Priority::Urgent),
            ("high", Priority::High),
        ] {
            queue.enqueue(sample_task(id, prio)).await.unwrap();
        }

        let task = queue
            .dequeue(&["code:rust".into()], Some("worker-01"))
            .await
            .unwrap()
            .unwrap();

        assert_eq!(task.task_id, "urgent");
        assert_eq!(task.status, TaskStatus::Assigned);
    }
}
|
||||
|
|
|
|||
|
|
@ -40,17 +40,17 @@ impl TimeoutChecker {
|
|||
}
|
||||
}
|
||||
|
||||
/// M6: Uses per-task `timeout_seconds` from the DB instead of a global timeout.
|
||||
/// Uses per-task `timeout_seconds` from the DB instead of a global timeout.
|
||||
pub async fn check_timeouts(&self) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
|
||||
let timed_out = {
|
||||
let store = self.store.lock().map_err(|e| e.to_string())?;
|
||||
store.find_timed_out_tasks()?
|
||||
};
|
||||
|
||||
for task_id in timed_out {
|
||||
for task_id in &timed_out {
|
||||
match self
|
||||
.sm
|
||||
.transition(&task_id, TaskStatus::Failed, None, "timeout")
|
||||
.transition(task_id, TaskStatus::Failed, None, "timeout")
|
||||
.await
|
||||
{
|
||||
Ok(_) => tracing::warn!(task_id = task_id, "task timed out"),
|
||||
|
|
@ -74,15 +74,21 @@ mod tests {
|
|||
task_type: "code".into(),
|
||||
priority: Priority::Normal,
|
||||
status: TaskStatus::Running,
|
||||
execution_mode: ExecutionMode::SshCli,
|
||||
assigned_agent_id: Some("worker-01".into()),
|
||||
assigned_host: None,
|
||||
requirements: "do something".into(),
|
||||
labels: vec!["code:rust".into()],
|
||||
branch_name: None,
|
||||
pr_title: None,
|
||||
created_at: Utc::now(),
|
||||
assigned_at: Some(Utc::now()),
|
||||
started_at: Some(Utc::now() - chrono::Duration::seconds(120)),
|
||||
completed_at: None,
|
||||
last_activity_at: None,
|
||||
retry_count: 0,
|
||||
max_retries: 2,
|
||||
review_count: 0,
|
||||
timeout_seconds: 60,
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue