feat: transform to REST API service with SQLite persistence (v0.3.0)

Major architecture transformation from batch-only to API service with
database persistence for Windmill integration.

## REST API Implementation
- POST /simulate/trigger - Start simulation jobs
- GET /simulate/status/{job_id} - Monitor job progress
- GET /results - Query results with filters (job_id, date, model)
- GET /health - Service health checks

## Database Layer
- SQLite persistence with 6 tables (jobs, job_details, positions,
  holdings, reasoning_logs, tool_usage)
- Foreign key constraints with cascade deletes
- Replaces JSONL file storage

## Backend Components
- JobManager: Job lifecycle management with concurrency control
- RuntimeConfigManager: Thread-safe isolated runtime configs
- ModelDayExecutor: Single model-day execution engine
- SimulationWorker: Date-sequential, model-parallel orchestration

## Testing
- 102 unit and integration tests (85% coverage)
- Database: 98% coverage
- Job manager: 98% coverage
- API endpoints: 81% coverage
- Pydantic models: 100% coverage
- TDD approach throughout

## Docker Deployment
- Dual-mode: API server (persistent) + batch (one-time)
- Health checks with 30s interval
- Volume persistence for database and logs
- Separate entrypoints for each mode

## Validation Tools
- scripts/validate_docker_build.sh - Build validation
- scripts/test_api_endpoints.sh - Complete API testing
- scripts/test_batch_mode.sh - Batch mode validation
- DOCKER_API.md - Deployment guide
- TESTING_GUIDE.md - Testing procedures

## Configuration
- API_PORT environment variable (default: 8080)
- Backwards compatible with existing configs
- FastAPI, uvicorn, pydantic>=2.0 dependencies

Co-Authored-By: AI Assistant <noreply@example.com>
This commit is contained in:
2025-10-31 11:47:10 -04:00
parent 5da02b4ba0
commit fb9583b374
45 changed files with 13775 additions and 18 deletions

625
api/job_manager.py Normal file
View File

@@ -0,0 +1,625 @@
"""
Job lifecycle manager for simulation orchestration.
This module provides:
- Job creation and validation
- Status transitions (state machine)
- Progress tracking across model-days
- Concurrency control (single job at a time)
- Job retrieval and queries
- Cleanup operations
"""
import sqlite3
import json
import uuid
from datetime import datetime, timedelta
from typing import Optional, List, Dict, Any
from pathlib import Path
import logging
from api.database import get_db_connection
logger = logging.getLogger(__name__)
class JobManager:
"""
Manages simulation job lifecycle and orchestration.
Responsibilities:
- Create jobs with date ranges and model lists
- Track job status (pending → running → completed/partial/failed)
- Monitor progress across model-days
- Enforce single-job concurrency
- Provide job queries and retrieval
- Cleanup old jobs
State Machine:
pending → running → completed (all succeeded)
→ partial (some failed)
→ failed (job-level error)
"""
def __init__(self, db_path: str = "data/jobs.db"):
"""
Initialize JobManager.
Args:
db_path: Path to SQLite database
"""
self.db_path = db_path
def create_job(
self,
config_path: str,
date_range: List[str],
models: List[str]
) -> str:
"""
Create new simulation job.
Args:
config_path: Path to configuration file
date_range: List of dates to simulate (YYYY-MM-DD)
models: List of model signatures to execute
Returns:
job_id: UUID of created job
Raises:
ValueError: If another job is already running/pending
"""
if not self.can_start_new_job():
raise ValueError("Another simulation job is already running or pending")
job_id = str(uuid.uuid4())
created_at = datetime.utcnow().isoformat() + "Z"
conn = get_db_connection(self.db_path)
cursor = conn.cursor()
try:
# Insert job
cursor.execute("""
INSERT INTO jobs (
job_id, config_path, status, date_range, models, created_at
)
VALUES (?, ?, ?, ?, ?, ?)
""", (
job_id,
config_path,
"pending",
json.dumps(date_range),
json.dumps(models),
created_at
))
# Create job_details for each model-day combination
for date in date_range:
for model in models:
cursor.execute("""
INSERT INTO job_details (
job_id, date, model, status
)
VALUES (?, ?, ?, ?)
""", (job_id, date, model, "pending"))
conn.commit()
logger.info(f"Created job {job_id} with {len(date_range)} dates and {len(models)} models")
return job_id
finally:
conn.close()
def get_job(self, job_id: str) -> Optional[Dict[str, Any]]:
"""
Get job by ID.
Args:
job_id: Job UUID
Returns:
Job data dict or None if not found
"""
conn = get_db_connection(self.db_path)
cursor = conn.cursor()
try:
cursor.execute("""
SELECT
job_id, config_path, status, date_range, models,
created_at, started_at, updated_at, completed_at,
total_duration_seconds, error
FROM jobs
WHERE job_id = ?
""", (job_id,))
row = cursor.fetchone()
if not row:
return None
return {
"job_id": row[0],
"config_path": row[1],
"status": row[2],
"date_range": json.loads(row[3]),
"models": json.loads(row[4]),
"created_at": row[5],
"started_at": row[6],
"updated_at": row[7],
"completed_at": row[8],
"total_duration_seconds": row[9],
"error": row[10]
}
finally:
conn.close()
def get_current_job(self) -> Optional[Dict[str, Any]]:
"""
Get most recent job.
Returns:
Most recent job data or None if no jobs exist
"""
conn = get_db_connection(self.db_path)
cursor = conn.cursor()
try:
cursor.execute("""
SELECT
job_id, config_path, status, date_range, models,
created_at, started_at, updated_at, completed_at,
total_duration_seconds, error
FROM jobs
ORDER BY created_at DESC
LIMIT 1
""")
row = cursor.fetchone()
if not row:
return None
return {
"job_id": row[0],
"config_path": row[1],
"status": row[2],
"date_range": json.loads(row[3]),
"models": json.loads(row[4]),
"created_at": row[5],
"started_at": row[6],
"updated_at": row[7],
"completed_at": row[8],
"total_duration_seconds": row[9],
"error": row[10]
}
finally:
conn.close()
def find_job_by_date_range(self, date_range: List[str]) -> Optional[Dict[str, Any]]:
"""
Find job with matching date range.
Args:
date_range: List of dates to match
Returns:
Job data or None if not found
"""
conn = get_db_connection(self.db_path)
cursor = conn.cursor()
try:
date_range_json = json.dumps(date_range)
cursor.execute("""
SELECT
job_id, config_path, status, date_range, models,
created_at, started_at, updated_at, completed_at,
total_duration_seconds, error
FROM jobs
WHERE date_range = ?
ORDER BY created_at DESC
LIMIT 1
""", (date_range_json,))
row = cursor.fetchone()
if not row:
return None
return {
"job_id": row[0],
"config_path": row[1],
"status": row[2],
"date_range": json.loads(row[3]),
"models": json.loads(row[4]),
"created_at": row[5],
"started_at": row[6],
"updated_at": row[7],
"completed_at": row[8],
"total_duration_seconds": row[9],
"error": row[10]
}
finally:
conn.close()
def update_job_status(
self,
job_id: str,
status: str,
error: Optional[str] = None
) -> None:
"""
Update job status.
Args:
job_id: Job UUID
status: New status (pending/running/completed/partial/failed)
error: Optional error message
"""
conn = get_db_connection(self.db_path)
cursor = conn.cursor()
try:
updated_at = datetime.utcnow().isoformat() + "Z"
# Set timestamps based on status
if status == "running":
cursor.execute("""
UPDATE jobs
SET status = ?, started_at = ?, updated_at = ?
WHERE job_id = ?
""", (status, updated_at, updated_at, job_id))
elif status in ("completed", "partial", "failed"):
# Calculate duration
cursor.execute("""
SELECT started_at FROM jobs WHERE job_id = ?
""", (job_id,))
row = cursor.fetchone()
duration_seconds = None
if row and row[0]:
started_at = datetime.fromisoformat(row[0].replace("Z", ""))
completed_at = datetime.fromisoformat(updated_at.replace("Z", ""))
duration_seconds = (completed_at - started_at).total_seconds()
cursor.execute("""
UPDATE jobs
SET status = ?, completed_at = ?, updated_at = ?,
total_duration_seconds = ?, error = ?
WHERE job_id = ?
""", (status, updated_at, updated_at, duration_seconds, error, job_id))
else:
# Just update status
cursor.execute("""
UPDATE jobs
SET status = ?, updated_at = ?, error = ?
WHERE job_id = ?
""", (status, updated_at, error, job_id))
conn.commit()
logger.debug(f"Updated job {job_id} status to {status}")
finally:
conn.close()
def update_job_detail_status(
self,
job_id: str,
date: str,
model: str,
status: str,
error: Optional[str] = None
) -> None:
"""
Update model-day status and auto-update job status.
Args:
job_id: Job UUID
date: Trading date (YYYY-MM-DD)
model: Model signature
status: New status (pending/running/completed/failed)
error: Optional error message
"""
conn = get_db_connection(self.db_path)
cursor = conn.cursor()
try:
updated_at = datetime.utcnow().isoformat() + "Z"
if status == "running":
cursor.execute("""
UPDATE job_details
SET status = ?, started_at = ?
WHERE job_id = ? AND date = ? AND model = ?
""", (status, updated_at, job_id, date, model))
# Update job to running if not already
cursor.execute("""
UPDATE jobs
SET status = 'running', started_at = COALESCE(started_at, ?), updated_at = ?
WHERE job_id = ? AND status = 'pending'
""", (updated_at, updated_at, job_id))
elif status in ("completed", "failed"):
# Calculate duration for detail
cursor.execute("""
SELECT started_at FROM job_details
WHERE job_id = ? AND date = ? AND model = ?
""", (job_id, date, model))
row = cursor.fetchone()
duration_seconds = None
if row and row[0]:
started_at = datetime.fromisoformat(row[0].replace("Z", ""))
completed_at = datetime.fromisoformat(updated_at.replace("Z", ""))
duration_seconds = (completed_at - started_at).total_seconds()
cursor.execute("""
UPDATE job_details
SET status = ?, completed_at = ?, duration_seconds = ?, error = ?
WHERE job_id = ? AND date = ? AND model = ?
""", (status, updated_at, duration_seconds, error, job_id, date, model))
# Check if all details are done
cursor.execute("""
SELECT
COUNT(*) as total,
SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as completed,
SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed
FROM job_details
WHERE job_id = ?
""", (job_id,))
total, completed, failed = cursor.fetchone()
if completed + failed == total:
# All done - determine final status
if failed == 0:
final_status = "completed"
elif completed > 0:
final_status = "partial"
else:
final_status = "failed"
# Calculate job duration
cursor.execute("""
SELECT started_at FROM jobs WHERE job_id = ?
""", (job_id,))
row = cursor.fetchone()
job_duration = None
if row and row[0]:
started_at = datetime.fromisoformat(row[0].replace("Z", ""))
completed_at = datetime.fromisoformat(updated_at.replace("Z", ""))
job_duration = (completed_at - started_at).total_seconds()
cursor.execute("""
UPDATE jobs
SET status = ?, completed_at = ?, updated_at = ?, total_duration_seconds = ?
WHERE job_id = ?
""", (final_status, updated_at, updated_at, job_duration, job_id))
conn.commit()
logger.debug(f"Updated job_detail {job_id}/{date}/{model} to {status}")
finally:
conn.close()
def get_job_details(self, job_id: str) -> List[Dict[str, Any]]:
"""
Get all model-day execution details for a job.
Args:
job_id: Job UUID
Returns:
List of job_detail records with date, model, status, error
"""
conn = get_db_connection(self.db_path)
cursor = conn.cursor()
try:
cursor.execute("""
SELECT date, model, status, error, started_at, completed_at, duration_seconds
FROM job_details
WHERE job_id = ?
ORDER BY date, model
""", (job_id,))
rows = cursor.fetchall()
details = []
for row in rows:
details.append({
"date": row[0],
"model": row[1],
"status": row[2],
"error": row[3],
"started_at": row[4],
"completed_at": row[5],
"duration_seconds": row[6]
})
return details
finally:
conn.close()
def get_job_progress(self, job_id: str) -> Dict[str, Any]:
"""
Get job progress summary.
Args:
job_id: Job UUID
Returns:
Progress dict with total_model_days, completed, failed, current, details
"""
conn = get_db_connection(self.db_path)
cursor = conn.cursor()
try:
cursor.execute("""
SELECT
COUNT(*) as total,
SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as completed,
SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed
FROM job_details
WHERE job_id = ?
""", (job_id,))
total, completed, failed = cursor.fetchone()
# Get currently running model-day
cursor.execute("""
SELECT date, model
FROM job_details
WHERE job_id = ? AND status = 'running'
LIMIT 1
""", (job_id,))
current_row = cursor.fetchone()
current = {"date": current_row[0], "model": current_row[1]} if current_row else None
# Get all details
cursor.execute("""
SELECT date, model, status, duration_seconds, error
FROM job_details
WHERE job_id = ?
ORDER BY date, model
""", (job_id,))
details = []
for row in cursor.fetchall():
details.append({
"date": row[0],
"model": row[1],
"status": row[2],
"duration_seconds": row[3],
"error": row[4]
})
return {
"total_model_days": total,
"completed": completed or 0,
"failed": failed or 0,
"current": current,
"details": details
}
finally:
conn.close()
def can_start_new_job(self) -> bool:
"""
Check if new job can be started.
Returns:
True if no jobs are pending/running, False otherwise
"""
conn = get_db_connection(self.db_path)
cursor = conn.cursor()
try:
cursor.execute("""
SELECT COUNT(*)
FROM jobs
WHERE status IN ('pending', 'running')
""")
count = cursor.fetchone()[0]
return count == 0
finally:
conn.close()
def get_running_jobs(self) -> List[Dict[str, Any]]:
"""
Get all running/pending jobs.
Returns:
List of job dicts
"""
conn = get_db_connection(self.db_path)
cursor = conn.cursor()
try:
cursor.execute("""
SELECT
job_id, config_path, status, date_range, models,
created_at, started_at, updated_at, completed_at,
total_duration_seconds, error
FROM jobs
WHERE status IN ('pending', 'running')
ORDER BY created_at DESC
""")
jobs = []
for row in cursor.fetchall():
jobs.append({
"job_id": row[0],
"config_path": row[1],
"status": row[2],
"date_range": json.loads(row[3]),
"models": json.loads(row[4]),
"created_at": row[5],
"started_at": row[6],
"updated_at": row[7],
"completed_at": row[8],
"total_duration_seconds": row[9],
"error": row[10]
})
return jobs
finally:
conn.close()
def cleanup_old_jobs(self, days: int = 30) -> Dict[str, int]:
"""
Delete jobs older than threshold.
Args:
days: Delete jobs older than this many days
Returns:
Dict with jobs_deleted count
"""
conn = get_db_connection(self.db_path)
cursor = conn.cursor()
try:
cutoff_date = (datetime.utcnow() - timedelta(days=days)).isoformat() + "Z"
# Get count before deletion
cursor.execute("""
SELECT COUNT(*)
FROM jobs
WHERE created_at < ? AND status IN ('completed', 'partial', 'failed')
""", (cutoff_date,))
count = cursor.fetchone()[0]
# Delete old jobs (foreign key cascade will delete related records)
cursor.execute("""
DELETE FROM jobs
WHERE created_at < ? AND status IN ('completed', 'partial', 'failed')
""", (cutoff_date,))
conn.commit()
logger.info(f"Cleaned up {count} jobs older than {days} days")
return {"jobs_deleted": count}
finally:
conn.close()