fix: cleanup stale jobs on container restart to unblock new job creation

When a Docker container was shut down and restarted, jobs with status
'pending', 'downloading_data', or 'running' remained in the database,
preventing new jobs from starting due to concurrency control checks.

This commit adds automatic cleanup of stale jobs during FastAPI startup:

- New cleanup_stale_jobs() method in JobManager (api/job_manager.py:702-779)
- Integrated into FastAPI lifespan startup (api/main.py:164-168)
- Intelligent status determination based on completed model-days:
  - 'partial' if any model-days completed (preserves progress data)
  - 'failed' if no progress made
- Detailed error messages with original status and completion counts
- Marks incomplete job_details as 'failed' with clear error messages
- Deployment-aware: skips cleanup in DEV mode when DB is reset
- Comprehensive logging at warning level for visibility

Testing:
- 6 new unit tests covering all cleanup scenarios (451-609)
- All 30 existing job_manager tests still pass
- Tests verify pending, running, downloading_data, partial progress,
  no stale jobs, and multiple stale jobs scenarios

Resolves issue where container restarts left stale jobs blocking the
can_start_new_job() concurrency check.
This commit is contained in:
2025-11-06 21:24:45 -05:00
parent 6ddc5abede
commit 406bb281b2
4 changed files with 269 additions and 7 deletions

View File

@@ -699,6 +699,85 @@ class JobManager:
finally:
conn.close()
def cleanup_stale_jobs(self) -> Dict[str, int]:
    """
    Clean up stale jobs left behind by a container restart.

    Every job whose status is 'pending', 'downloading_data', or 'running'
    is closed out:

    - 'partial' if at least one of its job_details rows is 'completed'
    - 'failed' if none are

    Any job_details rows of that job still 'pending' or 'running' are
    marked 'failed'. Intended to be called on application startup to
    reset interrupted jobs.

    Returns:
        Dict with a single key, "jobs_cleaned": the number of jobs whose
        status was reset.
    """
    conn = get_db_connection(self.db_path)
    cursor = conn.cursor()

    try:
        # Find all stale jobs
        cursor.execute("""
            SELECT job_id, status
            FROM jobs
            WHERE status IN ('pending', 'downloading_data', 'running')
        """)
        stale_jobs = cursor.fetchall()

        cleaned_count = 0
        for job_id, original_status in stale_jobs:
            # Count the job's model-days BEFORE rewriting job_details below,
            # so the totals describe the state the job was interrupted in.
            cursor.execute("""
                SELECT
                    COUNT(*) as total,
                    SUM(CASE WHEN status = 'completed' THEN 1 ELSE 0 END) as completed,
                    SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END) as failed
                FROM job_details
                WHERE job_id = ?
            """, (job_id,))
            total, completed, failed = cursor.fetchone()

            # SUM() yields NULL (None) when the job has no job_details rows.
            completed = completed or 0
            failed = failed or 0

            # Determine final status based on completion
            if completed > 0:
                new_status = "partial"
                error_msg = f"Job interrupted by container restart (was {original_status}, {completed}/{total} model-days completed)"
            else:
                new_status = "failed"
                error_msg = f"Job interrupted by container restart (was {original_status}, no progress made)"

            # Mark incomplete job_details as failed
            cursor.execute("""
                UPDATE job_details
                SET status = 'failed', error = 'Container restarted before completion'
                WHERE job_id = ? AND status IN ('pending', 'running')
            """, (job_id,))

            # Update job status; the same timestamp is used for both
            # completed_at and updated_at.
            updated_at = datetime.utcnow().isoformat() + "Z"
            cursor.execute("""
                UPDATE jobs
                SET status = ?, error = ?, completed_at = ?, updated_at = ?
                WHERE job_id = ?
            """, (new_status, error_msg, updated_at, updated_at, job_id))

            # Separator between old and new status keeps the transition
            # readable (previously the two values were printed fused).
            logger.warning(f"Cleaned up stale job {job_id}: {original_status} → {new_status} ({completed}/{total} completed)")
            cleaned_count += 1

        # Single commit after the loop: all stale jobs are reset together.
        conn.commit()

        if cleaned_count > 0:
            logger.warning(f"⚠️ Cleaned up {cleaned_count} stale job(s) from previous container session")
        else:
            logger.info("✅ No stale jobs found")

        return {"jobs_cleaned": cleaned_count}
    finally:
        conn.close()
def cleanup_old_jobs(self, days: int = 30) -> Dict[str, int]:
"""
Delete jobs older than threshold.

View File

@@ -134,25 +134,39 @@ def create_app(
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Initialize database on startup, cleanup on shutdown if needed"""
from tools.deployment_config import is_dev_mode, get_db_path
from tools.deployment_config import is_dev_mode, get_db_path, should_preserve_dev_data
from api.database import initialize_dev_database, initialize_database
# Startup - use closure to access db_path from create_app scope
logger.info("🚀 FastAPI application starting...")
logger.info("📊 Initializing database...")
should_cleanup_stale_jobs = False
if is_dev_mode():
# Initialize dev database (reset unless PRESERVE_DEV_DATA=true)
logger.info(" 🔧 DEV mode detected - initializing dev database")
dev_db_path = get_db_path(db_path)
initialize_dev_database(dev_db_path)
log_dev_mode_startup_warning()
# Only cleanup stale jobs if preserving dev data (otherwise DB is fresh)
if should_preserve_dev_data():
should_cleanup_stale_jobs = True
else:
# Ensure production database schema exists
logger.info(" 🏭 PROD mode - ensuring database schema exists")
initialize_database(db_path)
should_cleanup_stale_jobs = True
logger.info("✅ Database initialized")
# Clean up stale jobs from previous container session
if should_cleanup_stale_jobs:
logger.info("🧹 Checking for stale jobs from previous session...")
job_manager = JobManager(get_db_path(db_path) if is_dev_mode() else db_path)
job_manager.cleanup_stale_jobs()
logger.info("🌐 API server ready to accept requests")
yield