ArchiveBox/archivebox/workers/pid_utils.py
2025-12-24 20:10:38 -08:00

192 lines
5.2 KiB
Python

"""
PID file utilities for tracking worker and orchestrator processes.
PID files are stored in data/tmp/workers/ and contain:
- Line 1: PID
- Line 2: Worker type (orchestrator, crawl, snapshot, archiveresult)
- Line 3: Extractor filter (optional, for archiveresult workers)
- Line 4: Started at ISO timestamp
"""
__package__ = 'archivebox.workers'
import os
import signal
from pathlib import Path
from datetime import datetime, timezone
from django.conf import settings
def get_pid_dir() -> Path:
"""Get the directory for PID files, creating it if needed."""
pid_dir = Path(settings.DATA_DIR) / 'tmp' / 'workers'
pid_dir.mkdir(parents=True, exist_ok=True)
return pid_dir
def write_pid_file(worker_type: str, worker_id: int = 0, extractor: str | None = None) -> Path:
"""
Write a PID file for the current process.
Returns the path to the PID file.
"""
pid_dir = get_pid_dir()
if worker_type == 'orchestrator':
pid_file = pid_dir / 'orchestrator.pid'
else:
pid_file = pid_dir / f'{worker_type}_worker_{worker_id}.pid'
content = f"{os.getpid()}\n{worker_type}\n{extractor or ''}\n{datetime.now(timezone.utc).isoformat()}\n"
pid_file.write_text(content)
return pid_file
def read_pid_file(path: Path) -> dict | None:
"""
Read and parse a PID file.
Returns dict with pid, worker_type, extractor, started_at or None if invalid.
"""
try:
if not path.exists():
return None
lines = path.read_text().strip().split('\n')
if len(lines) < 4:
return None
return {
'pid': int(lines[0]),
'worker_type': lines[1],
'extractor': lines[2] or None,
'started_at': datetime.fromisoformat(lines[3]),
'pid_file': path,
}
except (ValueError, IndexError, OSError):
return None
def remove_pid_file(path: Path) -> None:
"""Remove a PID file if it exists."""
try:
path.unlink(missing_ok=True)
except OSError:
pass
def is_process_alive(pid: int) -> bool:
"""Check if a process with the given PID is still running."""
try:
os.kill(pid, 0) # Signal 0 doesn't kill, just checks
return True
except (OSError, ProcessLookupError):
return False
def get_all_pid_files() -> list[Path]:
"""Get all PID files in the workers directory."""
pid_dir = get_pid_dir()
return list(pid_dir.glob('*.pid'))
def get_all_worker_pids(worker_type: str | None = None) -> list[dict]:
"""
Get info about all running workers.
Optionally filter by worker_type.
"""
workers = []
for pid_file in get_all_pid_files():
info = read_pid_file(pid_file)
if info is None:
continue
# Skip if process is dead
if not is_process_alive(info['pid']):
continue
# Filter by type if specified
if worker_type and info['worker_type'] != worker_type:
continue
workers.append(info)
return workers
def cleanup_stale_pid_files() -> int:
"""
Remove PID files for processes that are no longer running.
Returns the number of stale files removed.
"""
removed = 0
for pid_file in get_all_pid_files():
info = read_pid_file(pid_file)
if info is None:
# Invalid PID file, remove it
remove_pid_file(pid_file)
removed += 1
continue
if not is_process_alive(info['pid']):
remove_pid_file(pid_file)
removed += 1
return removed
def get_running_worker_count(worker_type: str) -> int:
"""Get the count of running workers of a specific type."""
return len(get_all_worker_pids(worker_type))
def get_next_worker_id(worker_type: str) -> int:
"""Get the next available worker ID for a given type."""
existing_ids = set()
for pid_file in get_all_pid_files():
# Parse worker ID from filename like "snapshot_worker_3.pid"
name = pid_file.stem
if name.startswith(f'{worker_type}_worker_'):
try:
worker_id = int(name.split('_')[-1])
existing_ids.add(worker_id)
except ValueError:
continue
# Find the lowest unused ID
next_id = 0
while next_id in existing_ids:
next_id += 1
return next_id
def stop_worker(pid: int, graceful: bool = True) -> bool:
"""
Stop a worker process.
If graceful=True, sends SIGTERM first, then SIGKILL after timeout.
Returns True if process was stopped.
"""
if not is_process_alive(pid):
return True
try:
if graceful:
os.kill(pid, signal.SIGTERM)
# Give it a moment to shut down
import time
for _ in range(10): # Wait up to 1 second
time.sleep(0.1)
if not is_process_alive(pid):
return True
# Force kill if still running
os.kill(pid, signal.SIGKILL)
else:
os.kill(pid, signal.SIGKILL)
return True
except (OSError, ProcessLookupError):
return True # Process already dead