Persist running jobs to registry for TUI reconnection

Jobs are now saved to gniza-jobs.json in WORK_DIR when they start and
finish. On TUI restart, the registry is loaded and PIDs are checked —
still-running jobs appear in the Running Tasks screen and can be
killed. Reconnected jobs are polled every second to detect completion.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
shuki
2026-03-06 19:27:37 +02:00
parent 276b49ea0a
commit ba93a74f8e
2 changed files with 123 additions and 0 deletions

View File

@@ -1,9 +1,11 @@
import asyncio import asyncio
import json
import os import os
import signal import signal
import uuid import uuid
from dataclasses import dataclass, field from dataclasses import dataclass, field
from datetime import datetime from datetime import datetime
from pathlib import Path
from textual.message import Message from textual.message import Message
@@ -12,6 +14,16 @@ from tui.backend import start_cli_process
MAX_OUTPUT_LINES = 10_000 MAX_OUTPUT_LINES = 10_000
def _work_dir() -> Path:
    """Return the directory holding gniza's mutable state.

    Root uses the fixed system-wide location; any other user follows the
    XDG Base Directory spec ($XDG_STATE_HOME, default ~/.local/state).
    """
    if os.geteuid() == 0:
        return Path("/usr/local/gniza/workdir")
    # Per the XDG spec an *empty* XDG_STATE_HOME must be treated the same
    # as unset; `or` covers both, unlike a .get() default argument.
    state_home = os.environ.get("XDG_STATE_HOME") or str(Path.home() / ".local" / "state")
    return Path(state_home) / "gniza" / "workdir"


# Registry of running jobs, used to reconnect after a TUI restart.
REGISTRY_FILE = _work_dir() / "gniza-jobs.json"
class JobFinished(Message): class JobFinished(Message):
def __init__(self, job_id: str, return_code: int) -> None: def __init__(self, job_id: str, return_code: int) -> None:
super().__init__() super().__init__()
@@ -30,12 +42,16 @@ class Job:
return_code: int | None = None return_code: int | None = None
output: list[str] = field(default_factory=list) output: list[str] = field(default_factory=list)
_proc: asyncio.subprocess.Process | None = field(default=None, repr=False) _proc: asyncio.subprocess.Process | None = field(default=None, repr=False)
_pid: int | None = field(default=None, repr=False)
_pgid: int | None = field(default=None, repr=False)
_reconnected: bool = field(default=False, repr=False)
class JobManager: class JobManager:
def __init__(self) -> None: def __init__(self) -> None:
self._jobs: dict[str, Job] = {} self._jobs: dict[str, Job] = {}
self._load_registry()
def create_job(self, kind: str, label: str) -> Job: def create_job(self, kind: str, label: str) -> Job:
job = Job(id=uuid.uuid4().hex[:8], kind=kind, label=label) job = Job(id=uuid.uuid4().hex[:8], kind=kind, label=label)
@@ -53,6 +69,7 @@ class JobManager:
def remove_finished(self) -> None: def remove_finished(self) -> None:
self._jobs = {k: v for k, v in self._jobs.items() if v.status == "running"} self._jobs = {k: v for k, v in self._jobs.items() if v.status == "running"}
self._save_registry()
def start_job(self, app, job: Job, *cli_args: str) -> None: def start_job(self, app, job: Job, *cli_args: str) -> None:
asyncio.create_task(self.run_job(app, job, *cli_args)) asyncio.create_task(self.run_job(app, job, *cli_args))
@@ -60,6 +77,12 @@ class JobManager:
async def run_job(self, app, job: Job, *cli_args: str) -> int: async def run_job(self, app, job: Job, *cli_args: str) -> int:
proc = await start_cli_process(*cli_args) proc = await start_cli_process(*cli_args)
job._proc = proc job._proc = proc
job._pid = proc.pid
try:
job._pgid = os.getpgid(proc.pid)
except OSError:
job._pgid = None
self._save_registry()
try: try:
while True: while True:
line = await proc.stdout.readline() line = await proc.stdout.readline()
@@ -78,6 +101,8 @@ class JobManager:
finally: finally:
job.finished_at = datetime.now() job.finished_at = datetime.now()
job._proc = None job._proc = None
job._reconnected = False
self._save_registry()
rc = job.return_code if job.return_code is not None else 1 rc = job.return_code if job.return_code is not None else 1
app.post_message(JobFinished(job.id, rc)) app.post_message(JobFinished(job.id, rc))
return job.return_code if job.return_code is not None else 1 return job.return_code if job.return_code is not None else 1
@@ -98,6 +123,18 @@ class JobManager:
job = self._jobs.get(job_id) job = self._jobs.get(job_id)
if not job: if not job:
return "job not found" return "job not found"
# Reconnected jobs: use stored PID/PGID
if job._reconnected and job._pid:
try:
pgid = job._pgid or os.getpgid(job._pid)
os.killpg(pgid, signal.SIGKILL)
return f"killed pgid={pgid} (pid={job._pid})"
except (ProcessLookupError, PermissionError, OSError) as e:
try:
os.kill(job._pid, signal.SIGKILL)
return f"fallback kill pid={job._pid} ({e})"
except (ProcessLookupError, OSError) as e2:
return f"failed: {e}, {e2}"
if job._proc is None: if job._proc is None:
return f"proc is None (status={job.status})" return f"proc is None (status={job.status})"
pid = job._proc.pid pid = job._proc.pid
@@ -116,6 +153,91 @@ class JobManager:
for job in self._jobs.values(): for job in self._jobs.values():
if job._proc is not None: if job._proc is not None:
self._kill_process_group(job._proc) self._kill_process_group(job._proc)
elif job._reconnected and job._pid:
try:
pgid = job._pgid or os.getpgid(job._pid)
os.killpg(pgid, signal.SIGKILL)
except (ProcessLookupError, PermissionError, OSError):
try:
os.kill(job._pid, signal.SIGKILL)
except (ProcessLookupError, OSError):
pass
# ── Job Registry Persistence ─────────────────────────────

def _save_registry(self) -> None:
    """Persist all still-running jobs to REGISTRY_FILE.

    Best-effort: filesystem errors are swallowed so persistence never
    takes down the TUI. The write goes to a temp file followed by an
    atomic os.replace, so a crash mid-write cannot leave a truncated
    or corrupt registry for the next startup to choke on.
    """
    entries = []
    for job in self._jobs.values():
        if job.status != "running":
            continue
        # Prefer the live process handle; fall back to the stored PID
        # (set for jobs reconnected from a previous TUI session).
        pid = job._proc.pid if job._proc is not None else job._pid
        if pid is None:
            continue
        entries.append({
            "id": job.id,
            "kind": job.kind,
            "label": job.label,
            "pid": pid,
            "pgid": job._pgid,
            "started_at": job.started_at.isoformat(),
        })
    try:
        REGISTRY_FILE.parent.mkdir(parents=True, exist_ok=True)
        # Atomic replace: readers never observe a half-written file.
        tmp = REGISTRY_FILE.with_suffix(".json.tmp")
        tmp.write_text(json.dumps(entries, indent=2))
        os.replace(tmp, REGISTRY_FILE)
    except OSError:
        pass  # best-effort; a failed save must not crash the UI
def _load_registry(self) -> None:
    """Load persisted jobs from REGISTRY_FILE and reconnect to live ones.

    Called once at startup. Each entry's PID is probed with signal 0;
    entries whose process has exited (or is not signalable) are dropped.
    Malformed entries — a hand-edited or corrupted registry — are skipped
    individually instead of crashing the TUI at startup.
    """
    if not REGISTRY_FILE.is_file():
        return
    try:
        entries = json.loads(REGISTRY_FILE.read_text())
    except (json.JSONDecodeError, OSError):
        return
    if not isinstance(entries, list):
        return  # corrupt registry; ignore wholesale
    for entry in entries:
        try:
            pid = entry.get("pid")
            if not isinstance(pid, int):
                continue
            # Liveness probe: signal 0 delivers nothing but checks the PID.
            try:
                os.kill(pid, 0)
            except (ProcessLookupError, PermissionError):
                continue
            job_id = entry["id"]
            if job_id in self._jobs:
                continue
            job = Job(
                id=job_id,
                kind=entry.get("kind", "backup"),
                label=entry.get("label", f"Job (PID {pid})"),
                status="running",
                started_at=datetime.fromisoformat(entry["started_at"]),
            )
            job._pid = pid
            job._pgid = entry.get("pgid")
            job._reconnected = True
            self._jobs[job.id] = job
        except (KeyError, TypeError, ValueError, AttributeError):
            continue  # skip this malformed entry, keep the rest
def check_reconnected(self) -> None:
    """Poll reconnected jobs and mark those whose process has exited.

    A reconnected job is not our child process, so we cannot wait() on
    it or learn its exit code; a signal-0 liveness probe is the best we
    can do. Jobs whose PID has vanished are marked finished, and the
    registry is rewritten if anything changed.
    """
    finished_any = False
    candidates = [
        j for j in list(self._jobs.values())
        if j._reconnected and j.status == "running" and j._pid is not None
    ]
    for job in candidates:
        try:
            os.kill(job._pid, 0)  # probe only; no signal is delivered
        except ProcessLookupError:
            # Process is gone — treat it as completed.
            job.status = "success"
            job.finished_at = datetime.now()
            job._reconnected = False
            finished_any = True
        except PermissionError:
            continue  # still alive, just not signalable by us
    if finished_any:
        self._save_registry()
job_manager = JobManager() job_manager = JobManager()

View File

@@ -45,6 +45,7 @@ class RunningTasksScreen(Screen):
return f"{hours}h {m}m" return f"{hours}h {m}m"
def _refresh_table(self) -> None: def _refresh_table(self) -> None:
job_manager.check_reconnected()
table = self.query_one("#rt-table", DataTable) table = self.query_one("#rt-table", DataTable)
# Preserve cursor position # Preserve cursor position
old_row = table.cursor_coordinate.row if table.row_count > 0 else 0 old_row = table.cursor_coordinate.row if table.row_count > 0 else 0