fix: resolve portal startup hangs with non-blocking init (#93)
* fix: resolve portal startup hangs with non-blocking init - Add socket_connect_timeout/socket_timeout (3s) to Redis connection in SwarmComms to prevent infinite hangs when Redis is unreachable - Defer reconcile_on_startup() from SwarmCoordinator.__init__() to an explicit initialize() call during app lifespan, unblocking the module-level singleton creation - Make Ollama health checks non-blocking via asyncio.to_thread() so they don't freeze the event loop for 2s per call - Fix _check_redis() to reuse coordinator's SwarmComms singleton instead of creating a new connection on every health check - Move discord bot platform registration from lifespan critical path into background task to avoid heavy import before yield - Increase Docker healthcheck start_period from 10s/15s to 30s to give the app adequate time to complete startup https://claude.ai/code/session_016t5jNBYsUAQuyoR7sXe7Ux * fix: disable commit signing in git_tools test fixture The git_repo fixture inherits global gpgsign config, causing git_commit to fail when the signing server rejects unsigned source context. Disable signing in the temp repo's local config. https://claude.ai/code/session_016t5jNBYsUAQuyoR7sXe7Ux * fix: add dev extras for pip-based CI install The CI workflow runs `pip install -e ".[dev]"` but after the Poetry migration there was no `dev` extra defined — only a Poetry dev group. This caused pytest to not be installed, resulting in exit code 127 (command not found) on every CI run. Add a pip-compatible `dev` extra that mirrors the Poetry dev group so both `pip install -e ".[dev]"` and `poetry install` work. https://claude.ai/code/session_016t5jNBYsUAQuyoR7sXe7Ux --------- Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
committed by
GitHub
parent
ca0c42398b
commit
1e19164379
@@ -69,7 +69,7 @@ ENV PYTHONDONTWRITEBYTECODE=1
|
||||
EXPOSE 8000
|
||||
|
||||
# ── Healthcheck ──────────────────────────────────────────────────────────────
|
||||
HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \
|
||||
HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
|
||||
CMD curl -f http://localhost:8000/health || exit 1
|
||||
|
||||
# ── Default: run the dashboard ───────────────────────────────────────────────
|
||||
|
||||
@@ -52,7 +52,7 @@ services:
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 10s
|
||||
start_period: 30s
|
||||
|
||||
# ── Timmy — sovereign AI agent (separate container) ───────────────────────
|
||||
timmy:
|
||||
|
||||
@@ -49,6 +49,11 @@ python-telegram-bot = { version = ">=21.0", optional = true }
|
||||
"discord.py" = { version = ">=2.3.0", optional = true }
|
||||
airllm = { version = ">=2.9.0", optional = true }
|
||||
pyttsx3 = { version = ">=2.90", optional = true }
|
||||
pytest = { version = ">=8.0.0", optional = true }
|
||||
pytest-asyncio = { version = ">=0.24.0", optional = true }
|
||||
pytest-cov = { version = ">=5.0.0", optional = true }
|
||||
pytest-timeout = { version = ">=2.3.0", optional = true }
|
||||
selenium = { version = ">=4.20.0", optional = true }
|
||||
|
||||
[tool.poetry.extras]
|
||||
swarm = ["redis"]
|
||||
@@ -56,6 +61,7 @@ telegram = ["python-telegram-bot"]
|
||||
discord = ["discord.py"]
|
||||
bigbrain = ["airllm"]
|
||||
voice = ["pyttsx3"]
|
||||
dev = ["pytest", "pytest-asyncio", "pytest-cov", "pytest-timeout", "selenium"]
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
pytest = ">=8.0.0"
|
||||
|
||||
@@ -355,8 +355,12 @@ async def _start_chat_integrations_background() -> None:
|
||||
"""Background task: start chat integrations without blocking startup."""
|
||||
from integrations.telegram_bot.bot import telegram_bot
|
||||
from integrations.chat_bridge.vendors.discord import discord_bot
|
||||
|
||||
from integrations.chat_bridge.registry import platform_registry
|
||||
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
# Register Discord in the platform registry
|
||||
platform_registry.register(discord_bot)
|
||||
|
||||
if settings.telegram_token:
|
||||
try:
|
||||
@@ -392,8 +396,9 @@ async def lifespan(app: FastAPI):
|
||||
agent_id="timmy",
|
||||
)
|
||||
|
||||
# Log swarm recovery summary
|
||||
# Run swarm recovery and log summary
|
||||
from swarm.coordinator import coordinator as swarm_coordinator
|
||||
swarm_coordinator.initialize()
|
||||
rec = swarm_coordinator._recovery_summary
|
||||
if rec["tasks_failed"] or rec["agents_offlined"]:
|
||||
logger.info(
|
||||
@@ -442,18 +447,14 @@ async def lifespan(app: FastAPI):
|
||||
# Start chat integrations in background
|
||||
chat_task = asyncio.create_task(_start_chat_integrations_background())
|
||||
|
||||
# Register Discord bot
|
||||
from integrations.chat_bridge.registry import platform_registry
|
||||
from integrations.chat_bridge.vendors.discord import discord_bot
|
||||
platform_registry.register(discord_bot)
|
||||
|
||||
logger.info("✓ Timmy Time dashboard ready for requests")
|
||||
|
||||
yield
|
||||
|
||||
# Cleanup on shutdown
|
||||
from integrations.telegram_bot.bot import telegram_bot
|
||||
|
||||
from integrations.chat_bridge.vendors.discord import discord_bot
|
||||
|
||||
await discord_bot.stop()
|
||||
await telegram_bot.stop()
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ Provides system health checks and sovereignty audit information
|
||||
for the Mission Control dashboard.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
from datetime import datetime, timezone
|
||||
@@ -23,8 +24,8 @@ router = APIRouter(tags=["health"])
|
||||
|
||||
|
||||
# Legacy health check for backward compatibility
|
||||
async def check_ollama() -> bool:
|
||||
"""Legacy helper to check Ollama status."""
|
||||
def _check_ollama_sync() -> bool:
|
||||
"""Synchronous Ollama check — run via asyncio.to_thread()."""
|
||||
try:
|
||||
import urllib.request
|
||||
url = settings.ollama_url.replace("localhost", "127.0.0.1")
|
||||
@@ -39,6 +40,14 @@ async def check_ollama() -> bool:
|
||||
return False
|
||||
|
||||
|
||||
async def check_ollama() -> bool:
|
||||
"""Check Ollama status without blocking the event loop."""
|
||||
try:
|
||||
return await asyncio.to_thread(_check_ollama_sync)
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
class DependencyStatus(BaseModel):
|
||||
"""Status of a single dependency."""
|
||||
name: str
|
||||
@@ -67,8 +76,8 @@ class HealthStatus(BaseModel):
|
||||
_START_TIME = datetime.now(timezone.utc)
|
||||
|
||||
|
||||
def _check_ollama() -> DependencyStatus:
|
||||
"""Check Ollama AI backend status."""
|
||||
def _check_ollama_status_sync() -> DependencyStatus:
|
||||
"""Synchronous Ollama status check — run via asyncio.to_thread()."""
|
||||
try:
|
||||
import urllib.request
|
||||
url = settings.ollama_url.replace("localhost", "127.0.0.1")
|
||||
@@ -90,7 +99,7 @@ def _check_ollama() -> DependencyStatus:
|
||||
pass
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
return DependencyStatus(
|
||||
name="Ollama AI",
|
||||
status="unavailable",
|
||||
@@ -99,11 +108,24 @@ def _check_ollama() -> DependencyStatus:
|
||||
)
|
||||
|
||||
|
||||
async def _check_ollama() -> DependencyStatus:
|
||||
"""Check Ollama AI backend status without blocking the event loop."""
|
||||
try:
|
||||
return await asyncio.to_thread(_check_ollama_status_sync)
|
||||
except Exception:
|
||||
return DependencyStatus(
|
||||
name="Ollama AI",
|
||||
status="unavailable",
|
||||
sovereignty_score=10,
|
||||
details={"url": settings.ollama_url, "error": "Cannot connect to Ollama"},
|
||||
)
|
||||
|
||||
|
||||
def _check_redis() -> DependencyStatus:
|
||||
"""Check Redis cache status."""
|
||||
try:
|
||||
from swarm.comms import SwarmComms
|
||||
comms = SwarmComms()
|
||||
from swarm.coordinator import coordinator
|
||||
comms = coordinator.comms
|
||||
# Check if we're using fallback
|
||||
if hasattr(comms, '_redis') and comms._redis is not None:
|
||||
return DependencyStatus(
|
||||
@@ -280,7 +302,7 @@ async def sovereignty_check():
|
||||
Use this to verify the system is operating in a sovereign manner.
|
||||
"""
|
||||
dependencies = [
|
||||
_check_ollama(),
|
||||
await _check_ollama(),
|
||||
_check_redis(),
|
||||
_check_lightning(),
|
||||
_check_sqlite(),
|
||||
|
||||
@@ -56,7 +56,11 @@ class SwarmComms:
|
||||
def _try_connect(self) -> None:
|
||||
try:
|
||||
import redis
|
||||
self._redis = redis.from_url(self._redis_url)
|
||||
self._redis = redis.from_url(
|
||||
self._redis_url,
|
||||
socket_connect_timeout=3,
|
||||
socket_timeout=3,
|
||||
)
|
||||
self._redis.ping()
|
||||
self._pubsub = self._redis.pubsub()
|
||||
self._connected = True
|
||||
|
||||
@@ -53,6 +53,10 @@ class SwarmCoordinator:
|
||||
self.auctions = AuctionManager()
|
||||
self.comms = SwarmComms()
|
||||
self._in_process_nodes: list = []
|
||||
self._recovery_summary = {"tasks_failed": 0, "agents_offlined": 0}
|
||||
|
||||
def initialize(self) -> None:
|
||||
"""Run startup recovery. Call during app lifespan, not at import time."""
|
||||
self._recovery_summary = reconcile_on_startup()
|
||||
|
||||
# ── Agent lifecycle ─────────────────────────────────────────────────────
|
||||
|
||||
@@ -28,11 +28,12 @@ def git_repo(tmp_path):
|
||||
result = git_init(tmp_path)
|
||||
assert result["success"]
|
||||
|
||||
# Configure git identity for commits
|
||||
# Configure git identity and disable signing for commits
|
||||
from git import Repo
|
||||
repo = Repo(str(tmp_path))
|
||||
repo.config_writer().set_value("user", "name", "Test").release()
|
||||
repo.config_writer().set_value("user", "email", "test@test.com").release()
|
||||
repo.config_writer().set_value("commit", "gpgsign", "false").release()
|
||||
|
||||
# Create initial commit
|
||||
readme = tmp_path / "README.md"
|
||||
|
||||
@@ -156,9 +156,10 @@ def test_reconcile_counts_multiple_stale_agents():
|
||||
# ── Coordinator integration ───────────────────────────────────────────────────
|
||||
|
||||
def test_coordinator_runs_recovery_on_init():
|
||||
"""Coordinator.__init__ calls reconcile; _recovery_summary must be present."""
|
||||
"""Coordinator.initialize() populates _recovery_summary."""
|
||||
from swarm.coordinator import SwarmCoordinator
|
||||
coord = SwarmCoordinator()
|
||||
coord.initialize()
|
||||
assert hasattr(coord, "_recovery_summary")
|
||||
assert "tasks_failed" in coord._recovery_summary
|
||||
assert "agents_offlined" in coord._recovery_summary
|
||||
@@ -166,7 +167,7 @@ def test_coordinator_runs_recovery_on_init():
|
||||
|
||||
|
||||
def test_coordinator_recovery_cleans_stale_task():
|
||||
"""End-to-end: task left in BIDDING is cleaned up by a fresh coordinator."""
|
||||
"""End-to-end: task left in BIDDING is cleaned up after initialize()."""
|
||||
from swarm.tasks import create_task, get_task, update_task, TaskStatus
|
||||
from swarm.coordinator import SwarmCoordinator
|
||||
|
||||
@@ -174,6 +175,7 @@ def test_coordinator_recovery_cleans_stale_task():
|
||||
update_task(task.id, status=TaskStatus.BIDDING)
|
||||
|
||||
coord = SwarmCoordinator()
|
||||
coord.initialize()
|
||||
assert get_task(task.id).status == TaskStatus.FAILED
|
||||
assert coord._recovery_summary["tasks_failed"] >= 1
|
||||
coord.manager.stop_all()
|
||||
|
||||
Reference in New Issue
Block a user