fix: resolve portal startup hangs with non-blocking init (#93)

* fix: resolve portal startup hangs with non-blocking init

- Add socket_connect_timeout/socket_timeout (3s) to Redis connection in
  SwarmComms to prevent infinite hangs when Redis is unreachable
- Defer reconcile_on_startup() from SwarmCoordinator.__init__() to an
  explicit initialize() call during app lifespan, unblocking the
  module-level singleton creation
- Make Ollama health checks non-blocking via asyncio.to_thread() so they
  don't freeze the event loop for 2s per call
- Fix _check_redis() to reuse coordinator's SwarmComms singleton instead
  of creating a new connection on every health check
- Move discord bot platform registration from lifespan critical path
  into background task to avoid heavy import before yield
- Increase Docker healthcheck start_period from 10s/15s to 30s to give
  the app adequate time to complete startup

https://claude.ai/code/session_016t5jNBYsUAQuyoR7sXe7Ux

* fix: disable commit signing in git_tools test fixture

The git_repo fixture inherits global gpgsign config, causing git_commit
to fail when the signing server rejects unsigned source context.
Disable signing in the temp repo's local config.

https://claude.ai/code/session_016t5jNBYsUAQuyoR7sXe7Ux

* fix: add dev extras for pip-based CI install

The CI workflow runs `pip install -e ".[dev]"` but after the Poetry
migration there was no `dev` extra defined — only a Poetry dev group.
This caused pytest to not be installed, resulting in exit code 127
(command not found) on every CI run.

Add a pip-compatible `dev` extra that mirrors the Poetry dev group
so both `pip install -e ".[dev]"` and `poetry install` work.

https://claude.ai/code/session_016t5jNBYsUAQuyoR7sXe7Ux

---------

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Alexander Whitestone
2026-02-28 15:01:48 -05:00
committed by GitHub
parent ca0c42398b
commit 1e19164379
9 changed files with 62 additions and 22 deletions

View File

@@ -69,7 +69,7 @@ ENV PYTHONDONTWRITEBYTECODE=1
EXPOSE 8000
# ── Healthcheck ──────────────────────────────────────────────────────────────
HEALTHCHECK --interval=30s --timeout=5s --start-period=15s --retries=3 \
HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \
CMD curl -f http://localhost:8000/health || exit 1
# ── Default: run the dashboard ───────────────────────────────────────────────

View File

@@ -52,7 +52,7 @@ services:
interval: 30s
timeout: 5s
retries: 3
start_period: 10s
start_period: 30s
# ── Timmy — sovereign AI agent (separate container) ───────────────────────
timmy:

View File

@@ -49,6 +49,11 @@ python-telegram-bot = { version = ">=21.0", optional = true }
"discord.py" = { version = ">=2.3.0", optional = true }
airllm = { version = ">=2.9.0", optional = true }
pyttsx3 = { version = ">=2.90", optional = true }
pytest = { version = ">=8.0.0", optional = true }
pytest-asyncio = { version = ">=0.24.0", optional = true }
pytest-cov = { version = ">=5.0.0", optional = true }
pytest-timeout = { version = ">=2.3.0", optional = true }
selenium = { version = ">=4.20.0", optional = true }
[tool.poetry.extras]
swarm = ["redis"]
@@ -56,6 +61,7 @@ telegram = ["python-telegram-bot"]
discord = ["discord.py"]
bigbrain = ["airllm"]
voice = ["pyttsx3"]
dev = ["pytest", "pytest-asyncio", "pytest-cov", "pytest-timeout", "selenium"]
[tool.poetry.group.dev.dependencies]
pytest = ">=8.0.0"

View File

@@ -355,9 +355,13 @@ async def _start_chat_integrations_background() -> None:
"""Background task: start chat integrations without blocking startup."""
from integrations.telegram_bot.bot import telegram_bot
from integrations.chat_bridge.vendors.discord import discord_bot
from integrations.chat_bridge.registry import platform_registry
await asyncio.sleep(0.5)
# Register Discord in the platform registry
platform_registry.register(discord_bot)
if settings.telegram_token:
try:
await telegram_bot.start()
@@ -392,8 +396,9 @@ async def lifespan(app: FastAPI):
agent_id="timmy",
)
# Log swarm recovery summary
# Run swarm recovery and log summary
from swarm.coordinator import coordinator as swarm_coordinator
swarm_coordinator.initialize()
rec = swarm_coordinator._recovery_summary
if rec["tasks_failed"] or rec["agents_offlined"]:
logger.info(
@@ -442,17 +447,13 @@ async def lifespan(app: FastAPI):
# Start chat integrations in background
chat_task = asyncio.create_task(_start_chat_integrations_background())
# Register Discord bot
from integrations.chat_bridge.registry import platform_registry
from integrations.chat_bridge.vendors.discord import discord_bot
platform_registry.register(discord_bot)
logger.info("✓ Timmy Time dashboard ready for requests")
yield
# Cleanup on shutdown
from integrations.telegram_bot.bot import telegram_bot
from integrations.chat_bridge.vendors.discord import discord_bot
await discord_bot.stop()
await telegram_bot.stop()

View File

@@ -4,6 +4,7 @@ Provides system health checks and sovereignty audit information
for the Mission Control dashboard.
"""
import asyncio
import logging
import os
from datetime import datetime, timezone
@@ -23,8 +24,8 @@ router = APIRouter(tags=["health"])
# Legacy health check for backward compatibility
async def check_ollama() -> bool:
"""Legacy helper to check Ollama status."""
def _check_ollama_sync() -> bool:
"""Synchronous Ollama check — run via asyncio.to_thread()."""
try:
import urllib.request
url = settings.ollama_url.replace("localhost", "127.0.0.1")
@@ -39,6 +40,14 @@ async def check_ollama() -> bool:
return False
async def check_ollama() -> bool:
"""Check Ollama status without blocking the event loop."""
try:
return await asyncio.to_thread(_check_ollama_sync)
except Exception:
return False
class DependencyStatus(BaseModel):
"""Status of a single dependency."""
name: str
@@ -67,8 +76,8 @@ class HealthStatus(BaseModel):
_START_TIME = datetime.now(timezone.utc)
def _check_ollama() -> DependencyStatus:
"""Check Ollama AI backend status."""
def _check_ollama_status_sync() -> DependencyStatus:
"""Synchronous Ollama status check — run via asyncio.to_thread()."""
try:
import urllib.request
url = settings.ollama_url.replace("localhost", "127.0.0.1")
@@ -99,11 +108,24 @@ def _check_ollama() -> DependencyStatus:
)
async def _check_ollama() -> DependencyStatus:
"""Check Ollama AI backend status without blocking the event loop."""
try:
return await asyncio.to_thread(_check_ollama_status_sync)
except Exception:
return DependencyStatus(
name="Ollama AI",
status="unavailable",
sovereignty_score=10,
details={"url": settings.ollama_url, "error": "Cannot connect to Ollama"},
)
def _check_redis() -> DependencyStatus:
"""Check Redis cache status."""
try:
from swarm.comms import SwarmComms
comms = SwarmComms()
from swarm.coordinator import coordinator
comms = coordinator.comms
# Check if we're using fallback
if hasattr(comms, '_redis') and comms._redis is not None:
return DependencyStatus(
@@ -280,7 +302,7 @@ async def sovereignty_check():
Use this to verify the system is operating in a sovereign manner.
"""
dependencies = [
_check_ollama(),
await _check_ollama(),
_check_redis(),
_check_lightning(),
_check_sqlite(),

View File

@@ -56,7 +56,11 @@ class SwarmComms:
def _try_connect(self) -> None:
try:
import redis
self._redis = redis.from_url(self._redis_url)
self._redis = redis.from_url(
self._redis_url,
socket_connect_timeout=3,
socket_timeout=3,
)
self._redis.ping()
self._pubsub = self._redis.pubsub()
self._connected = True

View File

@@ -53,6 +53,10 @@ class SwarmCoordinator:
self.auctions = AuctionManager()
self.comms = SwarmComms()
self._in_process_nodes: list = []
self._recovery_summary = {"tasks_failed": 0, "agents_offlined": 0}
def initialize(self) -> None:
"""Run startup recovery. Call during app lifespan, not at import time."""
self._recovery_summary = reconcile_on_startup()
# ── Agent lifecycle ─────────────────────────────────────────────────────

View File

@@ -28,11 +28,12 @@ def git_repo(tmp_path):
result = git_init(tmp_path)
assert result["success"]
# Configure git identity for commits
# Configure git identity and disable signing for commits
from git import Repo
repo = Repo(str(tmp_path))
repo.config_writer().set_value("user", "name", "Test").release()
repo.config_writer().set_value("user", "email", "test@test.com").release()
repo.config_writer().set_value("commit", "gpgsign", "false").release()
# Create initial commit
readme = tmp_path / "README.md"

View File

@@ -156,9 +156,10 @@ def test_reconcile_counts_multiple_stale_agents():
# ── Coordinator integration ───────────────────────────────────────────────────
def test_coordinator_runs_recovery_on_init():
"""Coordinator.__init__ calls reconcile; _recovery_summary must be present."""
"""Coordinator.initialize() populates _recovery_summary."""
from swarm.coordinator import SwarmCoordinator
coord = SwarmCoordinator()
coord.initialize()
assert hasattr(coord, "_recovery_summary")
assert "tasks_failed" in coord._recovery_summary
assert "agents_offlined" in coord._recovery_summary
@@ -166,7 +167,7 @@ def test_coordinator_runs_recovery_on_init():
def test_coordinator_recovery_cleans_stale_task():
"""End-to-end: task left in BIDDING is cleaned up by a fresh coordinator."""
"""End-to-end: task left in BIDDING is cleaned up after initialize()."""
from swarm.tasks import create_task, get_task, update_task, TaskStatus
from swarm.coordinator import SwarmCoordinator
@@ -174,6 +175,7 @@ def test_coordinator_recovery_cleans_stale_task():
update_task(task.id, status=TaskStatus.BIDDING)
coord = SwarmCoordinator()
coord.initialize()
assert get_task(task.id).status == TaskStatus.FAILED
assert coord._recovery_summary["tasks_failed"] >= 1
coord.manager.stop_all()