feat: sovereign voice loop — timmy voice command

Adds a fully local listen-think-speak voice interface. STT: Whisper, LLM: Ollama, TTS: Piper. No cloud, no network.

- src/timmy/voice_loop.py: VoiceLoop with VAD, Whisper, Piper
- src/timmy/cli.py: new voice command
- pyproject.toml: voice extras updated
- 20 new tests
2026-03-14 13:58:56 -04:00
parent d770d66150
commit dbadfc425d
4 changed files with 696 additions and 1 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -43,6 +43,9 @@ python-telegram-bot = { version = ">=21.0", optional = true }
 "discord.py" = { version = ">=2.3.0", optional = true }
 airllm = { version = ">=2.9.0", optional = true }
 pyttsx3 = { version = ">=2.90", optional = true }
+openai-whisper = { version = ">=20231117", optional = true }
+piper-tts = { version = ">=1.2.0", optional = true }
+sounddevice = { version = ">=0.4.6", optional = true }
 sentence-transformers = { version = ">=2.0.0", optional = true }
 numpy = { version = ">=1.24.0", optional = true }
 requests = { version = ">=2.31.0", optional = true }
@@ -59,7 +62,7 @@ pytest-xdist = { version = ">=3.5.0", optional = true }
 telegram = ["python-telegram-bot"]
 discord = ["discord.py"]
 bigbrain = ["airllm"]
-voice = ["pyttsx3"]
+voice = ["pyttsx3", "openai-whisper", "piper-tts", "sounddevice"]
 celery = ["celery"]
 embeddings = ["sentence-transformers", "numpy"]
 git = ["GitPython"]
--- a/src/timmy/cli.py
+++ b/src/timmy/cli.py
@@ -248,5 +248,37 @@ def down():
    subprocess.run(["docker", "compose", "down"], check=True)


+@app.command()
+def voice(
+    whisper_model: str = typer.Option(
+        "base.en", "--whisper", "-w", help="Whisper model: tiny.en, base.en, small.en, medium.en"
+    ),
+    use_say: bool = typer.Option(False, "--say", help="Use macOS `say` instead of Piper TTS"),
+    threshold: float = typer.Option(
+        0.015, "--threshold", "-t", help="Mic silence threshold (RMS). Lower = more sensitive."
+    ),
+    silence: float = typer.Option(1.5, "--silence", help="Seconds of silence to end recording"),
+    backend: str | None = _BACKEND_OPTION,
+    model_size: str | None = _MODEL_SIZE_OPTION,
+):
+    """Start the sovereign voice loop — listen, think, speak.
+
+    Everything runs locally: Whisper for STT, Ollama for LLM, Piper for TTS.
+    No cloud, no network calls, no microphone data leaves your machine.
+    """
+    # Function-scope import: presumably keeps the rest of the CLI importable
+    # when the optional voice extras are not installed — confirm.
+    from timmy.voice_loop import VoiceConfig, VoiceLoop
+
+    # Map CLI option names onto the VoiceConfig field names.
+    # NOTE(review): backend and model_size are stored on the config, but nothing
+    # in the visible voice_loop.py reads them back — verify they take effect.
+    config = VoiceConfig(
+        whisper_model=whisper_model,
+        use_say_fallback=use_say,
+        silence_threshold=threshold,
+        silence_duration=silence,
+        backend=backend,
+        model_size=model_size,
+    )
+    loop = VoiceLoop(config=config)
+    # Blocks until the loop exits (Ctrl-C or a spoken exit command).
+    loop.run()
+
 def main():
    app()
--- a/src/timmy/voice_loop.py
+++ b/src/timmy/voice_loop.py
@@ -0,0 +1,387 @@
+"""Sovereign voice loop — listen, think, speak.
+
+A fully local voice interface for Timmy. No cloud, no network calls.
+All processing happens on the user's machine:
+
+    Mic → VAD/silence detection → Whisper (local STT) → Timmy chat → Piper TTS → Speaker
+
+Usage:
+    from timmy.voice_loop import VoiceLoop
+    loop = VoiceLoop()
+    loop.run()  # blocks, Ctrl-C to stop
+
+Requires: sounddevice, numpy, whisper, piper-tts
+"""
+
+import asyncio
+import logging
+import subprocess
+import sys
+import tempfile
+import time
+from dataclasses import dataclass
+from pathlib import Path
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+# ── Defaults ────────────────────────────────────────────────────────────────
+
+DEFAULT_WHISPER_MODEL = "base.en"
+DEFAULT_PIPER_VOICE = Path.home() / ".local/share/piper-voices/en_US-lessac-medium.onnx"
+DEFAULT_SAMPLE_RATE = 16000  # Whisper expects 16 kHz
+DEFAULT_CHANNELS = 1
+DEFAULT_SILENCE_THRESHOLD = 0.015  # RMS threshold — tune for your mic/room
+DEFAULT_SILENCE_DURATION = 1.5  # seconds of silence to end utterance
+DEFAULT_MIN_UTTERANCE = 0.5  # ignore clicks/bumps shorter than this
+DEFAULT_MAX_UTTERANCE = 30.0  # safety cap — don't record forever
+DEFAULT_SESSION_ID = "voice"
+
+
+@dataclass
+class VoiceConfig:
+    """Configuration for the voice loop.
+
+    Every field has a default, so ``VoiceConfig()`` is usable as-is. Durations
+    are in seconds; the threshold is compared against per-block RMS.
+    """
+
+    # Model name handed to ``whisper.load_model``
+    whisper_model: str = DEFAULT_WHISPER_MODEL
+    # Piper ONNX voice file; when it does not exist the loop switches to `say`
+    piper_voice: Path = DEFAULT_PIPER_VOICE
+    # Microphone sample rate in Hz
+    sample_rate: int = DEFAULT_SAMPLE_RATE
+    # Block RMS above this starts recording; below it counts as silence
+    silence_threshold: float = DEFAULT_SILENCE_THRESHOLD
+    # Seconds of continuous silence that end an utterance
+    silence_duration: float = DEFAULT_SILENCE_DURATION
+    # Recordings shorter than this many seconds are discarded
+    min_utterance: float = DEFAULT_MIN_UTTERANCE
+    # Hard cap on a single recording, in seconds
+    max_utterance: float = DEFAULT_MAX_UTTERANCE
+    # Session id passed to ``timmy.session.chat``
+    session_id: str = DEFAULT_SESSION_ID
+    # Set True to use macOS `say` instead of Piper
+    use_say_fallback: bool = False
+    # Piper speaking rate (default 1.0, lower = slower)
+    # NOTE(review): not read by any code visible in this module — confirm it is wired up.
+    speaking_rate: float = 1.0
+    # Backend/model for Timmy inference
+    # NOTE(review): not read by any code visible in this module — confirm they are wired up.
+    backend: str | None = None
+    model_size: str | None = None
+
+
+class VoiceLoop:
+    """Sovereign listen-think-speak loop.
+
+    Everything runs locally:
+    - STT: OpenAI Whisper (local model, no API)
+    - LLM: Timmy via Ollama (local inference)
+    - TTS: Piper (local ONNX model) or macOS `say`
+    """
+
+    def __init__(self, config: VoiceConfig | None = None) -> None:
+        """Create a loop in the stopped state.
+
+        Args:
+            config: Settings to use; a default ``VoiceConfig()`` is built when omitted.
+        """
+        self.config = config if config else VoiceConfig()
+        # Filled in by _load_whisper() on first transcription.
+        self._whisper_model = None
+        # Runtime flags: loop active / TTS currently playing / cut playback short.
+        self._running = False
+        self._speaking = False
+        self._interrupted = False
+
+    # ── Lazy initialization ─────────────────────────────────────────────
+
+    def _load_whisper(self):
+        """Populate ``self._whisper_model`` on first use; later calls do nothing."""
+        if self._whisper_model is None:
+            # Imported here so the module loads without whisper installed.
+            import whisper
+
+            model_name = self.config.whisper_model
+            logger.info("Loading Whisper model: %s", model_name)
+            self._whisper_model = whisper.load_model(model_name)
+            logger.info("Whisper model loaded.")
+
+    def _ensure_piper(self) -> bool:
+        """Switch the config to the `say` fallback when the Piper voice is missing.
+
+        Nothing is checked when the fallback is already selected.
+
+        Returns:
+            Always True; a missing voice file is handled by flipping
+            ``config.use_say_fallback`` rather than by failing.
+        """
+        cfg = self.config
+        if not cfg.use_say_fallback:
+            model_file = cfg.piper_voice
+            if not model_file.exists():
+                logger.warning("Piper voice not found at %s — falling back to `say`", model_file)
+                cfg.use_say_fallback = True
+        return True
+
+    # ── STT: Microphone → Text ──────────────────────────────────────────
+
+    def _record_utterance(self) -> np.ndarray | None:
+        """Record one utterance from the microphone.
+
+        Uses energy-based Voice Activity Detection on 100 ms blocks:
+        1. Wait for speech (block RMS above ``silence_threshold``)
+        2. Record until ``silence_duration`` seconds of consecutive blocks fall
+           below the threshold, or ``max_utterance`` seconds have been captured
+        3. Return the audio as a flat float32 numpy array (trailing silence included)
+
+        Returns:
+            The captured samples, or None when the loop was stopped before any
+            speech started, or when the voiced part of the recording (first loud
+            block through last loud block) is shorter than ``min_utterance``.
+        """
+        import sounddevice as sd
+
+        block_seconds = 0.1  # 100ms blocks
+        sr = self.config.sample_rate
+        block_size = int(sr * block_seconds)
+        # round(), not int(): 0.3 / 0.1 is 2.9999999999999996 and would
+        # truncate to 2 blocks instead of the intended 3.
+        silence_blocks = round(self.config.silence_duration / block_seconds)
+        min_blocks = round(self.config.min_utterance / block_seconds)
+        max_blocks = round(self.config.max_utterance / block_seconds)
+
+        audio_chunks: list[np.ndarray] = []
+        silent_count = 0  # consecutive below-threshold blocks at the tail
+        recording = False
+
+        def _rms(block: np.ndarray) -> float:
+            return float(np.sqrt(np.mean(block.astype(np.float32) ** 2)))
+
+        sys.stdout.write("\n  🎤 Listening... (speak now)\n")
+        sys.stdout.flush()
+
+        with sd.InputStream(
+            samplerate=sr,
+            channels=DEFAULT_CHANNELS,
+            dtype="float32",
+            blocksize=block_size,
+        ) as stream:
+            while self._running:
+                block, overflowed = stream.read(block_size)
+                if overflowed:
+                    logger.debug("Audio buffer overflowed")
+
+                rms = _rms(block)
+
+                if not recording:
+                    # Idle: discard blocks until one is loud enough to be speech.
+                    if rms > self.config.silence_threshold:
+                        recording = True
+                        silent_count = 0
+                        audio_chunks.append(block.copy())
+                        sys.stdout.write("  📢 Recording...\r")
+                        sys.stdout.flush()
+                else:
+                    audio_chunks.append(block.copy())
+
+                    if rms < self.config.silence_threshold:
+                        silent_count += 1
+                    else:
+                        silent_count = 0
+
+                    # End of utterance
+                    if silent_count >= silence_blocks:
+                        break
+
+                    # Safety cap
+                    if len(audio_chunks) >= max_blocks:
+                        logger.info("Max utterance length reached, stopping.")
+                        break
+
+        # Trailing silence is not speech. Counting it would let a single click
+        # followed by the silence window always pass the minimum-length check.
+        voiced_blocks = len(audio_chunks) - silent_count
+        if not audio_chunks or voiced_blocks < min_blocks:
+            return None
+
+        audio = np.concatenate(audio_chunks, axis=0).flatten()
+        duration = len(audio) / sr
+        sys.stdout.write(f"  ✂️  Captured {duration:.1f}s of audio\n")
+        sys.stdout.flush()
+        return audio
+
+    def _transcribe(self, audio: np.ndarray) -> str:
+        """Run the local Whisper model over *audio* and return the stripped text.
+
+        The model is loaded on first call. The elapsed time and the first 80
+        characters of the transcript are logged at INFO level.
+        """
+        self._load_whisper()
+
+        sys.stdout.write("  🧠 Transcribing...\r")
+        sys.stdout.flush()
+
+        # fp16=False: MPS/CPU — fp16 can cause issues on some setups
+        whisper_options = {"language": "en", "fp16": False}
+
+        started = time.monotonic()
+        output = self._whisper_model.transcribe(audio, **whisper_options)
+        took = time.monotonic() - started
+
+        transcript = output["text"].strip()
+        logger.info("Whisper transcribed in %.1fs: '%s'", took, transcript[:80])
+        return transcript
+
+    # ── TTS: Text → Speaker ─────────────────────────────────────────────
+
+    def _speak(self, text: str) -> None:
+        """Voice *text* through `say` or Piper, per ``config.use_say_fallback``.
+
+        Empty text is ignored. ``self._speaking`` is True for the duration of
+        the call and is reset even if the backend raises.
+        """
+        if text:
+            self._speaking = True
+            try:
+                backend = self._speak_say if self.config.use_say_fallback else self._speak_piper
+                backend(text)
+            finally:
+                self._speaking = False
+
+    def _speak_piper(self, text: str) -> None:
+        """Synthesize *text* with the ``piper`` CLI and play the resulting WAV.
+
+        Piper writes to a temporary ``.wav`` file that is always deleted
+        afterwards. If Piper cannot be started, times out after 30 s, or exits
+        non-zero, the text is spoken with ``_speak_say`` instead, so a broken
+        Piper install never aborts the voice loop.
+        """
+        # Only the file name is needed; Piper reopens the path itself.
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            tmp_path = tmp.name
+
+        try:
+            # Generate WAV with Piper
+            cmd = [
+                "piper",
+                "--model",
+                str(self.config.piper_voice),
+                "--output_file",
+                tmp_path,
+            ]
+
+            try:
+                proc = subprocess.run(
+                    cmd,
+                    input=text,
+                    capture_output=True,
+                    text=True,
+                    timeout=30,
+                )
+            except (OSError, subprocess.TimeoutExpired) as exc:
+                # Missing or non-executable binary, or a hung synthesis.
+                logger.error("Piper could not be run: %s", exc)
+                self._speak_say(text)  # fallback
+                return
+
+            if proc.returncode != 0:
+                logger.error("Piper failed: %s", proc.stderr)
+                self._speak_say(text)  # fallback
+                return
+
+            # Play with afplay (macOS) — interruptible
+            self._play_audio(tmp_path)
+
+        finally:
+            Path(tmp_path).unlink(missing_ok=True)
+
+    def _speak_say(self, text: str) -> None:
+        """Speak *text* with the macOS `say` command at 180 words per minute.
+
+        Waits up to 60 s for `say` to finish. On timeout the process is killed
+        and then reaped so no zombie is left behind. A missing `say` binary is
+        logged and otherwise ignored.
+        """
+        try:
+            proc = subprocess.Popen(
+                ["say", "-r", "180", text],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+            )
+        except FileNotFoundError:
+            logger.error("macOS `say` command not found")
+            return
+
+        try:
+            proc.wait(timeout=60)
+        except subprocess.TimeoutExpired:
+            proc.kill()
+            # kill() only sends the signal; wait() collects the exit status.
+            proc.wait()
+
+    def _play_audio(self, path: str) -> None:
+        """Play the WAV file at *path*, stopping early if ``self._interrupted`` is set.
+
+        ``afplay`` is tried first and polled every 50 ms so the interrupt flag
+        can cut playback short. When interrupted, the player is terminated and
+        reaped, and the flag is cleared. If ``afplay`` is not installed,
+        ``aplay`` is run instead (blocking, 60 s limit, not interruptible).
+        """
+        try:
+            proc = subprocess.Popen(
+                ["afplay", path],
+                stdout=subprocess.DEVNULL,
+                stderr=subprocess.DEVNULL,
+            )
+        except FileNotFoundError:
+            # Not macOS — try aplay (Linux)
+            try:
+                subprocess.run(["aplay", path], capture_output=True, timeout=60)
+            except FileNotFoundError:
+                logger.error("No audio player found (tried afplay, aplay)")
+            except subprocess.TimeoutExpired:
+                logger.error("aplay timed out while playing %s", path)
+            return
+
+        # Poll so we can interrupt
+        while proc.poll() is None:
+            if self._interrupted:
+                proc.terminate()
+                # Reap the player; escalate to kill() if it ignores SIGTERM.
+                try:
+                    proc.wait(timeout=1)
+                except subprocess.TimeoutExpired:
+                    proc.kill()
+                    proc.wait()
+                self._interrupted = False
+                logger.info("TTS interrupted by user")
+                return
+            time.sleep(0.05)
+
+    # ── LLM: Text → Response ───────────────────────────────────────────
+
+    def _think(self, user_text: str) -> str:
+        """Return Timmy's reply to *user_text*, or a canned apology on failure.
+
+        The async ``_chat`` call is driven with ``asyncio.run``. Any exception
+        from it is logged and replaced by a spoken-friendly fallback sentence,
+        so this method itself does not raise for chat errors.
+        """
+        sys.stdout.write("  💭 Thinking...\r")
+        sys.stdout.flush()
+
+        started = time.monotonic()
+        try:
+            reply = asyncio.run(self._chat(user_text))
+        except Exception as exc:
+            logger.error("Timmy chat failed: %s", exc)
+            reply = "I'm having trouble thinking right now. Could you try again?"
+
+        logger.info("Timmy responded in %.1fs", time.monotonic() - started)
+        return reply
+
+    async def _chat(self, message: str) -> str:
+        """Await ``timmy.session.chat`` for *message* under the configured session id.
+
+        NOTE(review): ``config.backend`` and ``config.model_size`` are not
+        forwarded here — confirm whether ``chat`` should receive them.
+        """
+        # Function-scope import, resolved on each call.
+        from timmy.session import chat as session_chat
+
+        reply = await session_chat(message, session_id=self.config.session_id)
+        return reply
+
+    # ── Main Loop ───────────────────────────────────────────────────────
+
+    def run(self) -> None:
+        """Run the listen → transcribe → think → speak cycle.
+
+        Blocks until Ctrl-C, a spoken exit command, or ``stop()`` clears the
+        running flag. ``self._running`` is always False when this returns.
+        """
+        self._ensure_piper()
+
+        rule = "=" * 60
+        if self.config.use_say_fallback:
+            tts_label = "macOS say"
+        else:
+            tts_label = f"Piper ({self.config.piper_voice.name})"
+        print(
+            f"\n{rule}\n"
+            f"  🎙️  Timmy Voice — Sovereign Voice Interface\n"
+            f"{rule}\n"
+            f"  STT:  Whisper ({self.config.whisper_model})\n"
+            f"  TTS:  {tts_label}\n"
+            f"  LLM:  Timmy (local Ollama)\n"
+            f"{rule}\n"
+            f"  Speak naturally. Timmy will listen, think, and respond.\n"
+            f"  Press Ctrl-C to exit.\n"
+            f"{rule}"
+        )
+
+        # Whisper hallucinations on silence/noise (compared against lowercased text)
+        ignored_phrases = (
+            "you",
+            "thanks.",
+            "thank you.",
+            "bye.",
+            "",
+            "thanks for watching!",
+            "thank you for watching!",
+        )
+        # Exit commands (compared after lowercasing and dropping trailing . or !)
+        exit_phrases = (
+            "goodbye",
+            "exit",
+            "quit",
+            "stop",
+            "goodbye timmy",
+            "stop listening",
+        )
+
+        self._running = True
+
+        try:
+            while self._running:
+                # 1. LISTEN — record until silence
+                audio = self._record_utterance()
+                if audio is None:
+                    continue
+
+                # 2. TRANSCRIBE — Whisper STT
+                text = self._transcribe(audio)
+                if not text or text.lower() in ignored_phrases:
+                    logger.debug("Ignoring likely Whisper hallucination: '%s'", text)
+                    continue
+
+                sys.stdout.write(f"\n  👤 You: {text}\n")
+                sys.stdout.flush()
+
+                spoken_command = text.lower().strip().rstrip(".!")
+                if spoken_command in exit_phrases:
+                    print("\n  👋 Goodbye!\n")
+                    break
+
+                # 3. THINK — send to Timmy
+                response = self._think(text)
+                sys.stdout.write(f"  🤖 Timmy: {response}\n")
+                sys.stdout.flush()
+
+                # 4. SPEAK — TTS output
+                self._speak(response)
+
+        except KeyboardInterrupt:
+            print("\n\n  👋 Voice loop stopped.\n")
+        finally:
+            self._running = False
+
+    def stop(self) -> None:
+        """Stop the voice loop (from another thread).
+
+        Only clears ``self._running``. ``run()`` and ``_record_utterance()``
+        test that flag at the top of each loop iteration, so the loop ends
+        after the step currently in progress rather than immediately.
+        """
+        self._running = False
--- a/tests/timmy/test_voice_loop.py
+++ b/tests/timmy/test_voice_loop.py
@@ -0,0 +1,273 @@
+"""Tests for the sovereign voice loop.
+
+These tests verify the VoiceLoop components without requiring a microphone,
+Whisper model, or Piper installation — all I/O is mocked.
+"""
+
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import numpy as np
+
+from timmy.voice_loop import VoiceConfig, VoiceLoop
+
+# ── VoiceConfig tests ──────────────────────────────────────────────────────
+
+
+class TestVoiceConfig:
+    """Field defaults and keyword overrides of ``VoiceConfig``."""
+
+    def test_defaults(self):
+        expected = {
+            "whisper_model": "base.en",
+            "sample_rate": 16000,
+            "silence_threshold": 0.015,
+            "silence_duration": 1.5,
+            "min_utterance": 0.5,
+            "max_utterance": 30.0,
+            "session_id": "voice",
+        }
+        cfg = VoiceConfig()
+        for field_name, value in expected.items():
+            assert getattr(cfg, field_name) == value, field_name
+        assert cfg.use_say_fallback is False
+
+    def test_custom_values(self):
+        overrides = {
+            "whisper_model": "tiny.en",
+            "silence_threshold": 0.02,
+            "session_id": "custom",
+        }
+        cfg = VoiceConfig(use_say_fallback=True, **overrides)
+        for field_name, value in overrides.items():
+            assert getattr(cfg, field_name) == value, field_name
+        assert cfg.use_say_fallback is True
+
+
+# ── VoiceLoop unit tests ──────────────────────────────────────────────────
+
+
+class TestVoiceLoopInit:
+    """Construction of ``VoiceLoop`` with and without an explicit config."""
+
+    def test_default_config(self):
+        fresh = VoiceLoop()
+        assert fresh.config.whisper_model == "base.en"
+        # A new loop is idle: not running and not speaking.
+        assert fresh._running is False
+        assert fresh._speaking is False
+
+    def test_custom_config(self):
+        loop = VoiceLoop(config=VoiceConfig(whisper_model="tiny.en"))
+        assert loop.config.whisper_model == "tiny.en"
+
+
+class TestPiperFallback:
+    """``_ensure_piper`` flips to `say` only when the voice file is absent."""
+
+    def test_falls_back_to_say_when_no_voice_file(self):
+        missing = Path("/nonexistent/voice.onnx")
+        loop = VoiceLoop(config=VoiceConfig(piper_voice=missing))
+        loop._ensure_piper()
+        assert loop.config.use_say_fallback is True
+
+    def test_keeps_piper_when_voice_exists(self, tmp_path):
+        # Any existing file will do; only its presence is checked.
+        present = tmp_path / "test.onnx"
+        present.write_bytes(b"fake model")
+        loop = VoiceLoop(config=VoiceConfig(piper_voice=present))
+        loop._ensure_piper()
+        assert loop.config.use_say_fallback is False
+
+
+class TestTranscribe:
+    """``_transcribe`` returns Whisper's text with surrounding whitespace removed."""
+
+    @staticmethod
+    def _loop_with_whisper_text(raw_text):
+        """Build a loop whose Whisper model is a stub returning *raw_text*."""
+        stub = MagicMock()
+        stub.transcribe.return_value = {"text": raw_text}
+        loop = VoiceLoop()
+        loop._whisper_model = stub
+        return loop, stub
+
+    def test_transcribes_audio(self):
+        """Whisper transcription returns cleaned text."""
+        loop, stub = self._loop_with_whisper_text("  Hello Timmy  ")
+        one_second = np.random.randn(16000).astype(np.float32)
+
+        assert loop._transcribe(one_second) == "Hello Timmy"
+        stub.transcribe.assert_called_once()
+
+    def test_transcribes_empty_returns_empty(self):
+        loop, _ = self._loop_with_whisper_text("   ")
+        one_second = np.random.randn(16000).astype(np.float32)
+
+        assert loop._transcribe(one_second) == ""
+
+
+class TestThink:
+    """``_think`` returns the chat reply, or a fallback sentence on failure.
+
+    ``_chat`` is replaced by a plain ``MagicMock`` in both tests. With only
+    ``asyncio`` patched, the real ``_chat`` would still build a coroutine that
+    the mocked ``asyncio.run`` never awaits, producing a "coroutine was never
+    awaited" RuntimeWarning.
+    """
+
+    @patch.object(VoiceLoop, "_chat", new=MagicMock(name="_chat"))
+    @patch("timmy.voice_loop.asyncio")
+    def test_think_returns_response(self, mock_asyncio):
+        mock_asyncio.run.return_value = "I am Timmy."
+        loop = VoiceLoop()
+        result = loop._think("Who are you?")
+        assert result == "I am Timmy."
+
+    @patch.object(VoiceLoop, "_chat", new=MagicMock(name="_chat"))
+    @patch("timmy.voice_loop.asyncio")
+    def test_think_handles_error(self, mock_asyncio):
+        mock_asyncio.run.side_effect = RuntimeError("Ollama down")
+        loop = VoiceLoop()
+        result = loop._think("test")
+        assert "trouble" in result.lower()
+
+
+class TestSpeakSay:
+    """``_speak_say`` shells out to `say` and tolerates its absence."""
+
+    def test_speak_say_calls_subprocess(self):
+        loop = VoiceLoop(config=VoiceConfig(use_say_fallback=True))
+
+        with patch("subprocess.Popen") as mock_popen:
+            mock_popen.return_value.wait.return_value = 0
+            loop._speak_say("Hello")
+
+        mock_popen.assert_called_once()
+        argv = mock_popen.call_args[0][0]
+        assert argv[0] == "say"
+        assert "Hello" in argv
+
+    def test_speak_say_handles_missing(self):
+        loop = VoiceLoop(config=VoiceConfig(use_say_fallback=True))
+        with patch("subprocess.Popen", side_effect=FileNotFoundError):
+            # Should not raise
+            loop._speak_say("Hello")
+
+
+class TestSpeakPiper:
+    """``_speak_piper`` runs the piper CLI, then plays or falls back to `say`."""
+
+    def test_speak_piper_generates_and_plays(self):
+        loop = VoiceLoop(config=VoiceConfig(piper_voice=Path("/tmp/test_voice.onnx")))
+        piper_ok = MagicMock(returncode=0, stderr="")
+
+        with patch("subprocess.run", return_value=piper_ok) as mock_run, patch(
+            "timmy.voice_loop.VoiceLoop._play_audio"
+        ) as mock_play:
+            loop._speak_piper("Hello from Piper")
+
+        # Piper was called
+        mock_run.assert_called_once()
+        argv = mock_run.call_args[0][0]
+        assert argv[0] == "piper"
+        assert "--model" in argv
+
+        # Audio was played
+        mock_play.assert_called_once()
+
+    def test_speak_piper_falls_back_on_error(self):
+        loop = VoiceLoop(config=VoiceConfig(piper_voice=Path("/tmp/test.onnx")))
+        piper_failed = MagicMock(returncode=1, stderr="model error")
+
+        with patch("subprocess.run", return_value=piper_failed), patch(
+            "timmy.voice_loop.VoiceLoop._speak_say"
+        ) as mock_say:
+            loop._speak_piper("test")
+
+        # Should fall back to say
+        mock_say.assert_called_once_with("test")
+
+
+class TestHallucinationFilter:
+    """Whisper tends to hallucinate on silence/noise. The loop should filter these.
+
+    The tests drive ``VoiceLoop.run()`` with recording, transcription, thinking
+    and speaking stubbed out, so they exercise the real filter rather than a
+    copy of its phrase list.
+    """
+
+    @staticmethod
+    def _run_with_transcripts(transcripts):
+        """Run the loop over *transcripts* and return the ``_think``/``_speak`` stubs."""
+        loop = VoiceLoop(config=VoiceConfig(use_say_fallback=True))
+        fake_audio = np.zeros(1600, dtype=np.float32)
+        with patch.object(loop, "_record_utterance", return_value=fake_audio), patch.object(
+            loop, "_transcribe", side_effect=list(transcripts)
+        ), patch.object(loop, "_think", return_value="ok") as think, patch.object(
+            loop, "_speak"
+        ) as speak:
+            loop.run()
+        return think, speak
+
+    def test_known_hallucinations_filtered(self):
+        hallucinations = [
+            "you",
+            "thanks.",
+            "Thank you.",
+            "Bye.",
+            "Thanks for watching!",
+            "Thank you for watching!",
+        ]
+        # The trailing exit command ends the loop once every phrase was seen.
+        think, speak = self._run_with_transcripts(hallucinations + ["goodbye"])
+        think.assert_not_called()
+        speak.assert_not_called()
+
+    def test_real_speech_is_not_filtered(self):
+        think, speak = self._run_with_transcripts(["What time is it?", "goodbye"])
+        think.assert_called_once_with("What time is it?")
+        speak.assert_called_once_with("ok")
+
+
+class TestExitCommands:
+    """Voice loop should recognize exit commands.
+
+    Each command is fed to ``VoiceLoop.run()`` as the only transcript. If the
+    loop failed to treat it as an exit, it would ask the exhausted transcript
+    stub for another result and the test would error instead of hanging.
+    """
+
+    def test_exit_commands(self):
+        exits = [
+            "goodbye",
+            "exit",
+            "quit",
+            "stop",
+            "goodbye timmy",
+            "stop listening",
+            # Case and trailing punctuation as Whisper typically emits them.
+            "Goodbye.",
+            "Stop!",
+            "Goodbye Timmy.",
+        ]
+        fake_audio = np.zeros(1600, dtype=np.float32)
+        for cmd in exits:
+            loop = VoiceLoop(config=VoiceConfig(use_say_fallback=True))
+            with patch.object(loop, "_record_utterance", return_value=fake_audio), patch.object(
+                loop, "_transcribe", side_effect=[cmd]
+            ), patch.object(loop, "_think") as think, patch.object(loop, "_speak") as speak:
+                loop.run()
+            assert not think.called, f"'{cmd}' should be an exit command"
+            assert not speak.called, f"'{cmd}' should be an exit command"
+            assert loop._running is False
+
+
+class TestPlayAudio:
+    """``_play_audio`` launches afplay and honours the interrupt flag."""
+
+    def test_play_audio_calls_afplay(self):
+        with patch("subprocess.Popen") as mock_popen:
+            # Running, then done
+            mock_popen.return_value.poll.side_effect = [None, 0]
+            VoiceLoop()._play_audio("/tmp/test.wav")
+
+        mock_popen.assert_called_once()
+        argv = mock_popen.call_args[0][0]
+        assert argv[0] == "afplay"
+
+    def test_play_audio_interruptible(self):
+        with patch("subprocess.Popen") as mock_popen:
+            player = mock_popen.return_value
+            player.poll.return_value = None  # Always running
+
+            loop = VoiceLoop()
+            loop._interrupted = True  # Pre-set interrupt
+            loop._play_audio("/tmp/test.wav")
+
+        player.terminate.assert_called_once()
+
+
+class TestStopMethod:
+    """``stop()`` clears the running flag."""
+
+    def test_stop_sets_running_false(self):
+        active = VoiceLoop()
+        active._running = True
+
+        active.stop()
+
+        assert active._running is False
+
+
+class TestSpeakSetsFlag:
+    """``_speak`` raises ``_speaking`` only for the duration of the TTS call."""
+
+    def test_speaking_flag_set_during_speech(self):
+        loop = VoiceLoop(config=VoiceConfig(use_say_fallback=True))
+
+        # Before speak
+        assert loop._speaking is False
+
+        # Check the flag from inside the stubbed TTS backend
+        def check_flag(text):
+            assert loop._speaking is True
+
+        with patch("timmy.voice_loop.VoiceLoop._speak_say", side_effect=check_flag):
+            loop._speak("Hello")
+
+        # After speak
+        assert loop._speaking is False