feat: sovereign voice loop — timmy voice command
Adds a fully local listen-think-speak voice interface. STT: Whisper; LLM: Ollama; TTS: Piper. No cloud, no network.

- src/timmy/voice_loop.py: VoiceLoop with VAD, Whisper, Piper
- src/timmy/cli.py: new `voice` command
- pyproject.toml: voice extras updated
- 20 new tests
This commit is contained in:
@@ -43,6 +43,9 @@ python-telegram-bot = { version = ">=21.0", optional = true }
|
||||
"discord.py" = { version = ">=2.3.0", optional = true }
|
||||
airllm = { version = ">=2.9.0", optional = true }
|
||||
pyttsx3 = { version = ">=2.90", optional = true }
|
||||
openai-whisper = { version = ">=20231117", optional = true }
|
||||
piper-tts = { version = ">=1.2.0", optional = true }
|
||||
sounddevice = { version = ">=0.4.6", optional = true }
|
||||
sentence-transformers = { version = ">=2.0.0", optional = true }
|
||||
numpy = { version = ">=1.24.0", optional = true }
|
||||
requests = { version = ">=2.31.0", optional = true }
|
||||
@@ -59,7 +62,7 @@ pytest-xdist = { version = ">=3.5.0", optional = true }
|
||||
telegram = ["python-telegram-bot"]
|
||||
discord = ["discord.py"]
|
||||
bigbrain = ["airllm"]
|
||||
voice = ["pyttsx3"]
|
||||
voice = ["pyttsx3", "openai-whisper", "piper-tts", "sounddevice"]
|
||||
celery = ["celery"]
|
||||
embeddings = ["sentence-transformers", "numpy"]
|
||||
git = ["GitPython"]
|
||||
|
||||
@@ -248,5 +248,37 @@ def down():
|
||||
subprocess.run(["docker", "compose", "down"], check=True)
|
||||
|
||||
|
||||
@app.command()
def voice(
    whisper_model: str = typer.Option(
        "base.en", "--whisper", "-w", help="Whisper model: tiny.en, base.en, small.en, medium.en"
    ),
    use_say: bool = typer.Option(False, "--say", help="Use macOS `say` instead of Piper TTS"),
    threshold: float = typer.Option(
        0.015, "--threshold", "-t", help="Mic silence threshold (RMS). Lower = more sensitive."
    ),
    silence: float = typer.Option(1.5, "--silence", help="Seconds of silence to end recording"),
    backend: str | None = _BACKEND_OPTION,
    model_size: str | None = _MODEL_SIZE_OPTION,
):
    """Start the sovereign voice loop — listen, think, speak.

    Everything runs locally: Whisper for STT, Ollama for LLM, Piper for TTS.
    No cloud, no network calls, no microphone data leaves your machine.
    """
    # Imported lazily so the CLI loads even when the voice extras are not installed.
    from timmy.voice_loop import VoiceConfig, VoiceLoop

    voice_config = VoiceConfig(
        whisper_model=whisper_model,
        use_say_fallback=use_say,
        silence_threshold=threshold,
        silence_duration=silence,
        backend=backend,
        model_size=model_size,
    )
    VoiceLoop(config=voice_config).run()
|
||||
|
||||
|
||||
def main():
|
||||
app()
|
||||
|
||||
387
src/timmy/voice_loop.py
Normal file
387
src/timmy/voice_loop.py
Normal file
@@ -0,0 +1,387 @@
|
||||
"""Sovereign voice loop — listen, think, speak.
|
||||
|
||||
A fully local voice interface for Timmy. No cloud, no network calls.
|
||||
All processing happens on the user's machine:
|
||||
|
||||
Mic → VAD/silence detection → Whisper (local STT) → Timmy chat → Piper TTS → Speaker
|
||||
|
||||
Usage:
|
||||
from timmy.voice_loop import VoiceLoop
|
||||
loop = VoiceLoop()
|
||||
loop.run() # blocks, Ctrl-C to stop
|
||||
|
||||
Requires: sounddevice, numpy, whisper, piper-tts
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ── Defaults ────────────────────────────────────────────────────────────────
# Fallback values for every VoiceConfig field; override per-field via VoiceConfig.

DEFAULT_WHISPER_MODEL = "base.en"  # English-only base model
DEFAULT_PIPER_VOICE = Path.home() / ".local/share/piper-voices/en_US-lessac-medium.onnx"
DEFAULT_SAMPLE_RATE = 16000  # Whisper expects 16 kHz
DEFAULT_CHANNELS = 1  # mono capture
DEFAULT_SILENCE_THRESHOLD = 0.015  # RMS threshold — tune for your mic/room
DEFAULT_SILENCE_DURATION = 1.5  # seconds of silence to end utterance
DEFAULT_MIN_UTTERANCE = 0.5  # ignore clicks/bumps shorter than this
DEFAULT_MAX_UTTERANCE = 30.0  # safety cap — don't record forever
DEFAULT_SESSION_ID = "voice"  # chat session id used for the voice conversation
|
||||
@dataclass
class VoiceConfig:
    """Configuration for the voice loop.

    All fields default to the module-level DEFAULT_* constants; construct
    with keyword overrides for anything you want to change.
    """

    # Whisper model name (tiny.en / base.en / small.en / medium.en)
    whisper_model: str = DEFAULT_WHISPER_MODEL
    # Path to the Piper ONNX voice model on disk
    piper_voice: Path = DEFAULT_PIPER_VOICE
    # Capture sample rate in Hz (Whisper expects 16 kHz)
    sample_rate: int = DEFAULT_SAMPLE_RATE
    # RMS level below which an audio block counts as silence
    silence_threshold: float = DEFAULT_SILENCE_THRESHOLD
    # Seconds of continuous silence that ends an utterance
    silence_duration: float = DEFAULT_SILENCE_DURATION
    # Utterances shorter than this (seconds) are discarded as noise
    min_utterance: float = DEFAULT_MIN_UTTERANCE
    # Hard cap (seconds) on a single recording
    max_utterance: float = DEFAULT_MAX_UTTERANCE
    # Session id used to key the Timmy chat conversation
    session_id: str = DEFAULT_SESSION_ID
    # Set True to use macOS `say` instead of Piper
    use_say_fallback: bool = False
    # Piper speaking rate (default 1.0, lower = slower)
    # NOTE(review): not referenced anywhere in this file — confirm it is wired up.
    speaking_rate: float = 1.0
    # Backend/model for Timmy inference
    backend: str | None = None
    model_size: str | None = None
|
||||
|
||||
|
||||
class VoiceLoop:
    """Sovereign listen-think-speak loop.

    Everything runs locally:
    - STT: OpenAI Whisper (local model, no API)
    - LLM: Timmy via Ollama (local inference)
    - TTS: Piper (local ONNX model) or macOS `say`
    """

    def __init__(self, config: VoiceConfig | None = None) -> None:
        """Store config and initialize state flags; heavy resources load lazily."""
        self.config = config or VoiceConfig()
        self._whisper_model = None  # loaded on first use by _load_whisper()
        self._running = False
        self._speaking = False  # True while TTS is playing
        self._interrupted = False  # set when user talks over TTS
|
||||
|
||||
# ── Lazy initialization ─────────────────────────────────────────────
|
||||
|
||||
def _load_whisper(self):
|
||||
"""Load Whisper model (lazy, first use only)."""
|
||||
if self._whisper_model is not None:
|
||||
return
|
||||
import whisper
|
||||
|
||||
logger.info("Loading Whisper model: %s", self.config.whisper_model)
|
||||
self._whisper_model = whisper.load_model(self.config.whisper_model)
|
||||
logger.info("Whisper model loaded.")
|
||||
|
||||
def _ensure_piper(self) -> bool:
|
||||
"""Check that Piper voice model exists."""
|
||||
if self.config.use_say_fallback:
|
||||
return True
|
||||
voice_path = self.config.piper_voice
|
||||
if not voice_path.exists():
|
||||
logger.warning("Piper voice not found at %s — falling back to `say`", voice_path)
|
||||
self.config.use_say_fallback = True
|
||||
return True
|
||||
return True
|
||||
|
||||
# ── STT: Microphone → Text ──────────────────────────────────────────
|
||||
|
||||
def _record_utterance(self) -> np.ndarray | None:
    """Record from microphone until silence is detected.

    Uses energy-based Voice Activity Detection:
    1. Wait for speech (RMS above threshold)
    2. Record until silence (RMS below threshold for silence_duration)
    3. Return the audio as a numpy array

    Returns None if interrupted or no speech detected.
    """
    # Deferred import: sounddevice is an optional extra.
    import sounddevice as sd

    sr = self.config.sample_rate
    block_size = int(sr * 0.1)  # 100ms blocks
    # All durations are expressed as counts of 100ms blocks below.
    silence_blocks = int(self.config.silence_duration / 0.1)
    min_blocks = int(self.config.min_utterance / 0.1)
    max_blocks = int(self.config.max_utterance / 0.1)

    audio_chunks: list[np.ndarray] = []
    silent_count = 0  # consecutive silent blocks seen while recording
    recording = False  # False until speech first crosses the threshold

    def _rms(block: np.ndarray) -> float:
        # Root-mean-square energy of one block — the VAD signal.
        return float(np.sqrt(np.mean(block.astype(np.float32) ** 2)))

    sys.stdout.write("\n 🎤 Listening... (speak now)\n")
    sys.stdout.flush()

    with sd.InputStream(
        samplerate=sr,
        channels=DEFAULT_CHANNELS,
        dtype="float32",
        blocksize=block_size,
    ) as stream:
        # Loops until end-of-utterance, max length, or stop() clears _running.
        while self._running:
            block, overflowed = stream.read(block_size)
            if overflowed:
                logger.debug("Audio buffer overflowed")

            rms = _rms(block)

            if not recording:
                # Waiting phase: discard blocks until speech starts.
                if rms > self.config.silence_threshold:
                    recording = True
                    silent_count = 0
                    audio_chunks.append(block.copy())
                    sys.stdout.write(" 📢 Recording...\r")
                    sys.stdout.flush()
            else:
                audio_chunks.append(block.copy())

                if rms < self.config.silence_threshold:
                    silent_count += 1
                else:
                    silent_count = 0

                # End of utterance
                if silent_count >= silence_blocks:
                    break

                # Safety cap
                if len(audio_chunks) >= max_blocks:
                    logger.info("Max utterance length reached, stopping.")
                    break

    # NOTE(review): min_blocks is compared against the chunk count including
    # the trailing silent blocks, so very short speech plus the silence tail
    # can still pass the minimum — confirm this is intended.
    if not audio_chunks or len(audio_chunks) < min_blocks:
        return None

    audio = np.concatenate(audio_chunks, axis=0).flatten()
    duration = len(audio) / sr
    sys.stdout.write(f" ✂️ Captured {duration:.1f}s of audio\n")
    sys.stdout.flush()
    return audio
|
||||
|
||||
def _transcribe(self, audio: np.ndarray) -> str:
    """Run local Whisper STT on *audio* and return the stripped transcript."""
    self._load_whisper()

    sys.stdout.write(" 🧠 Transcribing...\r")
    sys.stdout.flush()

    started = time.monotonic()
    # fp16=False: half precision can cause issues on MPS/CPU setups.
    output = self._whisper_model.transcribe(audio, language="en", fp16=False)
    took = time.monotonic() - started

    transcript = output["text"].strip()
    logger.info("Whisper transcribed in %.1fs: '%s'", took, transcript[:80])
    return transcript
|
||||
|
||||
# ── TTS: Text → Speaker ─────────────────────────────────────────────
|
||||
|
||||
def _speak(self, text: str) -> None:
|
||||
"""Speak text aloud using Piper TTS or macOS `say`."""
|
||||
if not text:
|
||||
return
|
||||
|
||||
self._speaking = True
|
||||
try:
|
||||
if self.config.use_say_fallback:
|
||||
self._speak_say(text)
|
||||
else:
|
||||
self._speak_piper(text)
|
||||
finally:
|
||||
self._speaking = False
|
||||
|
||||
def _speak_piper(self, text: str) -> None:
    """Speak using Piper TTS (local ONNX inference).

    Synthesizes *text* to a temporary WAV via the `piper` CLI, then plays
    it with _play_audio. Falls back to _speak_say when Piper is missing,
    fails, or times out. The temp file is always removed.
    """
    # Reserve a temp path; the file handle is closed immediately so piper
    # can write to it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp_path = tmp.name

    try:
        # Generate WAV with Piper
        cmd = [
            "piper",
            "--model",
            str(self.config.piper_voice),
            "--output_file",
            tmp_path,
        ]

        try:
            proc = subprocess.run(
                cmd,
                input=text,
                capture_output=True,
                text=True,
                timeout=30,
            )
        except FileNotFoundError:
            # Bug fix: _ensure_piper only checks the voice *file*, so the
            # `piper` binary can still be absent at runtime — previously
            # this raised out of the voice loop.
            logger.error("Piper executable not found — falling back to `say`")
            self._speak_say(text)
            return
        except subprocess.TimeoutExpired:
            # Bug fix: timeout=30 raised uncaught before.
            logger.error("Piper timed out — falling back to `say`")
            self._speak_say(text)
            return

        if proc.returncode != 0:
            logger.error("Piper failed: %s", proc.stderr)
            self._speak_say(text)  # fallback
            return

        # Play with afplay (macOS) — interruptible
        self._play_audio(tmp_path)

    finally:
        Path(tmp_path).unlink(missing_ok=True)
|
||||
|
||||
def _speak_say(self, text: str) -> None:
    """Speak using macOS `say` command."""
    try:
        say_proc = subprocess.Popen(
            ["say", "-r", "180", text],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except FileNotFoundError:
        # Not macOS, or `say` missing — nothing more we can do.
        logger.error("macOS `say` command not found")
        return
    try:
        say_proc.wait(timeout=60)
    except subprocess.TimeoutExpired:
        say_proc.kill()
|
||||
|
||||
def _play_audio(self, path: str) -> None:
    """Play a WAV file. Can be interrupted by setting self._interrupted."""
    try:
        player = subprocess.Popen(
            ["afplay", path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except FileNotFoundError:
        # Not macOS — try aplay (Linux)
        try:
            subprocess.run(["aplay", path], capture_output=True, timeout=60)
        except (FileNotFoundError, subprocess.TimeoutExpired):
            logger.error("No audio player found (tried afplay, aplay)")
        return

    # Poll instead of wait() so a barge-in can cut playback short.
    while player.poll() is None:
        if self._interrupted:
            player.terminate()
            self._interrupted = False
            logger.info("TTS interrupted by user")
            return
        time.sleep(0.05)
|
||||
|
||||
# ── LLM: Text → Response ───────────────────────────────────────────
|
||||
|
||||
def _think(self, user_text: str) -> str:
    """Send the transcript to Timmy and return the reply text.

    Any failure in the chat pipeline is converted into a spoken apology
    rather than crashing the voice loop.
    """
    sys.stdout.write(" 💭 Thinking...\r")
    sys.stdout.flush()

    started = time.monotonic()
    try:
        reply = asyncio.run(self._chat(user_text))
    except Exception as exc:
        logger.error("Timmy chat failed: %s", exc)
        reply = "I'm having trouble thinking right now. Could you try again?"

    logger.info("Timmy responded in %.1fs", time.monotonic() - started)
    return reply
|
||||
|
||||
async def _chat(self, message: str) -> str:
    """Async wrapper around Timmy's session.chat().

    Imported lazily so the voice loop module can load without the full
    Timmy stack; conversation state is keyed by config.session_id.
    """
    from timmy.session import chat

    return await chat(message, session_id=self.config.session_id)
|
||||
|
||||
# ── Main Loop ───────────────────────────────────────────────────────
|
||||
|
||||
def run(self) -> None:
    """Run the voice loop. Blocks until Ctrl-C.

    Each iteration: record an utterance, transcribe it, filter junk,
    check for exit commands, get a reply from Timmy, and speak it.
    """
    # Guarantees a working TTS path (Piper or the `say` fallback).
    self._ensure_piper()

    tts_label = (
        "macOS say"
        if self.config.use_say_fallback
        else f"Piper ({self.config.piper_voice.name})"
    )
    print(
        f"\n{'=' * 60}\n"
        f" 🎙️ Timmy Voice — Sovereign Voice Interface\n"
        f"{'=' * 60}\n"
        f" STT: Whisper ({self.config.whisper_model})\n"
        f" TTS: {tts_label}\n"
        f" LLM: Timmy (local Ollama)\n"
        f"{'=' * 60}\n"
        f" Speak naturally. Timmy will listen, think, and respond.\n"
        f" Press Ctrl-C to exit.\n"
        f"{'=' * 60}"
    )

    self._running = True

    try:
        while self._running:
            # 1. LISTEN — record until silence
            audio = self._record_utterance()
            if audio is None:
                continue

            # 2. TRANSCRIBE — Whisper STT
            text = self._transcribe(audio)
            if not text or text.lower() in (
                "you",
                "thanks.",
                "thank you.",
                "bye.",
                "",
                "thanks for watching!",
                "thank you for watching!",
            ):
                # Whisper hallucinations on silence/noise
                logger.debug("Ignoring likely Whisper hallucination: '%s'", text)
                continue

            sys.stdout.write(f"\n 👤 You: {text}\n")
            sys.stdout.flush()

            # Exit commands
            # rstrip(".!") tolerates trailing punctuation from Whisper.
            if text.lower().strip().rstrip(".!") in (
                "goodbye",
                "exit",
                "quit",
                "stop",
                "goodbye timmy",
                "stop listening",
            ):
                print("\n 👋 Goodbye!\n")
                break

            # 3. THINK — send to Timmy
            response = self._think(text)
            sys.stdout.write(f" 🤖 Timmy: {response}\n")
            sys.stdout.flush()

            # 4. SPEAK — TTS output
            self._speak(response)

    except KeyboardInterrupt:
        print("\n\n 👋 Voice loop stopped.\n")
    finally:
        # Always clear the flag so stop()/restart behave consistently.
        self._running = False
|
||||
|
||||
def stop(self) -> None:
|
||||
"""Stop the voice loop (from another thread)."""
|
||||
self._running = False
|
||||
273
tests/timmy/test_voice_loop.py
Normal file
273
tests/timmy/test_voice_loop.py
Normal file
@@ -0,0 +1,273 @@
|
||||
"""Tests for the sovereign voice loop.
|
||||
|
||||
These tests verify the VoiceLoop components without requiring a microphone,
|
||||
Whisper model, or Piper installation — all I/O is mocked.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import numpy as np
|
||||
|
||||
from timmy.voice_loop import VoiceConfig, VoiceLoop
|
||||
|
||||
# ── VoiceConfig tests ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestVoiceConfig:
    """Defaults and keyword overrides of the VoiceConfig dataclass."""

    def test_defaults(self):
        # These values must stay in sync with the DEFAULT_* constants in voice_loop.
        cfg = VoiceConfig()
        assert cfg.whisper_model == "base.en"
        assert cfg.sample_rate == 16000
        assert cfg.silence_threshold == 0.015
        assert cfg.silence_duration == 1.5
        assert cfg.min_utterance == 0.5
        assert cfg.max_utterance == 30.0
        assert cfg.session_id == "voice"
        assert cfg.use_say_fallback is False

    def test_custom_values(self):
        cfg = VoiceConfig(
            whisper_model="tiny.en",
            silence_threshold=0.02,
            session_id="custom",
            use_say_fallback=True,
        )
        assert cfg.whisper_model == "tiny.en"
        assert cfg.silence_threshold == 0.02
        assert cfg.session_id == "custom"
        assert cfg.use_say_fallback is True


class TestVoiceLoopInit:
    """Constructor wiring of VoiceLoop."""

    def test_default_config(self):
        loop = VoiceLoop()
        assert loop.config.whisper_model == "base.en"
        assert loop._running is False
        assert loop._speaking is False

    def test_custom_config(self):
        cfg = VoiceConfig(whisper_model="tiny.en")
        loop = VoiceLoop(config=cfg)
        assert loop.config.whisper_model == "tiny.en"


class TestPiperFallback:
    """_ensure_piper degrades to `say` when the Piper voice model is missing."""

    def test_falls_back_to_say_when_no_voice_file(self):
        cfg = VoiceConfig(piper_voice=Path("/nonexistent/voice.onnx"))
        loop = VoiceLoop(config=cfg)
        loop._ensure_piper()
        assert loop.config.use_say_fallback is True

    def test_keeps_piper_when_voice_exists(self, tmp_path):
        # Any existing file passes the check — content is never inspected here.
        voice_file = tmp_path / "test.onnx"
        voice_file.write_bytes(b"fake model")
        cfg = VoiceConfig(piper_voice=voice_file)
        loop = VoiceLoop(config=cfg)
        loop._ensure_piper()
        assert loop.config.use_say_fallback is False
||||
|
||||
class TestTranscribe:
    """_transcribe with the Whisper model replaced by a mock."""

    def test_transcribes_audio(self):
        """Whisper transcription returns cleaned text."""
        loop = VoiceLoop()

        # Pre-seeding _whisper_model makes _load_whisper a no-op.
        mock_model = MagicMock()
        mock_model.transcribe.return_value = {"text": " Hello Timmy "}
        loop._whisper_model = mock_model

        audio = np.random.randn(16000).astype(np.float32)
        result = loop._transcribe(audio)

        assert result == "Hello Timmy"
        mock_model.transcribe.assert_called_once()

    def test_transcribes_empty_returns_empty(self):
        # Whitespace-only output strips down to the empty string.
        loop = VoiceLoop()
        mock_model = MagicMock()
        mock_model.transcribe.return_value = {"text": " "}
        loop._whisper_model = mock_model

        audio = np.random.randn(16000).astype(np.float32)
        result = loop._transcribe(audio)
        assert result == ""


class TestThink:
    """_think delegates to asyncio.run and converts failures to an apology."""

    @patch("timmy.voice_loop.asyncio")
    def test_think_returns_response(self, mock_asyncio):
        mock_asyncio.run.return_value = "I am Timmy."
        loop = VoiceLoop()
        result = loop._think("Who are you?")
        assert result == "I am Timmy."

    @patch("timmy.voice_loop.asyncio")
    def test_think_handles_error(self, mock_asyncio):
        # A chat failure must not propagate — the loop speaks an apology instead.
        mock_asyncio.run.side_effect = RuntimeError("Ollama down")
        loop = VoiceLoop()
        result = loop._think("test")
        assert "trouble" in result.lower()
||||
|
||||
class TestSpeakSay:
    """_speak_say shells out to macOS `say` and tolerates its absence."""

    @patch("subprocess.Popen")
    def test_speak_say_calls_subprocess(self, mock_popen):
        mock_proc = MagicMock()
        mock_proc.wait.return_value = 0
        mock_popen.return_value = mock_proc

        cfg = VoiceConfig(use_say_fallback=True)
        loop = VoiceLoop(config=cfg)
        loop._speak_say("Hello")

        mock_popen.assert_called_once()
        args = mock_popen.call_args[0][0]
        assert args[0] == "say"
        assert "Hello" in args

    @patch("subprocess.Popen", side_effect=FileNotFoundError)
    def test_speak_say_handles_missing(self, mock_popen):
        cfg = VoiceConfig(use_say_fallback=True)
        loop = VoiceLoop(config=cfg)
        # Should not raise
        loop._speak_say("Hello")


class TestSpeakPiper:
    """_speak_piper synthesizes via the piper CLI and plays the result."""

    @patch("timmy.voice_loop.VoiceLoop._play_audio")
    @patch("subprocess.run")
    def test_speak_piper_generates_and_plays(self, mock_run, mock_play):
        mock_run.return_value = MagicMock(returncode=0, stderr="")

        voice_path = Path("/tmp/test_voice.onnx")
        cfg = VoiceConfig(piper_voice=voice_path)
        loop = VoiceLoop(config=cfg)
        loop._speak_piper("Hello from Piper")

        # Piper was called
        mock_run.assert_called_once()
        cmd = mock_run.call_args[0][0]
        assert cmd[0] == "piper"
        assert "--model" in cmd

        # Audio was played
        mock_play.assert_called_once()

    @patch("timmy.voice_loop.VoiceLoop._speak_say")
    @patch("subprocess.run")
    def test_speak_piper_falls_back_on_error(self, mock_run, mock_say):
        # Non-zero exit from piper routes the text through _speak_say instead.
        mock_run.return_value = MagicMock(returncode=1, stderr="model error")

        cfg = VoiceConfig(piper_voice=Path("/tmp/test.onnx"))
        loop = VoiceLoop(config=cfg)
        loop._speak_piper("test")

        # Should fall back to say
        mock_say.assert_called_once_with("test")
||||
|
||||
class TestHallucinationFilter:
    """Whisper tends to hallucinate on silence/noise. The loop should filter these."""

    # NOTE(review): this test compares a literal list against a copy of the
    # same literal tuple from VoiceLoop.run(); it documents the expected
    # filter set but does not exercise the loop code. Consider extracting the
    # filter tuple into a module constant and testing against that instead.
    def test_known_hallucinations_filtered(self):
        hallucinations = [
            "you",
            "thanks.",
            "Thank you.",
            "Bye.",
            "Thanks for watching!",
            "Thank you for watching!",
        ]
        for text in hallucinations:
            assert text.lower() in (
                "you",
                "thanks.",
                "thank you.",
                "bye.",
                "",
                "thanks for watching!",
                "thank you for watching!",
            ), f"'{text}' should be filtered"


class TestExitCommands:
    """Voice loop should recognize exit commands."""

    # NOTE(review): same tautology as above — the expected tuple duplicates the
    # literal inside run() rather than calling shared code; keep in sync manually.
    def test_exit_commands(self):
        exits = ["goodbye", "exit", "quit", "stop", "goodbye timmy", "stop listening"]
        for cmd in exits:
            assert cmd.lower().strip().rstrip(".!") in (
                "goodbye",
                "exit",
                "quit",
                "stop",
                "goodbye timmy",
                "stop listening",
            ), f"'{cmd}' should be an exit command"
|
||||
|
||||
class TestPlayAudio:
    """_play_audio playback and barge-in interruption."""

    @patch("subprocess.Popen")
    def test_play_audio_calls_afplay(self, mock_popen):
        mock_proc = MagicMock()
        mock_proc.poll.side_effect = [None, 0]  # Running, then done
        mock_popen.return_value = mock_proc

        loop = VoiceLoop()
        loop._play_audio("/tmp/test.wav")

        mock_popen.assert_called_once()
        args = mock_popen.call_args[0][0]
        assert args[0] == "afplay"

    @patch("subprocess.Popen")
    def test_play_audio_interruptible(self, mock_popen):
        mock_proc = MagicMock()
        # Simulate running, then we interrupt
        call_count = 0

        def poll_side_effect():
            nonlocal call_count
            call_count += 1
            return None  # Always running

        mock_proc.poll.side_effect = poll_side_effect
        mock_popen.return_value = mock_proc

        loop = VoiceLoop()
        loop._interrupted = True  # Pre-set interrupt
        loop._play_audio("/tmp/test.wav")

        # Interrupt must terminate playback (and the poll loop must not spin forever).
        mock_proc.terminate.assert_called_once()


class TestStopMethod:
    """stop() requests shutdown by clearing the running flag."""

    def test_stop_sets_running_false(self):
        loop = VoiceLoop()
        loop._running = True
        loop.stop()
        assert loop._running is False


class TestSpeakSetsFlag:
    """_speak must hold _speaking True only for the duration of TTS."""

    @patch("timmy.voice_loop.VoiceLoop._speak_say")
    def test_speaking_flag_set_during_speech(self, mock_say):
        cfg = VoiceConfig(use_say_fallback=True)
        loop = VoiceLoop(config=cfg)

        # Before speak
        assert loop._speaking is False

        # Mock say to check flag during execution
        def check_flag(text):
            assert loop._speaking is True

        mock_say.side_effect = check_flag
        loop._speak("Hello")

        # After speak
        assert loop._speaking is False
||||
Reference in New Issue
Block a user