gglib_core/ports/
voice.rs

//! Voice pipeline port — trait abstraction for voice data, config, and audio I/O operations.
2//!
3//! # Design Rules
4//!
5//! - DTOs here are transport-agnostic wire shapes (no `gglib-voice` types).
6//! - Conversion from `gglib-voice` native types happens inside `gglib-voice`,
7//!   never here. This keeps `gglib-core` free of any dependency on `gglib-voice`.
//! - `VoicePipelinePort` is the only surface `gglib-gui` and `gglib-axum`
//!   need in order to serve all 19 voice endpoints (13 data/config + 6 audio I/O).
10
11use async_trait::async_trait;
12use serde::{Deserialize, Serialize};
13use thiserror::Error;
14
15// ── DTOs ─────────────────────────────────────────────────────────────────────
16
17/// Current state of the voice pipeline.
18// Wire-shape DTO: the four bools represent distinct pipeline state flags
19// (is_active, stt_loaded, tts_loaded, auto_speak) that have clear, independent
20// meanings. There is no sensible state-machine or enum grouping that would
21// improve clarity for callers reading the JSON payload.
22#[allow(clippy::struct_excessive_bools)]
23#[derive(Debug, Clone, Serialize, Deserialize)]
24#[serde(rename_all = "camelCase")]
25pub struct VoiceStatusDto {
26    /// Whether the pipeline is actively capturing/processing audio.
27    pub is_active: bool,
28    /// State machine label (e.g. `"idle"`, `"listening"`, `"recording"`).
29    pub state: String,
30    /// Interaction mode label (`"ptt"` or `"vad"`).
31    pub mode: String,
32    /// Whether an STT engine is loaded.
33    pub stt_loaded: bool,
34    /// Whether a TTS engine is loaded.
35    pub tts_loaded: bool,
36    /// ID of the currently loaded STT model, if any.
37    pub stt_model_id: Option<String>,
38    /// Currently selected TTS voice, if loaded.
39    pub tts_voice: Option<String>,
40    /// Whether LLM responses are spoken automatically.
41    pub auto_speak: bool,
42}
43
44/// Information about a single STT model.
45#[derive(Debug, Clone, Serialize, Deserialize)]
46#[serde(rename_all = "camelCase")]
47pub struct SttModelInfoDto {
48    /// Model identifier (e.g. `"base.en"`).
49    pub id: String,
50    /// Human-readable name.
51    pub name: String,
52    /// Download size in bytes.
53    pub size_bytes: u64,
54    /// Human-readable size string.
55    pub size_display: String,
56    /// Whether this model is English-only.
57    pub english_only: bool,
58    /// Quality rating (1–5).
59    pub quality: u8,
60    /// Relative speed rating (1 = fastest).
61    pub speed: u8,
62    /// Whether this is the recommended default model.
63    pub is_default: bool,
64    /// Whether the model archive is already present on disk.
65    pub is_downloaded: bool,
66}
67
68/// Information about the TTS model bundle.
69#[derive(Debug, Clone, Serialize, Deserialize)]
70#[serde(rename_all = "camelCase")]
71pub struct TtsModelInfoDto {
72    /// Model identifier.
73    pub id: String,
74    /// Human-readable name.
75    pub name: String,
76    /// Download size in bytes.
77    pub size_bytes: u64,
78    /// Human-readable size string.
79    pub size_display: String,
80    /// Number of available voices in this bundle.
81    pub voice_count: u32,
82    /// Whether the model archive is already present on disk.
83    pub is_downloaded: bool,
84}
85
86/// Information about a single TTS voice.
87#[derive(Debug, Clone, Serialize, Deserialize)]
88#[serde(rename_all = "camelCase")]
89pub struct VoiceInfoDto {
90    /// Voice identifier used in API calls.
91    pub id: String,
92    /// Human-readable display name.
93    pub name: String,
94    /// Language/accent category.
95    pub category: String,
96}
97
98/// Aggregated voice model catalog: STT list, TTS bundle, and VAD status.
99#[derive(Debug, Clone, Serialize, Deserialize)]
100#[serde(rename_all = "camelCase")]
101pub struct VoiceModelsDto {
102    /// All known STT models with download status.
103    pub stt_models: Vec<SttModelInfoDto>,
104    /// The single TTS model bundle with download status.
105    pub tts_model: TtsModelInfoDto,
106    /// Whether the Silero VAD model is downloaded.
107    pub vad_downloaded: bool,
108    /// Available TTS voices (populated when TTS model is loaded).
109    pub voices: Vec<VoiceInfoDto>,
110}
111
112/// Information about an audio input device visible to the OS.
113#[derive(Debug, Clone, Serialize, Deserialize)]
114#[serde(rename_all = "camelCase")]
115pub struct AudioDeviceDto {
116    /// Human-readable device name.
117    pub name: String,
118    /// Whether this is the system default input device.
119    pub is_default: bool,
120}
121
122// ── Error ─────────────────────────────────────────────────────────────────────
123
124/// Errors returned by `VoicePipelinePort` operations.
125///
126/// These map deterministically to `GuiError` variants, which in turn map to
127/// HTTP status codes via the existing `From<GuiError> for HttpError` impl.
128#[derive(Debug, Error)]
129pub enum VoicePortError {
130    /// The voice pipeline has not been initialised yet (no model loaded).
131    #[error("Voice pipeline not initialised — load an STT or TTS model first")]
132    NotInitialised,
133
134    /// The pipeline is already in an active streaming state.
135    #[error("Voice pipeline is already active")]
136    AlreadyActive,
137
138    /// The pipeline is initialised (models loaded) but has not been started.
139    ///
140    /// The caller should POST to `/api/voice/start` before calling audio I/O
141    /// operations.  Maps to HTTP 409 Conflict.
142    #[error("Voice pipeline is not active — call /api/voice/start first")]
143    NotActive,
144
145    /// A requested resource (model, device) was not found.
146    #[error("Not found: {0}")]
147    NotFound(String),
148
149    /// A model failed to load (model file corrupt, incompatible format, etc.).
150    #[error("Load error: {0}")]
151    LoadError(String),
152
153    /// A model download failed (network, disk, archive extraction).
154    #[error("Download error: {0}")]
155    DownloadError(String),
156
157    /// Unexpected internal error.
158    #[error("Internal voice error: {0}")]
159    Internal(String),
160
161    /// Feature not yet implemented.
162    ///
163    /// Maps to HTTP 400 Bad Request via `GuiError::ValidationFailed`.  400 is
164    /// intentionally preferred over 501 here because the error is *actionable*
165    /// — the caller should change the request (e.g. switch from VAD to PTT
166    /// mode) rather than interpret it as a transient server-side gap.
167    #[error("Not implemented: {0}")]
168    Unimplemented(String),
169}
170
171// ── Port trait ────────────────────────────────────────────────────────────────
172
173/// Port trait for voice data, configuration, and audio I/O operations.
174///
175/// Implemented by `VoiceService` in `gglib-voice`.
176/// Consumed by `VoiceOps` in `gglib-gui` and delegated to by Axum handlers.
177///
178/// # Scope
179///
180/// This trait covers all 19 voice operations:
181/// - **13 data/config** endpoints (no audio hardware required, curl-testable)
182/// - **6 audio I/O** endpoints (`start`, `stop`, `ptt-start`, `ptt-stop`,
183///   `speak`, `stop-speaking`)
184#[async_trait]
185pub trait VoicePipelinePort: Send + Sync {
186    /// Return the current pipeline status (state machine, loaded models, etc.).
187    async fn status(&self) -> Result<VoiceStatusDto, VoicePortError>;
188
189    /// Return the full model catalog with per-model download status.
190    async fn list_models(&self) -> Result<VoiceModelsDto, VoicePortError>;
191
192    /// Download an STT model archive by ID (e.g. `"base.en"`).
193    async fn download_stt_model(&self, model_id: &str) -> Result<(), VoicePortError>;
194
195    /// Download the TTS model archive.
196    async fn download_tts_model(&self) -> Result<(), VoicePortError>;
197
198    /// Download the Silero VAD model.
199    async fn download_vad_model(&self) -> Result<(), VoicePortError>;
200
201    /// Load a downloaded STT model into the pipeline by ID.
202    async fn load_stt(&self, model_id: &str) -> Result<(), VoicePortError>;
203
204    /// Load the downloaded TTS model into the pipeline.
205    async fn load_tts(&self) -> Result<(), VoicePortError>;
206
207    /// Set the interaction mode (`"ptt"` | `"vad"`).
208    async fn set_mode(&self, mode: &str) -> Result<(), VoicePortError>;
209
210    /// Set the TTS voice by ID.
211    async fn set_voice(&self, voice_id: &str) -> Result<(), VoicePortError>;
212
213    /// Set the TTS playback speed (1.0 = normal).
214    async fn set_speed(&self, speed: f32) -> Result<(), VoicePortError>;
215
216    /// Enable or disable automatic TTS for LLM responses.
217    async fn set_auto_speak(&self, enabled: bool) -> Result<(), VoicePortError>;
218
219    /// Stop audio I/O and release all model memory.
220    async fn unload(&self) -> Result<(), VoicePortError>;
221
222    /// List available audio input devices.
223    async fn list_devices(&self) -> Result<Vec<AudioDeviceDto>, VoicePortError>;
224
225    // ── Audio I/O ────────────────────────────────────────────────────────────
226
227    /// Start the voice pipeline audio I/O.
228    ///
229    /// `mode` overrides the current interaction mode for this session
230    /// (`"ptt"` | `"vad"`).  When `None`, the previously configured mode
231    /// is used.
232    ///
233    /// Returns [`VoicePortError::NotInitialised`] if no STT model is loaded.
234    /// Returns [`VoicePortError::AlreadyActive`] if the pipeline is already
235    /// running.
236    async fn start(&self, mode: Option<String>) -> Result<(), VoicePortError>;
237
238    /// Stop audio I/O, releasing mic + playback resources, but keep STT/TTS
239    /// models warm so the user can restart without a reload delay.
240    async fn stop(&self) -> Result<(), VoicePortError>;
241
242    /// Begin PTT recording (user pressed the talk button).
243    ///
244    /// Returns [`VoicePortError::NotInitialised`] if the pipeline is not
245    /// active.
246    async fn ptt_start(&self) -> Result<(), VoicePortError>;
247
248    /// End PTT recording and transcribe the captured audio.
249    ///
250    /// Returns the transcript text (empty string if no speech was detected).
251    async fn ptt_stop(&self) -> Result<String, VoicePortError>;
252
253    /// Synthesize `text` via TTS and stream the audio to the speaker.
254    ///
255    /// This is an asynchronous operation: implementations may perform
256    /// synthesis and playback work while this future is pending.  Callers
257    /// MUST NOT assume that it returns immediately after dispatch.
258    /// `VoiceEvent::SpeakingStarted` / `SpeakingFinished` are emitted via
259    /// the SSE event bus to report speaking lifecycle events.
260    async fn speak(&self, text: &str) -> Result<(), VoicePortError>;
261
262    /// Interrupt any active TTS playback immediately.
263    async fn stop_speaking(&self) -> Result<(), VoicePortError>;
264}