gglib_core/ports/voice.rs
//! Voice pipeline port — trait abstraction for voice data, config, and audio I/O operations.
//!
//! # Design Rules
//!
//! - DTOs here are transport-agnostic wire shapes (no `gglib-voice` types).
//! - Conversion from `gglib-voice` native types happens inside `gglib-voice`,
//!   never here. This keeps `gglib-core` free of any dependency on `gglib-voice`.
//! - `VoicePipelinePort` is the only surface `gglib-gui` and `gglib-axum`
//!   need in order to serve all 19 voice endpoints (13 data/config plus
//!   6 audio I/O).
10
11use async_trait::async_trait;
12use serde::{Deserialize, Serialize};
13use thiserror::Error;
14
15// ── DTOs ─────────────────────────────────────────────────────────────────────
16
17/// Current state of the voice pipeline.
18// Wire-shape DTO: the four bools represent distinct pipeline state flags
19// (is_active, stt_loaded, tts_loaded, auto_speak) that have clear, independent
20// meanings. There is no sensible state-machine or enum grouping that would
21// improve clarity for callers reading the JSON payload.
22#[allow(clippy::struct_excessive_bools)]
23#[derive(Debug, Clone, Serialize, Deserialize)]
24#[serde(rename_all = "camelCase")]
25pub struct VoiceStatusDto {
26 /// Whether the pipeline is actively capturing/processing audio.
27 pub is_active: bool,
28 /// State machine label (e.g. `"idle"`, `"listening"`, `"recording"`).
29 pub state: String,
30 /// Interaction mode label (`"ptt"` or `"vad"`).
31 pub mode: String,
32 /// Whether an STT engine is loaded.
33 pub stt_loaded: bool,
34 /// Whether a TTS engine is loaded.
35 pub tts_loaded: bool,
36 /// ID of the currently loaded STT model, if any.
37 pub stt_model_id: Option<String>,
38 /// Currently selected TTS voice, if loaded.
39 pub tts_voice: Option<String>,
40 /// Whether LLM responses are spoken automatically.
41 pub auto_speak: bool,
42}
43
44/// Information about a single STT model.
45#[derive(Debug, Clone, Serialize, Deserialize)]
46#[serde(rename_all = "camelCase")]
47pub struct SttModelInfoDto {
48 /// Model identifier (e.g. `"base.en"`).
49 pub id: String,
50 /// Human-readable name.
51 pub name: String,
52 /// Download size in bytes.
53 pub size_bytes: u64,
54 /// Human-readable size string.
55 pub size_display: String,
56 /// Whether this model is English-only.
57 pub english_only: bool,
58 /// Quality rating (1–5).
59 pub quality: u8,
60 /// Relative speed rating (1 = fastest).
61 pub speed: u8,
62 /// Whether this is the recommended default model.
63 pub is_default: bool,
64 /// Whether the model archive is already present on disk.
65 pub is_downloaded: bool,
66}
67
68/// Information about the TTS model bundle.
69#[derive(Debug, Clone, Serialize, Deserialize)]
70#[serde(rename_all = "camelCase")]
71pub struct TtsModelInfoDto {
72 /// Model identifier.
73 pub id: String,
74 /// Human-readable name.
75 pub name: String,
76 /// Download size in bytes.
77 pub size_bytes: u64,
78 /// Human-readable size string.
79 pub size_display: String,
80 /// Number of available voices in this bundle.
81 pub voice_count: u32,
82 /// Whether the model archive is already present on disk.
83 pub is_downloaded: bool,
84}
85
86/// Information about a single TTS voice.
87#[derive(Debug, Clone, Serialize, Deserialize)]
88#[serde(rename_all = "camelCase")]
89pub struct VoiceInfoDto {
90 /// Voice identifier used in API calls.
91 pub id: String,
92 /// Human-readable display name.
93 pub name: String,
94 /// Language/accent category.
95 pub category: String,
96}
97
98/// Aggregated voice model catalog: STT list, TTS bundle, and VAD status.
99#[derive(Debug, Clone, Serialize, Deserialize)]
100#[serde(rename_all = "camelCase")]
101pub struct VoiceModelsDto {
102 /// All known STT models with download status.
103 pub stt_models: Vec<SttModelInfoDto>,
104 /// The single TTS model bundle with download status.
105 pub tts_model: TtsModelInfoDto,
106 /// Whether the Silero VAD model is downloaded.
107 pub vad_downloaded: bool,
108 /// Available TTS voices (populated when TTS model is loaded).
109 pub voices: Vec<VoiceInfoDto>,
110}
111
112/// Information about an audio input device visible to the OS.
113#[derive(Debug, Clone, Serialize, Deserialize)]
114#[serde(rename_all = "camelCase")]
115pub struct AudioDeviceDto {
116 /// Human-readable device name.
117 pub name: String,
118 /// Whether this is the system default input device.
119 pub is_default: bool,
120}
121
122// ── Error ─────────────────────────────────────────────────────────────────────
123
124/// Errors returned by `VoicePipelinePort` operations.
125///
126/// These map deterministically to `GuiError` variants, which in turn map to
127/// HTTP status codes via the existing `From<GuiError> for HttpError` impl.
128#[derive(Debug, Error)]
129pub enum VoicePortError {
130 /// The voice pipeline has not been initialised yet (no model loaded).
131 #[error("Voice pipeline not initialised — load an STT or TTS model first")]
132 NotInitialised,
133
134 /// The pipeline is already in an active streaming state.
135 #[error("Voice pipeline is already active")]
136 AlreadyActive,
137
138 /// The pipeline is initialised (models loaded) but has not been started.
139 ///
140 /// The caller should POST to `/api/voice/start` before calling audio I/O
141 /// operations. Maps to HTTP 409 Conflict.
142 #[error("Voice pipeline is not active — call /api/voice/start first")]
143 NotActive,
144
145 /// A requested resource (model, device) was not found.
146 #[error("Not found: {0}")]
147 NotFound(String),
148
149 /// A model failed to load (model file corrupt, incompatible format, etc.).
150 #[error("Load error: {0}")]
151 LoadError(String),
152
153 /// A model download failed (network, disk, archive extraction).
154 #[error("Download error: {0}")]
155 DownloadError(String),
156
157 /// Unexpected internal error.
158 #[error("Internal voice error: {0}")]
159 Internal(String),
160
161 /// Feature not yet implemented.
162 ///
163 /// Maps to HTTP 400 Bad Request via `GuiError::ValidationFailed`. 400 is
164 /// intentionally preferred over 501 here because the error is *actionable*
165 /// — the caller should change the request (e.g. switch from VAD to PTT
166 /// mode) rather than interpret it as a transient server-side gap.
167 #[error("Not implemented: {0}")]
168 Unimplemented(String),
169}
170
171// ── Port trait ────────────────────────────────────────────────────────────────
172
173/// Port trait for voice data, configuration, and audio I/O operations.
174///
175/// Implemented by `VoiceService` in `gglib-voice`.
176/// Consumed by `VoiceOps` in `gglib-gui` and delegated to by Axum handlers.
177///
178/// # Scope
179///
180/// This trait covers all 19 voice operations:
181/// - **13 data/config** endpoints (no audio hardware required, curl-testable)
182/// - **6 audio I/O** endpoints (`start`, `stop`, `ptt-start`, `ptt-stop`,
183/// `speak`, `stop-speaking`)
184#[async_trait]
185pub trait VoicePipelinePort: Send + Sync {
186 /// Return the current pipeline status (state machine, loaded models, etc.).
187 async fn status(&self) -> Result<VoiceStatusDto, VoicePortError>;
188
189 /// Return the full model catalog with per-model download status.
190 async fn list_models(&self) -> Result<VoiceModelsDto, VoicePortError>;
191
192 /// Download an STT model archive by ID (e.g. `"base.en"`).
193 async fn download_stt_model(&self, model_id: &str) -> Result<(), VoicePortError>;
194
195 /// Download the TTS model archive.
196 async fn download_tts_model(&self) -> Result<(), VoicePortError>;
197
198 /// Download the Silero VAD model.
199 async fn download_vad_model(&self) -> Result<(), VoicePortError>;
200
201 /// Load a downloaded STT model into the pipeline by ID.
202 async fn load_stt(&self, model_id: &str) -> Result<(), VoicePortError>;
203
204 /// Load the downloaded TTS model into the pipeline.
205 async fn load_tts(&self) -> Result<(), VoicePortError>;
206
207 /// Set the interaction mode (`"ptt"` | `"vad"`).
208 async fn set_mode(&self, mode: &str) -> Result<(), VoicePortError>;
209
210 /// Set the TTS voice by ID.
211 async fn set_voice(&self, voice_id: &str) -> Result<(), VoicePortError>;
212
213 /// Set the TTS playback speed (1.0 = normal).
214 async fn set_speed(&self, speed: f32) -> Result<(), VoicePortError>;
215
216 /// Enable or disable automatic TTS for LLM responses.
217 async fn set_auto_speak(&self, enabled: bool) -> Result<(), VoicePortError>;
218
219 /// Stop audio I/O and release all model memory.
220 async fn unload(&self) -> Result<(), VoicePortError>;
221
222 /// List available audio input devices.
223 async fn list_devices(&self) -> Result<Vec<AudioDeviceDto>, VoicePortError>;
224
225 // ── Audio I/O ────────────────────────────────────────────────────────────
226
227 /// Start the voice pipeline audio I/O.
228 ///
229 /// `mode` overrides the current interaction mode for this session
230 /// (`"ptt"` | `"vad"`). When `None`, the previously configured mode
231 /// is used.
232 ///
233 /// Returns [`VoicePortError::NotInitialised`] if no STT model is loaded.
234 /// Returns [`VoicePortError::AlreadyActive`] if the pipeline is already
235 /// running.
236 async fn start(&self, mode: Option<String>) -> Result<(), VoicePortError>;
237
238 /// Stop audio I/O, releasing mic + playback resources, but keep STT/TTS
239 /// models warm so the user can restart without a reload delay.
240 async fn stop(&self) -> Result<(), VoicePortError>;
241
242 /// Begin PTT recording (user pressed the talk button).
243 ///
244 /// Returns [`VoicePortError::NotInitialised`] if the pipeline is not
245 /// active.
246 async fn ptt_start(&self) -> Result<(), VoicePortError>;
247
248 /// End PTT recording and transcribe the captured audio.
249 ///
250 /// Returns the transcript text (empty string if no speech was detected).
251 async fn ptt_stop(&self) -> Result<String, VoicePortError>;
252
253 /// Synthesize `text` via TTS and stream the audio to the speaker.
254 ///
255 /// This is an asynchronous operation: implementations may perform
256 /// synthesis and playback work while this future is pending. Callers
257 /// MUST NOT assume that it returns immediately after dispatch.
258 /// `VoiceEvent::SpeakingStarted` / `SpeakingFinished` are emitted via
259 /// the SSE event bus to report speaking lifecycle events.
260 async fn speak(&self, text: &str) -> Result<(), VoicePortError>;
261
262 /// Interrupt any active TTS playback immediately.
263 async fn stop_speaking(&self) -> Result<(), VoicePortError>;
264}