gglib_core/ports/process_runner.rs
1//! Process runner trait definition.
2//!
3//! This port defines the interface for managing model server processes.
4//! Implementations handle all process lifecycle details internally.
5
6use async_trait::async_trait;
7use serde::{Deserialize, Serialize};
8use std::path::PathBuf;
9
10use super::ProcessError;
11use crate::domain::InferenceConfig;
12
13/// Configuration for starting a model server.
14///
15/// This is an intent-based configuration — it expresses what the caller
16/// wants, not how the server should be started. All typed fields are
17/// handled by `build_and_spawn()`; `extra_args` is an escape hatch for
18/// flags not yet promoted to first-class fields.
19#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct ServerConfig {
21 /// Database ID of the model to serve.
22 pub model_id: i64,
23 /// Human-readable model name.
24 pub model_name: String,
25 /// Path to the model file.
26 pub model_path: PathBuf,
27 /// Port to listen on (if None, a free port will be assigned).
28 pub port: Option<u16>,
29 /// Base port for allocation when port is None.
30 pub base_port: u16,
31 /// Context size to use (if None, use model default).
32 pub context_size: Option<u64>,
33 /// Number of GPU layers to offload (if None, use default).
34 pub gpu_layers: Option<i32>,
35 /// Enable Jinja templating for chat formats.
36 pub jinja: bool,
37 /// Reasoning format override (e.g., `"deepseek"`, `"none"`).
38 pub reasoning_format: Option<String>,
39 /// Number of MTP draft tokens to speculate ahead (`--spec-draft-n-max`).
40 ///
41 /// `None` means MTP speculative decoding is disabled. When `Some(n)`,
42 /// `--spec-type draft-mtp` and `--spec-draft-n-max n` are passed to
43 /// llama-server. Recommended value: `2` (Unsloth default).
44 pub spec_draft_n_max: Option<u32>,
45 /// Minimum acceptance probability for MTP draft tokens (`--spec-draft-p-min`).
46 ///
47 /// Only meaningful when `spec_draft_n_max` is `Some`. Skipping low-confidence
48 /// draft tokens is especially important on Apple Silicon (Metal) to avoid
49 /// throughput regression. Recommended value: `0.75`.
50 pub spec_draft_p_min: Option<f32>,
51 /// Inference sampling parameters (temperature, `top_p`, etc.).
52 pub inference_config: Option<InferenceConfig>,
53 /// Additional server-specific options (escape hatch).
54 pub extra_args: Vec<String>,
55}
56
57impl ServerConfig {
58 /// Create a new server configuration with required fields.
59 #[must_use]
60 pub const fn new(
61 model_id: i64,
62 model_name: String,
63 model_path: PathBuf,
64 base_port: u16,
65 ) -> Self {
66 Self {
67 model_id,
68 model_name,
69 model_path,
70 port: None,
71 base_port,
72 context_size: None,
73 gpu_layers: None,
74 jinja: false,
75 reasoning_format: None,
76 spec_draft_n_max: None,
77 spec_draft_p_min: None,
78 inference_config: None,
79 extra_args: Vec::new(),
80 }
81 }
82
83 /// Set the port to listen on.
84 #[must_use]
85 pub const fn with_port(mut self, port: u16) -> Self {
86 self.port = Some(port);
87 self
88 }
89
90 /// Set the context size.
91 #[must_use]
92 pub const fn with_context_size(mut self, size: u64) -> Self {
93 self.context_size = Some(size);
94 self
95 }
96
97 /// Set the number of GPU layers.
98 #[must_use]
99 pub const fn with_gpu_layers(mut self, layers: i32) -> Self {
100 self.gpu_layers = Some(layers);
101 self
102 }
103
104 /// Enable Jinja templating.
105 #[must_use]
106 pub const fn with_jinja(mut self) -> Self {
107 self.jinja = true;
108 self
109 }
110
111 /// Set the reasoning format (e.g., `"deepseek"`, `"none"`).
112 #[must_use]
113 pub fn with_reasoning_format(mut self, format: String) -> Self {
114 self.reasoning_format = Some(format);
115 self
116 }
117
118 /// Enable MTP speculative decoding with the given draft token count.
119 ///
120 /// This causes `--spec-type draft-mtp` and `--spec-draft-n-max n` to be
121 /// passed to llama-server. Call [`Self::with_spec_draft_p_min`] to also
122 /// set the acceptance probability threshold (defaults to 0.75).
123 #[must_use]
124 pub const fn with_spec_draft_n_max(mut self, n: u32) -> Self {
125 self.spec_draft_n_max = Some(n);
126 self
127 }
128
129 /// Set the minimum acceptance probability for MTP draft tokens.
130 ///
131 /// Has no effect unless `spec_draft_n_max` is also set. Recommended
132 /// value is `0.75`; lower values trade quality for speed.
133 #[must_use]
134 pub const fn with_spec_draft_p_min(mut self, p: f32) -> Self {
135 self.spec_draft_p_min = Some(p);
136 self
137 }
138
139 /// Set inference sampling parameters.
140 #[must_use]
141 pub const fn with_inference_config(mut self, config: InferenceConfig) -> Self {
142 self.inference_config = Some(config);
143 self
144 }
145
146 /// Add extra arguments to pass to the server.
147 #[must_use]
148 pub fn with_extra_args(mut self, args: Vec<String>) -> Self {
149 self.extra_args = args;
150 self
151 }
152}
153
154/// Handle to a running server process.
155///
156/// This is an opaque handle that implementations use to track processes.
157/// It contains enough information to identify and manage the process.
158#[derive(Debug, Clone, Serialize, Deserialize)]
159pub struct ProcessHandle {
160 /// Database ID of the model being served.
161 pub model_id: i64,
162 /// Human-readable model name.
163 pub model_name: String,
164 /// Process ID (if running on local system).
165 pub pid: Option<u32>,
166 /// Port the server is listening on.
167 pub port: u16,
168 /// Unix timestamp (seconds) when the server was started.
169 pub started_at: u64,
170}
171
172impl ProcessHandle {
173 /// Create a new process handle.
174 #[must_use]
175 pub const fn new(
176 model_id: i64,
177 model_name: String,
178 pid: Option<u32>,
179 port: u16,
180 started_at: u64,
181 ) -> Self {
182 Self {
183 model_id,
184 model_name,
185 pid,
186 port,
187 started_at,
188 }
189 }
190}
191
192/// Health status of a running server.
193#[derive(Debug, Clone, Serialize, Deserialize)]
194pub struct ServerHealth {
195 /// Whether the server is responding to health checks.
196 pub healthy: bool,
197 /// Unix timestamp (seconds) of the last successful health check.
198 pub last_check: Option<u64>,
199 /// Context size being used by the server.
200 pub context_size: Option<u64>,
201 /// Optional status message.
202 pub message: Option<String>,
203}
204
205impl ServerHealth {
206 /// Get the current Unix timestamp in seconds.
207 fn now_secs() -> u64 {
208 std::time::SystemTime::now()
209 .duration_since(std::time::UNIX_EPOCH)
210 .unwrap()
211 .as_secs()
212 }
213
214 /// Create a healthy server status.
215 #[must_use]
216 pub fn healthy() -> Self {
217 Self {
218 healthy: true,
219 last_check: Some(Self::now_secs()),
220 context_size: None,
221 message: None,
222 }
223 }
224
225 /// Create an unhealthy server status with a message.
226 pub fn unhealthy(message: impl Into<String>) -> Self {
227 Self {
228 healthy: false,
229 last_check: Some(Self::now_secs()),
230 context_size: None,
231 message: Some(message.into()),
232 }
233 }
234
235 /// Set the context size.
236 #[must_use]
237 pub const fn with_context_size(mut self, size: u64) -> Self {
238 self.context_size = Some(size);
239 self
240 }
241}
242
243/// Process runner for managing model server processes.
244///
245/// This trait abstracts process management for testability and
246/// potential alternative backends (local, remote, containerized).
247///
248/// # Design Rules
249///
250/// - Express **intent**, not implementation detail
251/// - No CLI/Tauri/Axum concerns in signatures
252/// - Must support: mock runner, remote runner, alternative inference backends
253#[async_trait]
254pub trait ProcessRunner: Send + Sync {
255 /// Start a model server with the given configuration.
256 ///
257 /// Returns a handle that can be used to manage the process.
258 async fn start(&self, config: ServerConfig) -> Result<ProcessHandle, ProcessError>;
259
260 /// Stop a running server.
261 ///
262 /// Returns `Err(ProcessError::NotRunning)` if the process isn't running.
263 async fn stop(&self, handle: &ProcessHandle) -> Result<(), ProcessError>;
264
265 /// Check if a server is still running.
266 async fn is_running(&self, handle: &ProcessHandle) -> bool;
267
268 /// Get the health status of a running server.
269 ///
270 /// Returns `Err(ProcessError::NotRunning)` if the process isn't running.
271 async fn health(&self, handle: &ProcessHandle) -> Result<ServerHealth, ProcessError>;
272
273 /// List all currently running server processes.
274 ///
275 /// This is needed for snapshot behavior (e.g., `server:snapshot` events).
276 async fn list_running(&self) -> Result<Vec<ProcessHandle>, ProcessError>;
277}