gglib_core/ports/
process_runner.rs

1//! Process runner trait definition.
2//!
3//! This port defines the interface for managing model server processes.
4//! Implementations handle all process lifecycle details internally.
5
6use async_trait::async_trait;
7use serde::{Deserialize, Serialize};
8use std::path::PathBuf;
9
10use super::ProcessError;
11use crate::domain::InferenceConfig;
12
/// Configuration for starting a model server.
///
/// This is an intent-based configuration — it expresses what the caller
/// wants, not how the server should be started. All typed fields are
/// handled by `build_and_spawn()`; `extra_args` is an escape hatch for
/// flags not yet promoted to first-class fields.
///
/// Build one with [`ServerConfig::new`], which takes the four required
/// values and leaves every optional field unset (`None`, `false`, or an
/// empty list), then refine it with the chainable `with_*` methods.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ServerConfig {
    /// Database ID of the model to serve.
    pub model_id: i64,
    /// Human-readable model name.
    pub model_name: String,
    /// Path to the model file.
    pub model_path: PathBuf,
    /// Port to listen on (if None, a free port will be assigned).
    pub port: Option<u16>,
    /// Base port for allocation when port is None.
    pub base_port: u16,
    /// Context size to use (if None, use model default).
    pub context_size: Option<u64>,
    /// Number of GPU layers to offload (if None, use default).
    pub gpu_layers: Option<i32>,
    /// Enable Jinja templating for chat formats.
    pub jinja: bool,
    /// Reasoning format override (e.g., `"deepseek"`, `"none"`).
    ///
    /// `None` means no override is requested.
    pub reasoning_format: Option<String>,
    /// Number of MTP draft tokens to speculate ahead (`--spec-draft-n-max`).
    ///
    /// `None` means MTP speculative decoding is disabled.  When `Some(n)`,
    /// `--spec-type draft-mtp` and `--spec-draft-n-max n` are passed to
    /// llama-server.  Recommended value: `2` (Unsloth default).
    pub spec_draft_n_max: Option<u32>,
    /// Minimum acceptance probability for MTP draft tokens (`--spec-draft-p-min`).
    ///
    /// Only meaningful when `spec_draft_n_max` is `Some`.  Skipping low-confidence
    /// draft tokens is especially important on Apple Silicon (Metal) to avoid
    /// throughput regression.  Recommended value: `0.75`.
    pub spec_draft_p_min: Option<f32>,
    /// Inference sampling parameters (temperature, `top_p`, etc.).
    pub inference_config: Option<InferenceConfig>,
    /// Additional server-specific options (escape hatch).
    ///
    /// Empty by default.
    pub extra_args: Vec<String>,
}
56
57impl ServerConfig {
58    /// Create a new server configuration with required fields.
59    #[must_use]
60    pub const fn new(
61        model_id: i64,
62        model_name: String,
63        model_path: PathBuf,
64        base_port: u16,
65    ) -> Self {
66        Self {
67            model_id,
68            model_name,
69            model_path,
70            port: None,
71            base_port,
72            context_size: None,
73            gpu_layers: None,
74            jinja: false,
75            reasoning_format: None,
76            spec_draft_n_max: None,
77            spec_draft_p_min: None,
78            inference_config: None,
79            extra_args: Vec::new(),
80        }
81    }
82
83    /// Set the port to listen on.
84    #[must_use]
85    pub const fn with_port(mut self, port: u16) -> Self {
86        self.port = Some(port);
87        self
88    }
89
90    /// Set the context size.
91    #[must_use]
92    pub const fn with_context_size(mut self, size: u64) -> Self {
93        self.context_size = Some(size);
94        self
95    }
96
97    /// Set the number of GPU layers.
98    #[must_use]
99    pub const fn with_gpu_layers(mut self, layers: i32) -> Self {
100        self.gpu_layers = Some(layers);
101        self
102    }
103
104    /// Enable Jinja templating.
105    #[must_use]
106    pub const fn with_jinja(mut self) -> Self {
107        self.jinja = true;
108        self
109    }
110
111    /// Set the reasoning format (e.g., `"deepseek"`, `"none"`).
112    #[must_use]
113    pub fn with_reasoning_format(mut self, format: String) -> Self {
114        self.reasoning_format = Some(format);
115        self
116    }
117
118    /// Enable MTP speculative decoding with the given draft token count.
119    ///
120    /// This causes `--spec-type draft-mtp` and `--spec-draft-n-max n` to be
121    /// passed to llama-server.  Call [`Self::with_spec_draft_p_min`] to also
122    /// set the acceptance probability threshold (defaults to 0.75).
123    #[must_use]
124    pub const fn with_spec_draft_n_max(mut self, n: u32) -> Self {
125        self.spec_draft_n_max = Some(n);
126        self
127    }
128
129    /// Set the minimum acceptance probability for MTP draft tokens.
130    ///
131    /// Has no effect unless `spec_draft_n_max` is also set.  Recommended
132    /// value is `0.75`; lower values trade quality for speed.
133    #[must_use]
134    pub const fn with_spec_draft_p_min(mut self, p: f32) -> Self {
135        self.spec_draft_p_min = Some(p);
136        self
137    }
138
139    /// Set inference sampling parameters.
140    #[must_use]
141    pub const fn with_inference_config(mut self, config: InferenceConfig) -> Self {
142        self.inference_config = Some(config);
143        self
144    }
145
146    /// Add extra arguments to pass to the server.
147    #[must_use]
148    pub fn with_extra_args(mut self, args: Vec<String>) -> Self {
149        self.extra_args = args;
150        self
151    }
152}
153
/// Handle to a running server process.
///
/// This is an opaque handle that implementations use to track processes.
/// It contains enough information to identify and manage the process.
///
/// The handle is plain data: it can be cloned and (de)serialized, and it
/// does not itself own or control the process it describes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProcessHandle {
    /// Database ID of the model being served.
    pub model_id: i64,
    /// Human-readable model name.
    pub model_name: String,
    /// Process ID (if running on local system).
    ///
    /// `None` when no local PID is available.
    pub pid: Option<u32>,
    /// Port the server is listening on.
    pub port: u16,
    /// Unix timestamp (seconds) when the server was started.
    pub started_at: u64,
}
171
172impl ProcessHandle {
173    /// Create a new process handle.
174    #[must_use]
175    pub const fn new(
176        model_id: i64,
177        model_name: String,
178        pid: Option<u32>,
179        port: u16,
180        started_at: u64,
181    ) -> Self {
182        Self {
183            model_id,
184            model_name,
185            pid,
186            port,
187            started_at,
188        }
189    }
190}
191
/// Health status of a running server.
///
/// Values are normally created through [`ServerHealth::healthy`] or
/// [`ServerHealth::unhealthy`], optionally followed by
/// [`ServerHealth::with_context_size`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ServerHealth {
    /// Whether the server is responding to health checks.
    pub healthy: bool,
    /// Unix timestamp (seconds) of the most recent health check.
    ///
    /// Both the `healthy()` and `unhealthy()` constructors stamp this with
    /// the time the status was created, so it is not limited to successful
    /// checks.
    pub last_check: Option<u64>,
    /// Context size being used by the server.
    ///
    /// `None` unless set explicitly (e.g. via `with_context_size`).
    pub context_size: Option<u64>,
    /// Optional status message.
    ///
    /// The `unhealthy()` constructor stores its message here; `healthy()`
    /// leaves it `None`.
    pub message: Option<String>,
}
204
205impl ServerHealth {
206    /// Get the current Unix timestamp in seconds.
207    fn now_secs() -> u64 {
208        std::time::SystemTime::now()
209            .duration_since(std::time::UNIX_EPOCH)
210            .unwrap()
211            .as_secs()
212    }
213
214    /// Create a healthy server status.
215    #[must_use]
216    pub fn healthy() -> Self {
217        Self {
218            healthy: true,
219            last_check: Some(Self::now_secs()),
220            context_size: None,
221            message: None,
222        }
223    }
224
225    /// Create an unhealthy server status with a message.
226    pub fn unhealthy(message: impl Into<String>) -> Self {
227        Self {
228            healthy: false,
229            last_check: Some(Self::now_secs()),
230            context_size: None,
231            message: Some(message.into()),
232        }
233    }
234
235    /// Set the context size.
236    #[must_use]
237    pub const fn with_context_size(mut self, size: u64) -> Self {
238        self.context_size = Some(size);
239        self
240    }
241}
242
/// Process runner for managing model server processes.
///
/// This trait abstracts process management for testability and
/// potential alternative backends (local, remote, containerized).
///
/// Implementors must be `Send + Sync`, and every method takes `&self`, so
/// any bookkeeping an implementation keeps has to live behind interior
/// mutability.
///
/// # Design Rules
///
/// - Express **intent**, not implementation detail
/// - No CLI/Tauri/Axum concerns in signatures
/// - Must support: mock runner, remote runner, alternative inference backends
#[async_trait]
pub trait ProcessRunner: Send + Sync {
    /// Start a model server with the given configuration.
    ///
    /// Returns a handle that can be used to manage the process.
    ///
    /// # Errors
    ///
    /// Returns a [`ProcessError`] when the server cannot be started; which
    /// variants are used depends on the implementation.
    async fn start(&self, config: ServerConfig) -> Result<ProcessHandle, ProcessError>;

    /// Stop a running server.
    ///
    /// # Errors
    ///
    /// Returns `Err(ProcessError::NotRunning)` if the process isn't running.
    async fn stop(&self, handle: &ProcessHandle) -> Result<(), ProcessError>;

    /// Check if a server is still running.
    ///
    /// The result is a plain `bool`, so this method has no way to report a
    /// failure of the check itself.
    /// NOTE(review): implementations presumably report `false` when the
    /// state cannot be determined — confirm.
    async fn is_running(&self, handle: &ProcessHandle) -> bool;

    /// Get the health status of a running server.
    ///
    /// # Errors
    ///
    /// Returns `Err(ProcessError::NotRunning)` if the process isn't running.
    async fn health(&self, handle: &ProcessHandle) -> Result<ServerHealth, ProcessError>;

    /// List all currently running server processes.
    ///
    /// This is needed for snapshot behavior (e.g., `server:snapshot` events).
    ///
    /// # Errors
    ///
    /// Returns a [`ProcessError`] when the set of running processes cannot
    /// be determined; which variants are used depends on the implementation.
    async fn list_running(&self) -> Result<Vec<ProcessHandle>, ProcessError>;
}