// gglib_core/domain/inference.rs

//! Inference configuration types.
//!
//! Defines shared types for configuring LLM inference parameters
//! (`temperature`, `top_p`, `top_k`, `max_tokens`, `repeat_penalty`,
//! `presence_penalty`, `min_p`).
//!
//! This module provides the core `InferenceConfig` type that is reused across:
//! - Per-model defaults (`Model.inference_defaults`)
//! - Global settings (`Settings.inference_defaults`)
//! - Request-level overrides (flattened in `ChatProxyRequest`)

use serde::{Deserialize, Serialize};

14/// Inference parameters for LLM sampling.
15///
16/// All fields are optional to support partial configuration and fallback chains.
17/// Intended to be shared across model defaults, global settings, and request overrides.
18///
19/// # Hierarchy Resolution
20///
21/// When making an inference request, parameters are resolved in this order:
22/// 1. Request-level override (user specified for this request)
23/// 2. Per-model defaults (stored in `Model.inference_defaults`)
24/// 3. Global settings (stored in `Settings.inference_defaults`)
25/// 4. Hardcoded fallback (e.g., temperature = 0.7)
26///
27/// # Examples
28///
29/// ```rust
30/// use gglib_core::domain::InferenceConfig;
31///
32/// // Conservative settings for code generation
33/// let code_gen = InferenceConfig {
34///     temperature: Some(0.2),
35///     top_p: Some(0.9),
36///     top_k: Some(40),
37///     max_tokens: Some(2048),
38///     repeat_penalty: Some(1.1),
39///     presence_penalty: None,
40///     min_p: None,
41/// };
42///
43/// // Creative writing settings
44/// let creative = InferenceConfig {
45///     temperature: Some(1.2),
46///     top_p: Some(0.95),
47///     ..Default::default()
48/// };
49/// ```
50#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
51#[serde(rename_all = "camelCase")]
52pub struct InferenceConfig {
53    /// Sampling temperature (0.0 - 2.0).
54    ///
55    /// Controls randomness in token selection:
56    /// - Lower values (0.1-0.5): More deterministic, focused
57    /// - Medium values (0.7-1.0): Balanced creativity
58    /// - Higher values (1.1-2.0): More random, creative
59    pub temperature: Option<f32>,
60
61    /// Nucleus sampling threshold (0.0 - 1.0).
62    ///
63    /// Considers only the top tokens whose cumulative probability exceeds this threshold.
64    /// Common values: 0.9 (default), 0.95 (more diverse)
65    pub top_p: Option<f32>,
66
67    /// Top-K sampling limit.
68    ///
69    /// Considers only the K most likely next tokens.
70    /// Common values: 40 (default), 10 (focused), 100 (diverse)
71    pub top_k: Option<i32>,
72
73    /// Maximum tokens to generate in response.
74    ///
75    /// Hard limit on response length. Does not include input tokens.
76    pub max_tokens: Option<u32>,
77
78    /// Repetition penalty (> 0.0, typically 1.0 - 1.3).
79    ///
80    /// Penalizes repeated tokens to reduce repetitive output.
81    /// - 1.0: No penalty (default)
82    /// - 1.1-1.3: Moderate penalty
83    /// - > 1.3: Strong penalty (may hurt coherence)
84    pub repeat_penalty: Option<f32>,
85
86    /// Presence penalty (0.0 - 2.0).
87    ///
88    /// Penalizes tokens that have already appeared in the output, encouraging
89    /// the model to cover new ground. Effective at preventing repetitive
90    /// reasoning loops in thinking models.
91    /// - 0.0: No penalty (default; disabled)
92    /// - 1.5: Recommended for reasoning/thinking models (e.g. `Qwen3.6`, `DeepSeek-R1`)
93    /// - > 2.0: Avoid; may degrade coherence
94    pub presence_penalty: Option<f32>,
95
96    /// Minimum-probability sampling threshold (0.0 - 1.0).
97    ///
98    /// Removes tokens whose probability is below `min_p × P(top token)`.
99    /// - 0.0: Disabled (explicit off; recommended by Qwen3.6)
100    /// - 0.05: llama.cpp built-in default when the flag is omitted
101    pub min_p: Option<f32>,
102}
103
104impl InferenceConfig {
105    /// Merge another config into this one, preferring values from `other`.
106    ///
107    /// For each field, if `other` has Some(value), use it; otherwise keep self's value.
108    /// This is useful for applying fallback chains.
109    ///
110    /// # Example
111    ///
112    /// ```rust
113    /// use gglib_core::domain::InferenceConfig;
114    ///
115    /// let mut request = InferenceConfig {
116    ///     temperature: Some(0.8),
117    ///     ..Default::default()
118    /// };
119    ///
120    /// let model_defaults = InferenceConfig {
121    ///     temperature: Some(0.5),
122    ///     top_p: Some(0.9),
123    ///     ..Default::default()
124    /// };
125    ///
126    /// request.merge_with(&model_defaults);
127    /// assert_eq!(request.temperature, Some(0.8)); // Request value wins
128    /// assert_eq!(request.top_p, Some(0.9));      // Fallback to model default
129    /// ```
130    pub const fn merge_with(&mut self, other: &Self) {
131        if self.temperature.is_none() {
132            self.temperature = other.temperature;
133        }
134        if self.top_p.is_none() {
135            self.top_p = other.top_p;
136        }
137        if self.top_k.is_none() {
138            self.top_k = other.top_k;
139        }
140        if self.max_tokens.is_none() {
141            self.max_tokens = other.max_tokens;
142        }
143        if self.repeat_penalty.is_none() {
144            self.repeat_penalty = other.repeat_penalty;
145        }
146        if self.presence_penalty.is_none() {
147            self.presence_penalty = other.presence_penalty;
148        }
149        if self.min_p.is_none() {
150            self.min_p = other.min_p;
151        }
152    }
153
154    /// Create a new config with all fields set to sensible defaults.
155    ///
156    /// These are the hardcoded fallback values used when no other
157    /// defaults are configured.
158    #[must_use]
159    pub const fn with_hardcoded_defaults() -> Self {
160        Self {
161            temperature: Some(0.7),
162            top_p: Some(0.95),
163            top_k: Some(40),
164            max_tokens: Some(2048),
165            repeat_penalty: Some(1.0),
166            presence_penalty: Some(0.0),
167            min_p: Some(0.0),
168        }
169    }
170
171    /// Convert inference config to llama CLI arguments.
172    ///
173    /// Returns a vector of argument strings suitable for passing to llama-server.
174    /// Uses the same flag names as llama.cpp: `--temp`, `--top-p`, `--top-k`, `-n`, `--repeat-penalty`.
175    ///
176    /// This is the single source of truth for CLI flag conversion, used by:
177    /// - `LlamaCommandBuilder` (for CLI commands)
178    /// - GUI server startup (via `ServerConfig.extra_args`)
179    ///
180    /// # Example
181    ///
182    /// ```rust
183    /// use gglib_core::domain::InferenceConfig;
184    ///
185    /// let config = InferenceConfig {
186    ///     temperature: Some(0.8),
187    ///     top_p: Some(0.9),
188    ///     top_k: None,
189    ///     max_tokens: Some(1024),
190    ///     repeat_penalty: None,
191    ///     presence_penalty: None,
192    ///     min_p: None,
193    /// };
194    ///
195    /// let args = config.to_cli_args();
196    /// assert_eq!(args, vec!["--temp", "0.8", "--top-p", "0.9", "-n", "1024"]);
197    /// ```
198    #[must_use]
199    pub fn to_cli_args(&self) -> Vec<String> {
200        let mut args = Vec::new();
201
202        if let Some(temp) = self.temperature {
203            args.push("--temp".to_string());
204            args.push(temp.to_string());
205        }
206        if let Some(top_p) = self.top_p {
207            args.push("--top-p".to_string());
208            args.push(top_p.to_string());
209        }
210        if let Some(top_k) = self.top_k {
211            args.push("--top-k".to_string());
212            args.push(top_k.to_string());
213        }
214        if let Some(max_tokens) = self.max_tokens {
215            args.push("-n".to_string());
216            args.push(max_tokens.to_string());
217        }
218        if let Some(repeat_penalty) = self.repeat_penalty {
219            args.push("--repeat-penalty".to_string());
220            args.push(repeat_penalty.to_string());
221        }
222        if let Some(presence_penalty) = self.presence_penalty {
223            args.push("--presence-penalty".to_string());
224            args.push(presence_penalty.to_string());
225        }
226        if let Some(min_p) = self.min_p {
227            args.push("--min-p".to_string());
228            args.push(min_p.to_string());
229        }
230
231        args
232    }
233
234    /// Return a recommended [`InferenceConfig`] profile for reasoning / thinking models.
235    ///
236    /// Applied automatically at import time when the `"reasoning"` capability tag is
237    /// detected (e.g. Qwen3.6, `DeepSeek-R1`, `QwQ`). Values follow the Qwen3.6 upstream
238    /// guidance for **thinking mode — general tasks** and are conservative enough to
239    /// work well across all thinking-capable models.
240    ///
241    /// | Parameter | Value | Rationale |
242    /// |-----------|-------|-----------|
243    /// | `temperature` | 1.0 | Recommended thinking-mode baseline |
244    /// | `top_p` | 0.95 | Broad nucleus; standard for reasoning |
245    /// | `top_k` | 20 | Tighter than the 40 fallback; suppresses low-quality tokens |
246    /// | `max_tokens` | 8192 | Safe out-of-the-box ceiling; increase for complex tasks |
247    /// | `repeat_penalty` | 1.0 | No penalty; `presence_penalty` handles anti-repetition |
248    /// | `presence_penalty` | 1.5 | Prevents repetitive reasoning loops |
249    /// | `min_p` | 0.0 | Explicitly disabled per Qwen3.6 spec |
250    ///
251    /// Users can override any parameter with `gglib model update <id> --<flag>` or
252    /// the equivalent UI control.
253    #[must_use]
254    pub const fn reasoning_profile() -> Self {
255        Self {
256            temperature: Some(1.0),
257            top_p: Some(0.95),
258            top_k: Some(20),
259            max_tokens: Some(8192),
260            repeat_penalty: Some(1.0),
261            presence_penalty: Some(1.5),
262            min_p: Some(0.0),
263        }
264    }
265}
266
#[cfg(test)]
mod tests {
    use super::*;

    /// `Default` must leave every parameter unset so fallbacks can fill it in.
    #[test]
    fn test_default_is_all_none() {
        let config = InferenceConfig::default();
        assert!(config.temperature.is_none());
        assert!(config.top_p.is_none());
        assert!(config.top_k.is_none());
        assert!(config.max_tokens.is_none());
        assert!(config.repeat_penalty.is_none());
        assert!(config.presence_penalty.is_none());
        assert!(config.min_p.is_none());
    }

    /// Values already set on the receiver win; unset ones come from the argument.
    #[test]
    fn test_merge_with_prefers_self() {
        let mut request = InferenceConfig {
            temperature: Some(0.8),
            top_p: None,
            ..Default::default()
        };

        let model_defaults = InferenceConfig {
            temperature: Some(0.5),
            top_p: Some(0.9),
            top_k: Some(50),
            ..Default::default()
        };

        request.merge_with(&model_defaults);

        assert_eq!(request.temperature, Some(0.8)); // Request wins
        assert_eq!(request.top_p, Some(0.9)); // Fallback to model
        assert_eq!(request.top_k, Some(50)); // Fallback to model
        assert!(request.max_tokens.is_none()); // Still None
    }

    /// An empty config picks up all seven fields from the fallback.
    #[test]
    fn test_merge_with_fills_every_unset_field() {
        let mut config = InferenceConfig::default();
        config.merge_with(&InferenceConfig::with_hardcoded_defaults());
        assert_eq!(config, InferenceConfig::with_hardcoded_defaults());
    }

    /// A fully populated config is not altered by merging a fallback into it.
    #[test]
    fn test_merge_with_keeps_every_set_field() {
        let mut config = InferenceConfig::reasoning_profile();
        config.merge_with(&InferenceConfig::with_hardcoded_defaults());
        assert_eq!(config, InferenceConfig::reasoning_profile());
    }

    #[test]
    fn test_hardcoded_defaults() {
        let config = InferenceConfig::with_hardcoded_defaults();
        assert_eq!(config.temperature, Some(0.7));
        assert_eq!(config.top_p, Some(0.95));
        assert_eq!(config.top_k, Some(40));
        assert_eq!(config.max_tokens, Some(2048));
        assert_eq!(config.repeat_penalty, Some(1.0));
        assert_eq!(config.presence_penalty, Some(0.0));
        assert_eq!(config.min_p, Some(0.0));
    }

    #[test]
    fn test_reasoning_profile() {
        let profile = InferenceConfig::reasoning_profile();
        assert_eq!(profile.temperature, Some(1.0));
        assert_eq!(profile.top_p, Some(0.95));
        assert_eq!(profile.top_k, Some(20));
        assert_eq!(profile.max_tokens, Some(8192));
        assert_eq!(profile.repeat_penalty, Some(1.0));
        assert_eq!(profile.presence_penalty, Some(1.5));
        assert_eq!(profile.min_p, Some(0.0));
    }

    /// No parameters set means no CLI arguments at all.
    #[test]
    fn test_to_cli_args_default_is_empty() {
        assert!(InferenceConfig::default().to_cli_args().is_empty());
    }

    /// Unset parameters are skipped; set ones keep their relative order.
    #[test]
    fn test_to_cli_args_skips_unset_fields() {
        let config = InferenceConfig {
            temperature: Some(0.8),
            top_p: Some(0.9),
            max_tokens: Some(1024),
            ..Default::default()
        };

        assert_eq!(
            config.to_cli_args(),
            vec!["--temp", "0.8", "--top-p", "0.9", "-n", "1024"]
        );
    }

    /// Every parameter maps to its llama.cpp flag, in declaration order.
    #[test]
    fn test_to_cli_args_emits_all_flags_in_order() {
        let args = InferenceConfig::reasoning_profile().to_cli_args();

        let flags: Vec<&str> = args.iter().step_by(2).map(String::as_str).collect();
        assert_eq!(
            flags,
            vec![
                "--temp",
                "--top-p",
                "--top-k",
                "-n",
                "--repeat-penalty",
                "--presence-penalty",
                "--min-p",
            ]
        );
        assert_eq!(args.len(), 14);
    }

    #[test]
    fn test_serialization() {
        let config = InferenceConfig {
            temperature: Some(0.7),
            top_p: Some(0.9),
            top_k: None,
            max_tokens: Some(1024),
            repeat_penalty: None,
            presence_penalty: None,
            min_p: None,
        };

        let json = serde_json::to_string(&config).unwrap();
        let deserialized: InferenceConfig = serde_json::from_str(&json).unwrap();

        assert_eq!(config, deserialized);
    }

    /// The wire format uses camelCase keys, as configured by `rename_all`.
    #[test]
    fn test_serialization_uses_camel_case_keys() {
        let json = serde_json::to_string(&InferenceConfig::with_hardcoded_defaults()).unwrap();

        assert!(json.contains("\"topP\""));
        assert!(json.contains("\"maxTokens\""));
        assert!(json.contains("\"presencePenalty\""));
        assert!(!json.contains("\"top_p\""));
    }
}