gglib_core/domain/inference.rs
1//! Inference configuration types.
2//!
3//! Defines shared types for configuring LLM inference parameters
4//! (temperature, `top_p`, `top_k`, `max_tokens`, `repeat_penalty`,
5//! `presence_penalty`, `min_p`).
6//!
7//! This module provides the core `InferenceConfig` type that is reused across:
8//! - Per-model defaults (`Model.inference_defaults`)
9//! - Global settings (`Settings.inference_defaults`)
10//! - Request-level overrides (flattened in `ChatProxyRequest`)
11
12use serde::{Deserialize, Serialize};
13
14/// Inference parameters for LLM sampling.
15///
16/// All fields are optional to support partial configuration and fallback chains.
17/// Intended to be shared across model defaults, global settings, and request overrides.
18///
19/// # Hierarchy Resolution
20///
21/// When making an inference request, parameters are resolved in this order:
22/// 1. Request-level override (user specified for this request)
23/// 2. Per-model defaults (stored in `Model.inference_defaults`)
24/// 3. Global settings (stored in `Settings.inference_defaults`)
25/// 4. Hardcoded fallback (e.g., temperature = 0.7)
26///
27/// # Examples
28///
29/// ```rust
30/// use gglib_core::domain::InferenceConfig;
31///
32/// // Conservative settings for code generation
33/// let code_gen = InferenceConfig {
34/// temperature: Some(0.2),
35/// top_p: Some(0.9),
36/// top_k: Some(40),
37/// max_tokens: Some(2048),
38/// repeat_penalty: Some(1.1),
39/// presence_penalty: None,
40/// min_p: None,
41/// };
42///
43/// // Creative writing settings
44/// let creative = InferenceConfig {
45/// temperature: Some(1.2),
46/// top_p: Some(0.95),
47/// ..Default::default()
48/// };
49/// ```
50#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
51#[serde(rename_all = "camelCase")]
52pub struct InferenceConfig {
53 /// Sampling temperature (0.0 - 2.0).
54 ///
55 /// Controls randomness in token selection:
56 /// - Lower values (0.1-0.5): More deterministic, focused
57 /// - Medium values (0.7-1.0): Balanced creativity
58 /// - Higher values (1.1-2.0): More random, creative
59 pub temperature: Option<f32>,
60
61 /// Nucleus sampling threshold (0.0 - 1.0).
62 ///
63 /// Considers only the top tokens whose cumulative probability exceeds this threshold.
64 /// Common values: 0.9 (default), 0.95 (more diverse)
65 pub top_p: Option<f32>,
66
67 /// Top-K sampling limit.
68 ///
69 /// Considers only the K most likely next tokens.
70 /// Common values: 40 (default), 10 (focused), 100 (diverse)
71 pub top_k: Option<i32>,
72
73 /// Maximum tokens to generate in response.
74 ///
75 /// Hard limit on response length. Does not include input tokens.
76 pub max_tokens: Option<u32>,
77
78 /// Repetition penalty (> 0.0, typically 1.0 - 1.3).
79 ///
80 /// Penalizes repeated tokens to reduce repetitive output.
81 /// - 1.0: No penalty (default)
82 /// - 1.1-1.3: Moderate penalty
83 /// - > 1.3: Strong penalty (may hurt coherence)
84 pub repeat_penalty: Option<f32>,
85
86 /// Presence penalty (0.0 - 2.0).
87 ///
88 /// Penalizes tokens that have already appeared in the output, encouraging
89 /// the model to cover new ground. Effective at preventing repetitive
90 /// reasoning loops in thinking models.
91 /// - 0.0: No penalty (default; disabled)
92 /// - 1.5: Recommended for reasoning/thinking models (e.g. `Qwen3.6`, `DeepSeek-R1`)
93 /// - > 2.0: Avoid; may degrade coherence
94 pub presence_penalty: Option<f32>,
95
96 /// Minimum-probability sampling threshold (0.0 - 1.0).
97 ///
98 /// Removes tokens whose probability is below `min_p × P(top token)`.
99 /// - 0.0: Disabled (explicit off; recommended by Qwen3.6)
100 /// - 0.05: llama.cpp built-in default when the flag is omitted
101 pub min_p: Option<f32>,
102}
103
104impl InferenceConfig {
105 /// Merge another config into this one, preferring values from `other`.
106 ///
107 /// For each field, if `other` has Some(value), use it; otherwise keep self's value.
108 /// This is useful for applying fallback chains.
109 ///
110 /// # Example
111 ///
112 /// ```rust
113 /// use gglib_core::domain::InferenceConfig;
114 ///
115 /// let mut request = InferenceConfig {
116 /// temperature: Some(0.8),
117 /// ..Default::default()
118 /// };
119 ///
120 /// let model_defaults = InferenceConfig {
121 /// temperature: Some(0.5),
122 /// top_p: Some(0.9),
123 /// ..Default::default()
124 /// };
125 ///
126 /// request.merge_with(&model_defaults);
127 /// assert_eq!(request.temperature, Some(0.8)); // Request value wins
128 /// assert_eq!(request.top_p, Some(0.9)); // Fallback to model default
129 /// ```
130 pub const fn merge_with(&mut self, other: &Self) {
131 if self.temperature.is_none() {
132 self.temperature = other.temperature;
133 }
134 if self.top_p.is_none() {
135 self.top_p = other.top_p;
136 }
137 if self.top_k.is_none() {
138 self.top_k = other.top_k;
139 }
140 if self.max_tokens.is_none() {
141 self.max_tokens = other.max_tokens;
142 }
143 if self.repeat_penalty.is_none() {
144 self.repeat_penalty = other.repeat_penalty;
145 }
146 if self.presence_penalty.is_none() {
147 self.presence_penalty = other.presence_penalty;
148 }
149 if self.min_p.is_none() {
150 self.min_p = other.min_p;
151 }
152 }
153
154 /// Create a new config with all fields set to sensible defaults.
155 ///
156 /// These are the hardcoded fallback values used when no other
157 /// defaults are configured.
158 #[must_use]
159 pub const fn with_hardcoded_defaults() -> Self {
160 Self {
161 temperature: Some(0.7),
162 top_p: Some(0.95),
163 top_k: Some(40),
164 max_tokens: Some(2048),
165 repeat_penalty: Some(1.0),
166 presence_penalty: Some(0.0),
167 min_p: Some(0.0),
168 }
169 }
170
171 /// Convert inference config to llama CLI arguments.
172 ///
173 /// Returns a vector of argument strings suitable for passing to llama-server.
174 /// Uses the same flag names as llama.cpp: `--temp`, `--top-p`, `--top-k`, `-n`, `--repeat-penalty`.
175 ///
176 /// This is the single source of truth for CLI flag conversion, used by:
177 /// - `LlamaCommandBuilder` (for CLI commands)
178 /// - GUI server startup (via `ServerConfig.extra_args`)
179 ///
180 /// # Example
181 ///
182 /// ```rust
183 /// use gglib_core::domain::InferenceConfig;
184 ///
185 /// let config = InferenceConfig {
186 /// temperature: Some(0.8),
187 /// top_p: Some(0.9),
188 /// top_k: None,
189 /// max_tokens: Some(1024),
190 /// repeat_penalty: None,
191 /// presence_penalty: None,
192 /// min_p: None,
193 /// };
194 ///
195 /// let args = config.to_cli_args();
196 /// assert_eq!(args, vec!["--temp", "0.8", "--top-p", "0.9", "-n", "1024"]);
197 /// ```
198 #[must_use]
199 pub fn to_cli_args(&self) -> Vec<String> {
200 let mut args = Vec::new();
201
202 if let Some(temp) = self.temperature {
203 args.push("--temp".to_string());
204 args.push(temp.to_string());
205 }
206 if let Some(top_p) = self.top_p {
207 args.push("--top-p".to_string());
208 args.push(top_p.to_string());
209 }
210 if let Some(top_k) = self.top_k {
211 args.push("--top-k".to_string());
212 args.push(top_k.to_string());
213 }
214 if let Some(max_tokens) = self.max_tokens {
215 args.push("-n".to_string());
216 args.push(max_tokens.to_string());
217 }
218 if let Some(repeat_penalty) = self.repeat_penalty {
219 args.push("--repeat-penalty".to_string());
220 args.push(repeat_penalty.to_string());
221 }
222 if let Some(presence_penalty) = self.presence_penalty {
223 args.push("--presence-penalty".to_string());
224 args.push(presence_penalty.to_string());
225 }
226 if let Some(min_p) = self.min_p {
227 args.push("--min-p".to_string());
228 args.push(min_p.to_string());
229 }
230
231 args
232 }
233
234 /// Return a recommended [`InferenceConfig`] profile for reasoning / thinking models.
235 ///
236 /// Applied automatically at import time when the `"reasoning"` capability tag is
237 /// detected (e.g. Qwen3.6, `DeepSeek-R1`, `QwQ`). Values follow the Qwen3.6 upstream
238 /// guidance for **thinking mode — general tasks** and are conservative enough to
239 /// work well across all thinking-capable models.
240 ///
241 /// | Parameter | Value | Rationale |
242 /// |-----------|-------|-----------|
243 /// | `temperature` | 1.0 | Recommended thinking-mode baseline |
244 /// | `top_p` | 0.95 | Broad nucleus; standard for reasoning |
245 /// | `top_k` | 20 | Tighter than the 40 fallback; suppresses low-quality tokens |
246 /// | `max_tokens` | 8192 | Safe out-of-the-box ceiling; increase for complex tasks |
247 /// | `repeat_penalty` | 1.0 | No penalty; `presence_penalty` handles anti-repetition |
248 /// | `presence_penalty` | 1.5 | Prevents repetitive reasoning loops |
249 /// | `min_p` | 0.0 | Explicitly disabled per Qwen3.6 spec |
250 ///
251 /// Users can override any parameter with `gglib model update <id> --<flag>` or
252 /// the equivalent UI control.
253 #[must_use]
254 pub const fn reasoning_profile() -> Self {
255 Self {
256 temperature: Some(1.0),
257 top_p: Some(0.95),
258 top_k: Some(20),
259 max_tokens: Some(8192),
260 repeat_penalty: Some(1.0),
261 presence_penalty: Some(1.5),
262 min_p: Some(0.0),
263 }
264 }
265}
266
#[cfg(test)]
mod tests {
    use super::*;

    /// `Default` must produce a config in which no parameter is set.
    #[test]
    fn test_default_is_all_none() {
        // Exhaustive destructuring: adding a field without updating this test fails to compile.
        let InferenceConfig {
            temperature,
            top_p,
            top_k,
            max_tokens,
            repeat_penalty,
            presence_penalty,
            min_p,
        } = InferenceConfig::default();

        assert!(temperature.is_none());
        assert!(top_p.is_none());
        assert!(top_k.is_none());
        assert!(max_tokens.is_none());
        assert!(repeat_penalty.is_none());
        assert!(presence_penalty.is_none());
        assert!(min_p.is_none());
    }

    /// Values already set on the receiver win; unset ones come from the argument.
    #[test]
    fn test_merge_with_prefers_self() {
        let fallback = InferenceConfig {
            temperature: Some(0.5),
            top_p: Some(0.9),
            top_k: Some(50),
            ..Default::default()
        };
        let mut overrides = InferenceConfig {
            temperature: Some(0.8),
            top_p: None,
            ..Default::default()
        };

        overrides.merge_with(&fallback);

        // Set on the request: kept.
        assert_eq!(overrides.temperature, Some(0.8));
        // Unset on the request: taken from the fallback.
        assert_eq!(overrides.top_p, Some(0.9));
        assert_eq!(overrides.top_k, Some(50));
        // Unset on both sides: stays unset.
        assert!(overrides.max_tokens.is_none());
    }

    /// The hardcoded fallback profile sets every field to its documented value.
    #[test]
    fn test_hardcoded_defaults() {
        let expected = InferenceConfig {
            temperature: Some(0.7),
            top_p: Some(0.95),
            top_k: Some(40),
            max_tokens: Some(2048),
            repeat_penalty: Some(1.0),
            presence_penalty: Some(0.0),
            min_p: Some(0.0),
        };
        assert_eq!(InferenceConfig::with_hardcoded_defaults(), expected);
    }

    /// The reasoning profile sets every field to its documented value.
    #[test]
    fn test_reasoning_profile() {
        let expected = InferenceConfig {
            temperature: Some(1.0),
            top_p: Some(0.95),
            top_k: Some(20),
            max_tokens: Some(8192),
            repeat_penalty: Some(1.0),
            presence_penalty: Some(1.5),
            min_p: Some(0.0),
        };
        assert_eq!(InferenceConfig::reasoning_profile(), expected);
    }

    /// A config survives a JSON round trip unchanged.
    #[test]
    fn test_serialization() {
        let original = InferenceConfig {
            temperature: Some(0.7),
            top_p: Some(0.9),
            top_k: None,
            max_tokens: Some(1024),
            repeat_penalty: None,
            presence_penalty: None,
            min_p: None,
        };

        let encoded = serde_json::to_string(&original).unwrap();
        let round_tripped: InferenceConfig = serde_json::from_str(&encoded).unwrap();

        assert_eq!(original, round_tripped);
    }
}