gglib_core/normalize/parsers/
qwen_xml.rs1use serde_json::Value;
25
26use super::super::error::NormalizationError;
27use super::super::parser::{ParserOutput, ToolCallParser};
28use crate::domain::agent::ToolCall;
29
/// Marker that starts a tool-call region in the streamed output.
const OPEN: &str = "<tool_call>";
/// Marker that ends a tool-call region in the streamed output.
const CLOSE: &str = "</tool_call>";
34
/// Scanner state tracked independently for one output channel.
#[derive(Default, Debug)]
struct ChannelState {
    /// Input that has been received but not yet classified. After a scan it
    /// holds at most a trailing fragment that could still grow into a marker.
    pending: String,
    /// `true` once an opening marker has been consumed and its closing marker
    /// has not been seen yet.
    inside: bool,
    /// Tool-call body collected since the opening marker.
    body: String,
}
46
/// Identifies which of the two streamed channels a chunk belongs to.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum Channel {
    /// Chunks pushed as regular text; surviving bytes go to `forward_text`.
    Text,
    /// Chunks pushed as reasoning; surviving bytes go to `forward_reasoning`.
    Reasoning,
}
53
54#[derive(Default, Debug)]
56pub struct QwenXmlParser {
57 text: ChannelState,
58 reasoning: ChannelState,
59 next_id: u32,
62}
63
64impl QwenXmlParser {
65 #[must_use]
67 pub fn new() -> Self {
68 Self::default()
69 }
70
71 fn mint_id(&mut self) -> String {
73 let n = self.next_id;
74 self.next_id = self.next_id.saturating_add(1);
75 format!("call_qwen_{n}")
76 }
77
78 fn scan(&mut self, channel: Channel, chunk: &str) -> ParserOutput {
84 let mut out = ParserOutput::default();
85
86 let mut state = match channel {
90 Channel::Text => std::mem::take(&mut self.text),
91 Channel::Reasoning => std::mem::take(&mut self.reasoning),
92 };
93
94 state.pending.push_str(chunk);
95
96 loop {
97 if state.inside {
98 if let Some(p) = state.pending.find(CLOSE) {
99 state.body.push_str(&state.pending[..p]);
100 finalize_tool_call(&state.body, &mut out, || self.mint_id());
101 state.body.clear();
102 state.inside = false;
103 state.pending.drain(..p + CLOSE.len());
104 continue;
105 }
106 let keep = partial_suffix_len(state.pending.as_bytes(), CLOSE.as_bytes());
107 let flush_to = state.pending.len() - keep;
108 state.body.push_str(&state.pending[..flush_to]);
109 state.pending.drain(..flush_to);
110 break;
111 }
112
113 if let Some(p) = state.pending.find(OPEN) {
115 forward(&mut out, channel, &state.pending[..p]);
116 state.pending.drain(..p + OPEN.len());
117 state.inside = true;
118 continue;
119 }
120 let keep = partial_suffix_len(state.pending.as_bytes(), OPEN.as_bytes());
121 let flush_to = state.pending.len() - keep;
122 forward(&mut out, channel, &state.pending[..flush_to]);
123 state.pending.drain(..flush_to);
124 break;
125 }
126
127 match channel {
128 Channel::Text => self.text = state,
129 Channel::Reasoning => self.reasoning = state,
130 }
131 out
132 }
133
134 fn flush_channel(&mut self, channel: Channel) -> ParserOutput {
136 let mut out = ParserOutput::default();
137 let state = match channel {
138 Channel::Text => std::mem::take(&mut self.text),
139 Channel::Reasoning => std::mem::take(&mut self.reasoning),
140 };
141 if state.inside {
142 let mut partial = state.body;
146 partial.push_str(&state.pending);
147 out.errors
148 .push(NormalizationError::unclosed_tool_call(partial));
149 } else {
150 forward(&mut out, channel, &state.pending);
152 }
153 out
154 }
155}
156
157impl ToolCallParser for QwenXmlParser {
158 fn push_text(&mut self, chunk: &str) -> ParserOutput {
159 self.scan(Channel::Text, chunk)
160 }
161
162 fn push_reasoning(&mut self, chunk: &str) -> ParserOutput {
163 self.scan(Channel::Reasoning, chunk)
164 }
165
166 fn finish(&mut self) -> ParserOutput {
167 let mut a = self.flush_channel(Channel::Text);
168 let b = self.flush_channel(Channel::Reasoning);
169 a.forward_text.push_str(&b.forward_text);
170 a.forward_reasoning.push_str(&b.forward_reasoning);
171 a.tool_calls.extend(b.tool_calls);
172 a.errors.extend(b.errors);
173 a
174 }
175}
176
177fn forward(out: &mut ParserOutput, channel: Channel, bytes: &str) {
183 if bytes.is_empty() {
184 return;
185 }
186 match channel {
187 Channel::Text => out.forward_text.push_str(bytes),
188 Channel::Reasoning => out.forward_reasoning.push_str(bytes),
189 }
190}
191
192fn finalize_tool_call(body: &str, out: &mut ParserOutput, mut mint_id: impl FnMut() -> String) {
204 let trimmed = body.trim();
205 if let Some(call) = parse_json_body(trimmed, &mut mint_id) {
206 out.tool_calls.push(call);
207 return;
208 }
209 if let Some(call) = parse_function_xml_body(trimmed, &mut mint_id) {
210 out.tool_calls.push(call);
211 return;
212 }
213 out.errors
214 .push(NormalizationError::malformed_tool_call(body.to_owned()));
215}
216
217fn parse_json_body(body: &str, mint_id: &mut impl FnMut() -> String) -> Option<ToolCall> {
219 let parsed: Value = serde_json::from_str(body).ok()?;
220 let obj = parsed.as_object()?;
221 let name = obj.get("name").and_then(Value::as_str)?.to_owned();
222 let arguments = obj
223 .get("arguments")
224 .cloned()
225 .unwrap_or_else(|| Value::Object(serde_json::Map::new()));
226 Some(ToolCall {
227 id: mint_id(),
228 name,
229 arguments,
230 })
231}
232
233fn parse_function_xml_body(body: &str, mint_id: &mut impl FnMut() -> String) -> Option<ToolCall> {
241 let body = body.trim();
242 let after_open = body.strip_prefix("<function=")?;
243 let name_end = after_open.find('>')?;
244 let name = after_open[..name_end].trim();
245 if name.is_empty() {
246 return None;
247 }
248 let inner = &after_open[name_end + 1..];
249 let inner = inner.strip_suffix("</function>")?.trim();
250
251 let mut args = serde_json::Map::new();
252 let mut cursor = inner;
253 while !cursor.is_empty() {
254 cursor = cursor.trim_start();
255 if cursor.is_empty() {
256 break;
257 }
258 let after_param = cursor.strip_prefix("<parameter=")?;
259 let key_end = after_param.find('>')?;
260 let key = after_param[..key_end].trim().to_owned();
261 if key.is_empty() {
262 return None;
263 }
264 let rest = &after_param[key_end + 1..];
265 let close_at = rest.find("</parameter>")?;
266 let raw_value = rest[..close_at].trim();
267 let value = parse_param_value(raw_value);
268 args.insert(key, value);
269 cursor = &rest[close_at + "</parameter>".len()..];
270 }
271
272 Some(ToolCall {
273 id: mint_id(),
274 name: name.to_owned(),
275 arguments: Value::Object(args),
276 })
277}
278
279fn parse_param_value(raw: &str) -> Value {
282 if raw.is_empty() {
283 return Value::String(String::new());
284 }
285 if let Ok(v) = serde_json::from_str::<Value>(raw) {
286 return v;
287 }
288 Value::String(raw.to_owned())
289}
290
/// Returns the length of the longest proper prefix of `marker` that `buf`
/// ends with, or `0` if there is none.
///
/// Only proper prefixes are considered (at most `marker.len() - 1` bytes), so
/// a buffer ending in the complete marker is not reported here. An empty or
/// one-byte marker has no non-empty proper prefix and always yields `0`.
fn partial_suffix_len(buf: &[u8], marker: &[u8]) -> usize {
    // `saturating_sub` keeps an empty marker from underflowing.
    let longest = buf.len().min(marker.len().saturating_sub(1));
    // Try the longest candidate first so the first hit is the answer.
    (1..=longest)
        .rev()
        .find(|&n| buf.ends_with(&marker[..n]))
        .unwrap_or(0)
}
306
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Appends every field of `part` onto `total`.
    fn absorb(total: &mut ParserOutput, part: ParserOutput) {
        total.forward_text.push_str(&part.forward_text);
        total.forward_reasoning.push_str(&part.forward_reasoning);
        total.tool_calls.extend(part.tool_calls);
        total.errors.extend(part.errors);
    }

    /// Pushes `chunks` through the text channel, finishes the parser, and
    /// returns everything it produced along the way.
    fn collect(p: &mut QwenXmlParser, chunks: &[&str]) -> ParserOutput {
        let mut total = ParserOutput::default();
        for c in chunks {
            absorb(&mut total, p.push_text(c));
        }
        absorb(&mut total, p.finish());
        total
    }

    #[test]
    fn passthrough_with_no_markup() {
        let mut p = QwenXmlParser::new();
        let out = collect(&mut p, &["hello ", "world"]);
        assert_eq!(out.forward_text, "hello world");
        assert!(out.tool_calls.is_empty());
        assert!(out.errors.is_empty());
    }

    #[test]
    fn extracts_simple_tool_call_from_text() {
        let mut p = QwenXmlParser::new();
        let out = collect(
            &mut p,
            &[r#"before<tool_call>{"name":"foo","arguments":{"x":1}}</tool_call>after"#],
        );
        assert_eq!(out.forward_text, "beforeafter");
        assert_eq!(out.tool_calls.len(), 1);
        assert_eq!(out.tool_calls[0].id, "call_qwen_0");
        assert_eq!(out.tool_calls[0].name, "foo");
        assert_eq!(out.tool_calls[0].arguments, json!({"x": 1}));
        assert!(out.errors.is_empty());
    }

    #[test]
    fn open_tag_straddles_chunk_boundary() {
        let mut p = QwenXmlParser::new();
        let out = collect(
            &mut p,
            &[
                "before<tool",
                "_call>",
                r#"{"name":"foo","arguments":{}}"#,
                "</tool_call>",
                "after",
            ],
        );
        assert_eq!(out.forward_text, "beforeafter");
        assert_eq!(out.tool_calls.len(), 1);
        assert_eq!(out.tool_calls[0].name, "foo");
    }

    #[test]
    fn close_tag_straddles_chunk_boundary() {
        let mut p = QwenXmlParser::new();
        let out = collect(
            &mut p,
            &[
                "<tool_call>",
                r#"{"name":"foo","arguments":{}}</tool"#,
                "_call>tail",
            ],
        );
        assert_eq!(out.forward_text, "tail");
        assert_eq!(out.tool_calls.len(), 1);
        assert_eq!(out.tool_calls[0].name, "foo");
    }

    #[test]
    fn one_byte_at_a_time_still_works() {
        let mut p = QwenXmlParser::new();
        let s = r#"x<tool_call>{"name":"f","arguments":{"a":2}}</tool_call>y"#;
        let chunks: Vec<String> = s.chars().map(|c| c.to_string()).collect();
        let refs: Vec<&str> = chunks.iter().map(String::as_str).collect();
        let out = collect(&mut p, &refs);
        assert_eq!(out.forward_text, "xy");
        assert_eq!(out.tool_calls.len(), 1);
        assert_eq!(out.tool_calls[0].arguments, json!({"a": 2}));
    }

    #[test]
    fn tool_call_in_reasoning_channel_is_extracted() {
        let mut p = QwenXmlParser::new();
        let chunk = r#"thinking <tool_call>{"name":"foo","arguments":{}}</tool_call> done"#;
        let out = p.push_reasoning(chunk);
        let f = p.finish();
        // Only the tagged region is removed: the space before the opening
        // marker and the space after the closing marker are both forwarded.
        assert_eq!(out.forward_reasoning, "thinking  done");
        assert_eq!(out.tool_calls.len(), 1);
        assert_eq!(out.tool_calls[0].name, "foo");
        assert!(f.is_empty());
    }

    #[test]
    fn malformed_json_emits_error() {
        let mut p = QwenXmlParser::new();
        let out = collect(&mut p, &["<tool_call>not json</tool_call>"]);
        assert!(out.tool_calls.is_empty());
        assert_eq!(out.errors.len(), 1);
        assert!(matches!(
            out.errors[0].kind,
            crate::normalize::error::NormalizationErrorKind::MalformedToolCallJson { .. }
        ));
    }

    #[test]
    fn missing_name_field_is_malformed() {
        let mut p = QwenXmlParser::new();
        let out = collect(&mut p, &[r#"<tool_call>{"arguments":{}}</tool_call>"#]);
        assert!(out.tool_calls.is_empty());
        assert_eq!(out.errors.len(), 1);
    }

    #[test]
    fn missing_arguments_defaults_to_empty_object() {
        let mut p = QwenXmlParser::new();
        let out = collect(&mut p, &[r#"<tool_call>{"name":"foo"}</tool_call>"#]);
        assert_eq!(out.tool_calls.len(), 1);
        assert_eq!(out.tool_calls[0].arguments, json!({}));
        assert!(out.errors.is_empty());
    }

    #[test]
    fn unclosed_tag_at_end_yields_error() {
        let mut p = QwenXmlParser::new();
        let _ = p.push_text(r#"hello <tool_call>{"name":"foo""#);
        let f = p.finish();
        assert_eq!(f.errors.len(), 1);
        assert!(matches!(
            f.errors[0].kind,
            crate::normalize::error::NormalizationErrorKind::UnclosedToolCallTag { .. }
        ));
        assert!(f.tool_calls.is_empty());
    }

    #[test]
    fn multiple_tool_calls_get_distinct_ids() {
        let mut p = QwenXmlParser::new();
        let out = collect(
            &mut p,
            &[
                r#"<tool_call>{"name":"a","arguments":{}}</tool_call>"#,
                r#"<tool_call>{"name":"b","arguments":{}}</tool_call>"#,
            ],
        );
        assert_eq!(out.tool_calls.len(), 2);
        assert_eq!(out.tool_calls[0].id, "call_qwen_0");
        assert_eq!(out.tool_calls[1].id, "call_qwen_1");
    }

    #[test]
    fn partial_marker_lookalike_is_eventually_flushed() {
        let mut p = QwenXmlParser::new();
        // "<tool" could still become "<tool_call>", so it is held back until
        // `finish` settles that it never did.
        let mid = p.push_text("<tool");
        assert_eq!(mid.forward_text, "");
        let f = p.finish();
        assert_eq!(f.forward_text, "<tool");
    }

    #[test]
    fn partial_suffix_len_finds_longest_overlap() {
        assert_eq!(partial_suffix_len(b"abc<tool", b"<tool_call>"), 5);
        assert_eq!(partial_suffix_len(b"abc<", b"<tool_call>"), 1);
        assert_eq!(partial_suffix_len(b"abc", b"<tool_call>"), 0);
        // A complete marker is not a *partial* suffix.
        assert_eq!(partial_suffix_len(b"<tool_call>", b"<tool_call>"), 0);
        // Only the trailing "<" overlaps with the opening marker.
        assert_eq!(partial_suffix_len(b"</tool_call><", b"<tool_call>"), 1);
    }

    #[test]
    fn extracts_function_xml_body_with_string_param() {
        let mut p = QwenXmlParser::new();
        let body = "<tool_call>\n<function=grep>\n<parameter=regex>\ngglib\\s+q\n</parameter>\n</function>\n</tool_call>";
        let out = collect(&mut p, &[body]);
        assert!(out.errors.is_empty(), "errors: {:?}", out.errors);
        assert_eq!(out.tool_calls.len(), 1);
        assert_eq!(out.tool_calls[0].name, "grep");
        assert_eq!(
            out.tool_calls[0].arguments,
            json!({ "regex": "gglib\\s+q" })
        );
    }

    #[test]
    fn function_xml_body_with_multiple_params() {
        let mut p = QwenXmlParser::new();
        let body = concat!(
            "<tool_call><function=read_file>",
            "<parameter=path>src/main.rs</parameter>",
            "<parameter=start_line>1</parameter>",
            "<parameter=end_line>20</parameter>",
            "</function></tool_call>",
        );
        let out = collect(&mut p, &[body]);
        assert!(out.errors.is_empty());
        assert_eq!(out.tool_calls.len(), 1);
        assert_eq!(out.tool_calls[0].name, "read_file");
        assert_eq!(
            out.tool_calls[0].arguments,
            json!({ "path": "src/main.rs", "start_line": 1, "end_line": 20 })
        );
    }

    #[test]
    fn function_xml_body_with_json_object_param() {
        let mut p = QwenXmlParser::new();
        let body = r#"<tool_call><function=run><parameter=opts>{"a":1,"b":[2,3]}</parameter></function></tool_call>"#;
        let out = collect(&mut p, &[body]);
        assert!(out.errors.is_empty());
        assert_eq!(out.tool_calls.len(), 1);
        assert_eq!(
            out.tool_calls[0].arguments,
            json!({ "opts": { "a": 1, "b": [2, 3] } })
        );
    }

    #[test]
    fn function_xml_body_streamed_byte_by_byte() {
        let mut p = QwenXmlParser::new();
        let s = "<tool_call><function=grep><parameter=regex>x</parameter></function></tool_call>";
        let chunks: Vec<String> = s.chars().map(|c| c.to_string()).collect();
        let refs: Vec<&str> = chunks.iter().map(String::as_str).collect();
        let out = collect(&mut p, &refs);
        assert!(out.errors.is_empty());
        assert_eq!(out.tool_calls.len(), 1);
        assert_eq!(out.tool_calls[0].name, "grep");
        assert_eq!(out.tool_calls[0].arguments, json!({ "regex": "x" }));
    }

    #[test]
    fn function_xml_body_without_parameters_yields_empty_args() {
        let mut p = QwenXmlParser::new();
        let body = "<tool_call><function=ping></function></tool_call>";
        let out = collect(&mut p, &[body]);
        assert!(out.errors.is_empty());
        assert_eq!(out.tool_calls.len(), 1);
        assert_eq!(out.tool_calls[0].name, "ping");
        assert_eq!(out.tool_calls[0].arguments, json!({}));
    }
}