Fix GLM-OCR multimodal prompt rendering (#157)

leehack · web-flow · commit ce760d66ce46 · 2026-05-20T09:10:21.000-04:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,8 @@
 ## Unreleased
 
+* **Fixes**:
+  * Fixed GLM-OCR and other multimodal chat-template workarounds so image and
+    audio content parts are preserved when tool-call normalization runs.
 * **Testing**:
   * Added `tool/testing/run_local_e2e.dart` as a discovery and orchestration
     entry point for heavyweight local-only Dart E2E, Flutter device, and
diff --git a/lib/src/core/template/template_workarounds.dart b/lib/src/core/template/template_workarounds.dart
@@ -43,29 +43,37 @@ class TemplateWorkarounds {
     List<LlamaChatMessage> messages,
     ChatFormat format,
   ) {
+    final needsFuncArgsNormalization = _formatsNeedFuncArgsNormalization
+        .contains(format);
+    final needsGenericSchema = _formatsNeedGenericSchema.contains(format);
+    final needsMoveToolCallsToContent = _formatsNeedMoveToolCallsToContent
+        .contains(format);
+
+    if (!needsFuncArgsNormalization &&
+        !needsGenericSchema &&
+        !needsMoveToolCallsToContent) {
+      return messages;
+    }
+
+    if (!_hasTypedToolCalls(messages)) {
+      return messages;
+    }
+
     final jsonMessages = messages.map((m) => m.toJson()).toList();
-    var changed = false;
 
-    if (_formatsNeedFuncArgsNormalization.contains(format)) {
+    if (needsFuncArgsNormalization) {
       normalizeToolCallArgs(jsonMessages);
-      changed = true;
     }
 
-    if (_formatsNeedGenericSchema.contains(format)) {
+    if (needsGenericSchema) {
       useGenericSchema(jsonMessages);
-      changed = true;
     }
 
-    if (_formatsNeedMoveToolCallsToContent.contains(format)) {
+    if (needsMoveToolCallsToContent) {
       moveToolCallsToContent(jsonMessages);
-      changed = true;
-    }
-
-    if (!changed) {
-      return messages;
     }
 
-    return _messagesFromJson(jsonMessages);
+    return _messagesFromJson(jsonMessages, messages);
   }
 
   /// Ensures tool call arguments are JSON objects, not strings.
@@ -172,6 +180,12 @@ class TemplateWorkarounds {
     }
   }
 
+  /// Whether any message in [messages] carries a [LlamaToolCallContent] part.
+  ///
+  /// Checked before messages are serialized with `toJson()`, so conversations
+  /// without typed tool calls are returned to the caller untouched.
+  static bool _hasTypedToolCalls(List<LlamaChatMessage> messages) {
+    return messages.any(
+      (message) => message.parts.any((part) => part is LlamaToolCallContent),
+    );
+  }
+
   static Map<String, dynamic> _argumentsToObject(Object? args) {
     final map = ToolCallParsingUtils.decodeJsonMapValue(args);
     if (map != null) {
@@ -193,11 +207,18 @@ class TemplateWorkarounds {
 
+  /// Rebuilds typed chat messages from the JSON maps in [messages].
+  ///
+  /// [originals] must be index-aligned with [messages] and at least as long:
+  /// entry `i` is handed to [_messageFromJson] as the `original` for JSON
+  /// entry `i`, and is looked up with an unchecked `originals[i]`.
   static List<LlamaChatMessage> _messagesFromJson(
     List<Map<String, dynamic>> messages,
+    List<LlamaChatMessage> originals,
   ) {
-    return messages.map(_messageFromJson).toList();
+    return [
+      for (var i = 0; i < messages.length; i++)
+        _messageFromJson(messages[i], original: originals[i]),
+    ];
   }
 
-  static LlamaChatMessage _messageFromJson(Map<String, dynamic> message) {
+  static LlamaChatMessage _messageFromJson(
+    Map<String, dynamic> message, {
+    required LlamaChatMessage original,
+  }) {
     final role = _parseRole(message['role'] as String? ?? 'user');
     final parts = <LlamaContentPart>[];
 
@@ -244,10 +265,7 @@ class TemplateWorkarounds {
         ),
       );
     } else {
-      final text = _extractTextContent(content);
-      if (text.isNotEmpty) {
-        parts.add(LlamaTextContent(text));
-      }
+      parts.addAll(_extractContentParts(content, original: original));
     }
 
     if (parts.isEmpty) {
@@ -257,21 +275,68 @@ class TemplateWorkarounds {
     return LlamaChatMessage.withContent(role: role, content: parts);
   }
 
-  static String _extractTextContent(Object? content) {
-    if (content == null) return '';
-    if (content is String) return content;
-    if (content is! List) return content.toString();
+  /// Converts a JSON `content` value back into typed content parts.
+  ///
+  /// A `null` [content] yields no parts. A string, or any other non-list
+  /// value via `toString()`, yields a single [LlamaTextContent] unless the
+  /// text is empty. For a list, each map item is dispatched on its `type`:
+  /// non-empty `text` items become [LlamaTextContent], while image and audio
+  /// items are replaced, in order of appearance, by the [LlamaImageContent]
+  /// and [LlamaAudioContent] parts taken from [original], so the original
+  /// part objects are reused instead of being rebuilt from JSON.
+  ///
+  /// List items that are not maps, or whose `type` is not handled below, are
+  /// skipped.
+  static List<LlamaContentPart> _extractContentParts(
+    Object? content, {
+    required LlamaChatMessage original,
+  }) {
+    if (content == null) return const [];
+    if (content is String) {
+      return content.isEmpty ? const [] : [LlamaTextContent(content)];
+    }
+    if (content is! List) {
+      final text = content.toString();
+      return text.isEmpty ? const [] : [LlamaTextContent(text)];
+    }
+
+    // Media parts of the original message, consumed positionally as media
+    // items are encountered in the JSON list.
+    final originalImages = original.parts
+        .whereType<LlamaImageContent>()
+        .toList();
+    final originalAudio = original.parts
+        .whereType<LlamaAudioContent>()
+        .toList();
+    var imageIndex = 0;
+    var audioIndex = 0;
+    final parts = <LlamaContentPart>[];
 
-    final buffer = StringBuffer();
     for (final item in content) {
-      if (item is Map<String, dynamic> && item['type'] == 'text') {
-        final text = item['text'];
-        if (text is String) {
-          buffer.write(text);
-        }
+      if (item is! Map<String, dynamic>) continue;
+      switch (item['type']) {
+        case 'text':
+          final text = item['text'];
+          if (text is String && text.isNotEmpty) {
+            parts.add(LlamaTextContent(text));
+          }
+          break;
+        case 'image':
+        case 'image_url':
+          if (imageIndex < originalImages.length) {
+            parts.add(originalImages[imageIndex++]);
+          } else {
+            // More JSON image items than originals: rebuild from JSON.
+            parts.add(_imageContentFromJson(item));
+          }
+          break;
+        case 'input_audio':
+        case 'audio':
+          // Audio has no JSON fallback; unmatched audio items are dropped.
+          if (audioIndex < originalAudio.length) {
+            parts.add(originalAudio[audioIndex++]);
+          }
+          break;
       }
     }
-    return buffer.toString();
+
+    return parts;
+  }
+
+  /// Builds a [LlamaImageContent] from a JSON image content item.
+  ///
+  /// Only `item['image_url']['url']` is consulted. A `file://` URL becomes a
+  /// `path` with the scheme prefix stripped (no percent-decoding is done);
+  /// any other non-empty string becomes a `url`. When `image_url` is not a
+  /// map or carries no usable `url`, an argument-less [LlamaImageContent] is
+  /// returned, i.e. an image part with no source set.
+  static LlamaImageContent _imageContentFromJson(Map<String, dynamic> item) {
+    final imageUrl = item['image_url'];
+    final url = imageUrl is Map<String, dynamic> ? imageUrl['url'] : null;
+    if (url is String && url.startsWith('file://')) {
+      return LlamaImageContent(path: url.substring('file://'.length));
+    }
+    if (url is String && url.isNotEmpty) {
+      return LlamaImageContent(url: url);
+    }
+    return const LlamaImageContent();
+  }
 
   static LlamaChatRole _parseRole(String role) {
diff --git a/test/unit/core/template/chat_template_engine_test.dart b/test/unit/core/template/chat_template_engine_test.dart
@@ -2,6 +2,7 @@ import 'dart:convert';
 
 import 'package:llamadart/src/core/models/chat/chat_message.dart';
 import 'package:llamadart/src/core/models/chat/chat_role.dart';
+import 'package:llamadart/src/core/models/chat/content_part.dart';
 import 'package:llamadart/src/core/models/inference/tool_choice.dart';
 import 'package:llamadart/src/core/models/tools/tool_definition.dart';
 import 'package:llamadart/src/core/models/tools/tool_param.dart';
@@ -27,6 +28,41 @@ void main() {
       expect(result.prompt, contains('CUSTOM:hello'));
       expect(result.prompt, isNot(contains('BASE:hello')));
     });
+
+    test('preserves GLM-OCR image markers through format workarounds', () {
+      const template = '''[gMASK]<sop>
+{# GLM detection marker: <arg_key>name</arg_key><arg_value>value</arg_value> #}
+{% for m in messages %}
+{% if m.role == 'user' %}<|user|>
+{% for item in m.content %}
+{% if item.type == 'image' %}<|begin_of_image|><|image|><|end_of_image|>{% elif item.type == 'text' %}{{ item.text }}{% endif %}
+{% endfor %}
+{% endif %}
+{% endfor %}
+{% if add_generation_prompt %}<|assistant|>{% endif %}''';
+      const multimodalMessages = [
+        LlamaChatMessage.withContent(
+          role: LlamaChatRole.user,
+          content: [
+            LlamaImageContent(path: '/tmp/page.png'),
+            LlamaTextContent('Extract text.'),
+          ],
+        ),
+      ];
+
+      final result = ChatTemplateEngine.render(
+        templateSource: template,
+        messages: multimodalMessages,
+        metadata: const {},
+      );
+
+      expect(result.format, equals(ChatFormat.glm45.index));
+      expect(
+        result.prompt,
+        contains('<|begin_of_image|><__media__><|end_of_image|>'),
+      );
+      expect(result.prompt, contains('Extract text.'));
+    });
   });
 
   group('ChatTemplateEngine grammar routing', () {
diff --git a/test/unit/core/template/template_workarounds_test.dart b/test/unit/core/template/template_workarounds_test.dart
@@ -1,3 +1,5 @@
+import 'dart:typed_data';
+
 import 'package:llamadart/src/core/models/chat/chat_message.dart';
 import 'package:llamadart/src/core/models/chat/chat_role.dart';
 import 'package:llamadart/src/core/models/chat/content_part.dart';
@@ -123,6 +125,121 @@ void main() {
       expect(message['content'], contains('"weather"'));
     });
 
+    test(
+      'applyFormatWorkarounds returns before serializing byte-backed multimodal content without tool calls',
+      () {
+        final imageBytes = Uint8List.fromList([1, 2, 3, 4]);
+        final input = [
+          LlamaChatMessage.withContent(
+            role: LlamaChatRole.user,
+            content: [
+              LlamaImageContent(bytes: imageBytes, width: 1, height: 1),
+              const LlamaTextContent('Extract text.'),
+            ],
+          ),
+        ];
+
+        final output = TemplateWorkarounds.applyFormatWorkarounds(
+          input,
+          ChatFormat.glm45,
+        );
+
+        expect(identical(output, input), isTrue);
+        final image = output.first.parts.whereType<LlamaImageContent>().single;
+        expect(identical(image.bytes, imageBytes), isTrue);
+        expect(
+          output.first.parts.whereType<LlamaTextContent>().single.text,
+          equals('Extract text.'),
+        );
+      },
+    );
+
+    test(
+      'applyFormatWorkarounds preserves multimodal content when tool calls are normalized',
+      () {
+        const input = [
+          LlamaChatMessage.withContent(
+            role: LlamaChatRole.user,
+            content: [
+              LlamaImageContent(path: '/tmp/page.png'),
+              LlamaTextContent('Extract text.'),
+            ],
+          ),
+          LlamaChatMessage.withContent(
+            role: LlamaChatRole.assistant,
+            content: [
+              LlamaToolCallContent(
+                id: 'call_1',
+                name: 'lookup',
+                arguments: {'query': 'ocr'},
+                rawJson: '{"query":"ocr"}',
+              ),
+            ],
+          ),
+        ];
+
+        final output = TemplateWorkarounds.applyFormatWorkarounds(
+          input,
+          ChatFormat.glm45,
+        );
+
+        expect(output.first.parts[0], isA<LlamaImageContent>());
+        expect(
+          output.first.parts.whereType<LlamaTextContent>().single.text,
+          equals('Extract text.'),
+        );
+        final toolCall = output.last.parts
+            .whereType<LlamaToolCallContent>()
+            .single;
+        expect(toolCall.name, equals('lookup'));
+        expect(toolCall.arguments, equals({'query': 'ocr'}));
+      },
+    );
+
+    test(
+      'applyFormatWorkarounds preserves audio content when tool calls are normalized',
+      () {
+        final audioBytes = Uint8List.fromList([82, 73, 70, 70]);
+        final input = [
+          LlamaChatMessage.withContent(
+            role: LlamaChatRole.user,
+            content: [
+              LlamaAudioContent(bytes: audioBytes),
+              const LlamaTextContent('Transcribe audio.'),
+            ],
+          ),
+          const LlamaChatMessage.withContent(
+            role: LlamaChatRole.assistant,
+            content: [
+              LlamaToolCallContent(
+                id: 'call_1',
+                name: 'lookup',
+                arguments: {'query': 'audio'},
+                rawJson: '{"query":"audio"}',
+              ),
+            ],
+          ),
+        ];
+
+        final output = TemplateWorkarounds.applyFormatWorkarounds(
+          input,
+          ChatFormat.glm45,
+        );
+
+        final audio = output.first.parts.whereType<LlamaAudioContent>().single;
+        expect(identical(audio.bytes, audioBytes), isTrue);
+        expect(
+          output.first.parts.whereType<LlamaTextContent>().single.text,
+          equals('Transcribe audio.'),
+        );
+        final toolCall = output.last.parts
+            .whereType<LlamaToolCallContent>()
+            .single;
+        expect(toolCall.name, equals('lookup'));
+        expect(toolCall.arguments, equals({'query': 'audio'}));
+      },
+    );
+
     test('applyFormatWorkarounds applies Granite chain', () {
       final input = [
         LlamaChatMessage.withContent(
diff --git a/website/docs/changelog/recent-releases.md b/website/docs/changelog/recent-releases.md
@@ -9,6 +9,8 @@ For canonical full release notes, use:
 
 ## Unreleased
 
+- Fixed GLM-OCR and other multimodal chat-template workarounds so image and
+  audio content parts are preserved when tool-call normalization runs.
 - Added `tool/testing/run_local_e2e.dart` as a discovery and orchestration
   entry point for heavyweight local-only Dart E2E, Flutter device, and
   Web/Playwright smoke scenarios.