From fdda9bcb27c9456ae09db55282898ef5de307b6d Mon Sep 17 00:00:00 2001 From: Arman Naseri Date: Mon, 13 Apr 2026 16:12:47 +0200 Subject: [PATCH] Add chat template for gemma models For multimodal models, we need to change the user/assistant roles and add the proper start and end tokens --- llama_cpp/llama_chat_format.py | 94 ++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py index 1024fb85b..d819dae6b 100644 --- a/llama_cpp/llama_chat_format.py +++ b/llama_cpp/llama_chat_format.py @@ -3229,6 +3229,62 @@ def from_pretrained( ) +class GemmaChatHandler(Llava15ChatHandler): + """Chat handler for Gemma-based multimodal models (e.g., PaliGemma, MedGemma). + + Gemma models use <start_of_turn>/<end_of_turn> control tokens instead of + the LLaVA-style USER:/ASSISTANT: format. The text-only 'gemma' chat format + is already registered (see format_gemma), but multimodal Gemma models that + require a Llava-style vision pipeline need a dedicated handler so the + correct chat template is applied when chat_handler takes precedence over + chat_format in the resolution order. 
+ + See: https://ai.google.dev/gemma/docs/formatting + """ + + DEFAULT_SYSTEM_MESSAGE = None # Gemma models do not natively support a system role + + CHAT_FORMAT = ( + "{% for message in messages %}" + # System messages are folded into a user turn (Gemma has no system role) + "{% if message.role == 'system' %}" + "<start_of_turn>user\n{{ message.content }}<end_of_turn>\n" + "{% endif %}" + # User message (handles both plain string and multimodal content list) + "{% if message.role == 'user' %}" + "<start_of_turn>user\n" + "{% if message.content is string %}" + "{{ message.content }}" + "{% endif %}" + "{% if message.content is iterable and message.content is not string %}" + # Emit image tokens first + "{% for content in message.content %}" + "{% if content.type == 'image_url' and content.image_url is string %}" + "{{ content.image_url }}" + "{% endif %}" + "{% if content.type == 'image_url' and content.image_url is mapping %}" + "{{ content.image_url.url }}" + "{% endif %}" + "{% endfor %}" + # Then emit text tokens + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "<end_of_turn>\n" + "{% endif %}" + # Assistant message + "{% if message.role == 'assistant' and message.content is not none %}" + "<start_of_turn>model\n{{ message.content }}<end_of_turn>\n" + "{% endif %}" + "{% endfor %}" + # Generation prompt + "{% if add_generation_prompt %}" + "<start_of_turn>model\n" + "{% endif %}" + ) class ObsidianChatHandler(Llava15ChatHandler): # Prompt Format # The model followed ChatML format. 
However, with ### as the seperator @@ -3581,6 +3637,44 @@ def __call__(self, **kwargs): return super().__call__(**kwargs) +class MultimodalGemmaChatHandler(Llava15ChatHandler): + DEFAULT_SYSTEM_MESSAGE: Optional[str] = None + + CHAT_FORMAT = ( + "{% for message in messages %}" + "{% if message.role == 'user' %}" + "<start_of_turn>user\n" + "{% if message.content is string %}" + "{{ message.content }}" + "{% endif %}" + "{% if message.content is iterable %}" + "{% for content in message.content %}" + "{% if content.type == 'image_url' and content.image_url is string %}" + "{{ content.image_url }}" + "{% endif %}" + "{% if content.type == 'image_url' and content.image_url is mapping %}" + "{{ content.image_url.url }}" + "{% endif %}" + "{% endfor %}" + "{% for content in message.content %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "<end_of_turn>\n" + "{% endif %}" + "{% if message.role == 'assistant' and message.content is not none %}" + "<start_of_turn>model\n" + "{{ message.content }}<end_of_turn>\n" + "{% endif %}" + "{% endfor %}" + "{% if add_generation_prompt %}" + "<start_of_turn>model\n" + "{% endif %}" + ) + + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( llama: llama.Llama,