From 99cc9d0ccc5de817556b969cb1b7f30e71602bc9 Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Mon, 8 Jun 2026 08:27:53 -0700
Subject: [PATCH 1/4] feat(example): support server video inputs and Gemma text
 tool calls

---
 CHANGELOG.md              |   1 +
 examples/server/server.py | 328 +++++++++++++++++++++++++++++++-------
 2 files changed, 270 insertions(+), 59 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 10209aacf..e733f37e6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- feat(example): support server video inputs and Gemma text tool calls by @abetlen in #2291
 - feat: update llama.cpp to ggml-org/llama.cpp@8f83d6c27
 - fix(example): support multi-step Responses tool streaming by @abetlen in #2288
 - fix(ci): Repair Linux accelerator wheels for manylinux publishing
diff --git a/examples/server/server.py b/examples/server/server.py
index fb00501cf..dc25f4644 100644
--- a/examples/server/server.py
+++ b/examples/server/server.py
@@ -3224,6 +3224,7 @@ class MTMDOptions(BaseModel):
         allowed_local_media_path: Optional[str] = None
         image_max_bytes: int = Field(default=20 * 1024 * 1024, ge=1)
         audio_max_bytes: int = Field(default=100 * 1024 * 1024, ge=1)
+        video_max_bytes: int = Field(default=512 * 1024 * 1024, ge=1)
         image_timeout_seconds: float = Field(default=10.0, gt=0.0)
 
         @model_validator(mode="after")
@@ -3350,7 +3351,7 @@ def load(cls, path: str) -> "ConfigFile":
 
 @dataclass(frozen=True)
 class MediaInput:
-    kind: Literal["image", "audio"]
+    kind: Literal["image", "audio", "video"]
     url: Optional[str] = None
     data: Optional[str] = None
     format: Optional[str] = None
@@ -3426,6 +3427,33 @@ def media_inputs_from_messages(
                         )
                     )
                     continue
+                if part_type in {"video_url", "input_video", "video"}:
+                    input_video = part.get("input_video")
+                    if isinstance(input_video, dict):
+                        data = input_video.get("data")
+                        video_url = input_video.get("video_url") or input_video.get("url")
+                        video_format = input_video.get("format")
+                    else:
+                        data = part.get("data")
+                        video_url = part.get("video_url") or part.get("url")
+                        video_format = part.get("format")
+                    if isinstance(video_url, dict):
+                        video_url = video_url.get("url")
+                    if isinstance(data, str):
+                        if video_format is not None and not isinstance(video_format, str):
+                            raise ValueError("input_video format must be a string")
+                        media_inputs.append(
+                            MediaInput(
+                                kind="video",
+                                data=data,
+                                format=cast(Optional[str], video_format),
+                            )
+                        )
+                    elif isinstance(video_url, str):
+                        media_inputs.append(MediaInput(kind="video", url=video_url))
+                    else:
+                        raise ValueError("video content part requires base64 data or a URL string")
+                    continue
         return media_inputs
 
     @staticmethod
@@ -3447,7 +3475,16 @@ def _content_part_for_template(
         if not isinstance(part, dict):
             raise ValueError("content parts must be strings or objects")
         part_type = part.get("type")
-        if part_type in {"image_url", "input_image", "image", "audio_url", "input_audio"}:
+        if part_type in {
+            "image_url",
+            "input_image",
+            "image",
+            "audio_url",
+            "input_audio",
+            "video_url",
+            "input_video",
+            "video",
+        }:
             if media_marker is None:
                 raise ValueError("multimodal content requires model.mtmd")
             return media_marker
@@ -3694,7 +3731,7 @@ class Media:
         positions: np.ndarray
         non_causal: bool = False
 
-    kind: Literal["text", "image", "audio"]
+    kind: Literal["text", "image", "audio", "video"]
     start_pos: int
     n_pos: int
     identity_tokens: List[int]
@@ -5116,6 +5153,87 @@ def _raw_string_tool_arguments(self, tool_name: str, value: str) -> Optional[Dic
             return None
         return None
 
+    def _single_string_tool_argument_name(self, tool_name: str) -> Optional[str]:
+        if self._tools is None:
+            return None
+        for tool in self._tools:
+            if tool.get("type") != "function":
+                continue
+            function = tool.get("function", {})
+            if function.get("name") != tool_name:
+                continue
+            parameters = function.get("parameters")
+            if not isinstance(parameters, dict):
+                return None
+            required = parameters.get("required")
+            if not isinstance(required, list) or len(required) != 1:
+                return None
+            argument_name = required[0]
+            if not isinstance(argument_name, str):
+                return None
+            properties = parameters.get("properties")
+            if not isinstance(properties, dict):
+                return None
+            argument_schema = properties.get(argument_name)
+            if not isinstance(argument_schema, dict):
+                return None
+            argument_type = argument_schema.get("type")
+            if argument_type == "string" or (
+                isinstance(argument_type, list) and "string" in argument_type
+            ):
+                return argument_name
+            return None
+        return None
+
+    def _text_tool_argument_from_object(
+        self,
+        tool_name: str,
+        arguments: Dict[str, Any],
+    ) -> Optional[str]:
+        input_value = arguments.get("input")
+        if isinstance(input_value, str):
+            return input_value
+        argument_name = self._single_string_tool_argument_name(tool_name)
+        if argument_name is not None:
+            argument_value = arguments.get(argument_name)
+            if isinstance(argument_value, str):
+                return argument_value
+        if len(arguments) == 1:
+            argument_value = next(iter(arguments.values()))
+            if isinstance(argument_value, str):
+                return argument_value
+        return None
+
+    def _text_tool_arguments(
+        self,
+        tool_name: str,
+        arguments: Any,
+        *,
+        partial: bool,
+    ) -> Optional[str]:
+        if isinstance(arguments, str):
+            parsed_arguments = self._raw_object_tool_arguments(arguments)
+            if parsed_arguments is not None:
+                text = self._text_tool_argument_from_object(tool_name, parsed_arguments)
+                if text is not None:
+                    return text
+                if partial:
+                    return None
+                return json.dumps(parsed_arguments, ensure_ascii=False, separators=(",", ":"))
+            return arguments
+        if isinstance(arguments, ResponseParser.PartialJsonObject):
+            arguments = arguments.value
+        if isinstance(arguments, dict):
+            text = self._text_tool_argument_from_object(tool_name, arguments)
+            if text is not None:
+                return text
+            if partial:
+                return None
+            return json.dumps(arguments, ensure_ascii=False, separators=(",", ":"))
+        if partial:
+            return None
+        return str(arguments)
+
     @classmethod
     def _raw_object_tool_arguments(cls, value: str) -> Optional[Dict[str, Any]]:
         candidates = [value]
@@ -6950,13 +7068,13 @@ def _normalize_tool_call_item(
                 "tool_calls function name must be a non-empty string"
             )
         if self._tool_content_type(tool_name) == "text":
-            arguments = function.get("arguments", "")
-            if not isinstance(arguments, str):
-                if partial:
-                    return None
-                raise CompletionResponseParsingError(
-                    "tool_calls function arguments must be a string for text tools"
-                )
+            arguments = self._text_tool_arguments(
+                tool_name,
+                function.get("arguments", ""),
+                partial=partial,
+            )
+            if arguments is None:
+                return None
             return {
                 "type": tool_call.get("type", "function"),
                 "function": {
@@ -7141,12 +7259,13 @@ def _parsed_chat_message(
             for tool_call_index, tool_call in enumerate(tool_calls):
                 function = tool_call["function"]
                 if self._tool_content_type(function["name"]) == "text":
-                    raw_arguments = function["arguments"]
-                    arguments = (
-                        raw_arguments
-                        if isinstance(raw_arguments, str)
-                        else str(raw_arguments)
+                    arguments = self._text_tool_arguments(
+                        function["name"],
+                        function["arguments"],
+                        partial=partial,
                     )
+                    if arguments is None:
+                        continue
                 else:
                     arguments = self._serialize_tool_arguments(
                         function["arguments"],
@@ -10195,7 +10314,7 @@ def _build_key(
         *,
         model_fingerprint: str,
         mmproj_fingerprint: str,
-        kind: Literal["image", "audio"],
+        kind: Literal["image", "audio", "video"],
         media_bytes: bytes,
     ) -> str:
         digest = hashlib.sha256(f"{kind}:".encode("utf-8") + media_bytes).hexdigest()
@@ -10210,7 +10329,7 @@ def _build_key(
         )
         return hashlib.sha256(payload.encode("utf-8")).hexdigest()
 
-    def key_for_media(self, kind: Literal["image", "audio"], media_bytes: bytes) -> str:
+    def key_for_media(self, kind: Literal["image", "audio", "video"], media_bytes: bytes) -> str:
         return self._build_key(
             model_fingerprint=self.model_fingerprint,
             mmproj_fingerprint=self.mmproj_fingerprint,
@@ -10273,6 +10392,17 @@ def evict_if_needed(self) -> None:
                 continue
 
 
+@dataclass
+class MTMDLoadedMedia:
+    media: MediaInput
+    media_bytes: bytes
+    key: str
+    bitmap: Any
+    video_ctx: Optional[Any] = None
+    video_frame_count: int = 0
+    video_frames_used: int = 0
+
+
 class MTMDProcessor:
     def __init__(
         self,
@@ -10291,6 +10421,7 @@ def __init__(
         allowed_local_media_path: Optional[str],
         image_max_bytes: int,
         audio_max_bytes: int,
+        video_max_bytes: int,
         image_timeout_seconds: float,
     ) -> None:
         self.chat_formatter = chat_formatter
@@ -10314,6 +10445,7 @@ def __init__(
         )
         self.image_max_bytes = image_max_bytes
         self.audio_max_bytes = audio_max_bytes
+        self.video_max_bytes = video_max_bytes
         self.image_timeout_seconds = image_timeout_seconds
         self.lock = threading.Lock()
         params = mtmd_cpp.mtmd_context_params_default()
@@ -10327,23 +10459,33 @@ def __init__(
             raise RuntimeError(f"failed to load MTMD context: {mmproj_path}")
         self.supports_vision = bool(mtmd_cpp.mtmd_support_vision(self.ctx))
         self.supports_audio = bool(mtmd_cpp.mtmd_support_audio(self.ctx))
+        self.supports_video = self.supports_vision and bool(
+            mtmd_cpp.mtmd_helper_support_video(self.ctx)
+        )
         if not self.supports_vision and not self.supports_audio:
             mtmd_cpp.mtmd_free(self.ctx)
             self.ctx = None
             raise RuntimeError(f"MTMD projector does not support image or audio input: {mmproj_path}")
-        self.media_marker = mtmd_cpp.mtmd_default_marker().decode("utf-8")
+        media_marker = mtmd_cpp.mtmd_get_marker(self.ctx)
+        if media_marker is None:
+            mtmd_cpp.mtmd_free(self.ctx)
+            self.ctx = None
+            raise RuntimeError(f"MTMD projector does not expose a media marker: {mmproj_path}")
+        self.media_marker = media_marker.decode("utf-8")
 
     def close(self) -> None:
         if self.ctx is not None:
             mtmd_cpp.mtmd_free(self.ctx)
             self.ctx = None
 
-    def _max_bytes_for_media(self, kind: Literal["image", "audio"]) -> int:
+    def _max_bytes_for_media(self, kind: Literal["image", "audio", "video"]) -> int:
         if kind == "image":
             return self.image_max_bytes
-        return self.audio_max_bytes
+        if kind == "audio":
+            return self.audio_max_bytes
+        return self.video_max_bytes
 
-    def _load_media_file(self, kind: Literal["image", "audio"], media_url: str) -> bytes:
+    def _load_media_file(self, kind: Literal["image", "audio", "video"], media_url: str) -> bytes:
         if self.allowed_local_media_path is None:
             raise CompletionRequestValidationError("local media path is not allowed")
         parsed = urllib.parse.urlsplit(media_url)
@@ -10391,7 +10533,7 @@ def redirect_request(self, req, fp, code, msg, headers, newurl):  # type: ignore
         opener = urllib.request.build_opener(NoRedirectHandler)
         return opener.open(request, timeout=timeout)
 
-    def _load_media_url(self, kind: Literal["image", "audio"], media_url: str) -> bytes:
+    def _load_media_url(self, kind: Literal["image", "audio", "video"], media_url: str) -> bytes:
         max_bytes = self._max_bytes_for_media(kind)
         if media_url.startswith("data:"):
             try:
@@ -10423,29 +10565,65 @@ def _load_media_url(self, kind: Literal["image", "audio"], media_url: str) -> by
     def load_media(self, media: MediaInput) -> bytes:
         if media.url is not None:
             return self._load_media_url(media.kind, media.url)
-        if media.kind != "audio" or media.data is None:
+        if media.kind not in {"audio", "video"} or media.data is None:
             raise CompletionRequestValidationError(f"{media.kind} input requires a URL")
         try:
             data = base64.b64decode(media.data, validate=False)
         except (ValueError, binascii.Error) as exc:
-            raise CompletionRequestValidationError("input_audio data must be valid base64") from exc
-        if len(data) > self.audio_max_bytes:
-            raise CompletionRequestValidationError("audio exceeds model.mtmd.audio_max_bytes")
+            raise CompletionRequestValidationError(f"input_{media.kind} data must be valid base64") from exc
+        max_bytes = self._max_bytes_for_media(media.kind)
+        if len(data) > max_bytes:
+            raise CompletionRequestValidationError(
+                f"{media.kind} exceeds model.mtmd.{media.kind}_max_bytes"
+            )
         return data
 
-    def _create_bitmap(self, media_bytes: bytes, kind: Literal["image", "audio"]) -> Any:
+    def _create_loaded_media(
+        self,
+        media: MediaInput,
+        media_bytes: bytes,
+    ) -> MTMDLoadedMedia:
+        key = (
+            self.embedding_cache.key_for_media(media.kind, media_bytes)
+            if self.embedding_cache is not None
+            else MTMDEmbeddingCache._build_key(
+                model_fingerprint=self.model_fingerprint,
+                mmproj_fingerprint=self.mmproj_fingerprint,
+                kind=media.kind,
+                media_bytes=media_bytes,
+            )
+        )
         buffer = (ctypes.c_uint8 * len(media_bytes)).from_buffer_copy(media_bytes)
-        bitmap = mtmd_cpp.mtmd_helper_bitmap_init_from_buf(
+        wrapper = mtmd_cpp.mtmd_helper_bitmap_init_from_buf_wrapper(
             self.ctx,
             buffer,
             len(media_bytes),
             False,
         )
+        bitmap = wrapper.bitmap
         if bitmap is None:
-            raise CompletionRequestValidationError(f"failed to create MTMD {kind} bitmap")
-        return bitmap
+            raise CompletionRequestValidationError(f"failed to create MTMD {media.kind} bitmap")
+        mtmd_cpp.mtmd_bitmap_set_id(bitmap, key.encode("utf-8"))
+        video_frame_count = 0
+        video_ctx = wrapper.video_ctx
+        if video_ctx:
+            video_info = mtmd_cpp.mtmd_helper_video_get_info(video_ctx)
+            video_frame_count = max(0, int(video_info.n_frames))
+        return MTMDLoadedMedia(
+            media=media,
+            media_bytes=media_bytes,
+            key=key,
+            bitmap=bitmap,
+            video_ctx=video_ctx,
+            video_frame_count=video_frame_count,
+        )
 
-    def _media_identity_tokens(self, kind: Literal["image", "audio"], key: str, n_pos: int) -> List[int]:
+    def _media_identity_tokens(
+        self,
+        kind: Literal["image", "audio", "video"],
+        key: str,
+        n_pos: int,
+    ) -> List[int]:
         tokens: List[int] = []
         for index in range(n_pos):
             digest = hashlib.sha256(f"{kind}:{key}:{index}".encode("utf-8")).digest()
@@ -10455,7 +10633,7 @@ def _media_identity_tokens(self, kind: Literal["image", "audio"], key: str, n_po
     def _encode_media_chunk(
         self,
         *,
-        kind: Literal["image", "audio"],
+        kind: Literal["image", "audio", "video"],
         key: str,
         chunk: Any,
     ) -> np.ndarray:
@@ -10535,6 +10713,8 @@ def build_prompt_plan(
             raise CompletionRequestValidationError("MTMD projector does not support images")
         if any(media.kind == "audio" for media in media_inputs) and not self.supports_audio:
             raise CompletionRequestValidationError("MTMD projector does not support audio")
+        if any(media.kind == "video" for media in media_inputs) and not self.supports_video:
+            raise CompletionRequestValidationError("MTMD projector does not support video")
         with self.lock:
             return self._build_prompt_plan_locked(
                 messages=messages,
@@ -10567,13 +10747,19 @@ def _build_prompt_plan_locked(
             reasoning_effort=reasoning_effort,
         )
         media_bytes_by_index = [self.load_media(media) for media in media_inputs]
-        bitmaps: List[Any] = []
+        loaded_media: List[MTMDLoadedMedia] = []
         chunks: Optional[Any] = None
         try:
-            bitmaps = [
-                self._create_bitmap(media_bytes, media.kind)
+            loaded_media = [
+                self._create_loaded_media(media, media_bytes)
                 for media, media_bytes in zip(media_inputs, media_bytes_by_index)
             ]
+            loaded_media_by_key = {media.key: media for media in loaded_media}
+            video_media = [media for media in loaded_media if media.media.kind == "video"]
+            if len(video_media) > 1 and any(media.video_frame_count <= 0 for media in video_media):
+                raise CompletionRequestValidationError(
+                    "multiple videos require MTMD to report frame counts"
+                )
             input_text = mtmd_cpp.mtmd_input_text()
             input_text.text = prompt.encode("utf-8")
             input_text.add_special = False
@@ -10581,7 +10767,9 @@ def _build_prompt_plan_locked(
             chunks = mtmd_cpp.mtmd_input_chunks_init()
             if chunks is None:
                 raise CompletionRequestValidationError("failed to create MTMD input chunks")
-            bitmap_array = (mtmd_cpp.mtmd_bitmap_p_ctypes * len(bitmaps))(*bitmaps)
+            bitmap_array = (mtmd_cpp.mtmd_bitmap_p_ctypes * len(loaded_media))(
+                *(media.bitmap for media in loaded_media)
+            )
             result = int(
                 mtmd_cpp.mtmd_tokenize(
                     self.ctx,
@@ -10601,7 +10789,8 @@ def _build_prompt_plan_locked(
             text_token_index_by_pos: Dict[int, int] = {}
             identity_pos = 0
             decode_pos = 0
-            media_index = 0
+            video_index = 0
+            used_media_keys = set()
             n_chunks = int(mtmd_cpp.mtmd_input_chunks_size(chunks))
             for chunk_index in range(n_chunks):
                 chunk = mtmd_cpp.mtmd_input_chunks_get(chunks, chunk_index)
@@ -10640,32 +10829,50 @@ def _build_prompt_plan_locked(
                         decode_pos += len(tokens)
                     continue
                 if chunk_type == mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE:
-                    kind: Literal["image", "audio"] = "image"
+                    chunk_kind: Literal["image", "audio"] = "image"
                     if not self.supports_vision:
                         raise CompletionRequestValidationError("MTMD projector does not support images")
                 elif chunk_type == mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_AUDIO:
-                    kind = "audio"
+                    chunk_kind = "audio"
                     if not self.supports_audio:
                         raise CompletionRequestValidationError("MTMD projector does not support audio")
                 else:
                     raise CompletionRequestValidationError("unsupported MTMD input chunk type")
-                if media_index >= len(media_bytes_by_index):
-                    raise CompletionRequestValidationError("MTMD media chunk count mismatch")
-                media = media_inputs[media_index]
-                media_bytes = media_bytes_by_index[media_index]
-                media_index += 1
-                if media.kind != kind:
-                    raise CompletionRequestValidationError("MTMD media chunk modality mismatch")
-                key = (
-                    self.embedding_cache.key_for_media(kind, media_bytes)
-                    if self.embedding_cache is not None
-                    else MTMDEmbeddingCache._build_key(
-                        model_fingerprint=self.model_fingerprint,
-                        mmproj_fingerprint=self.mmproj_fingerprint,
-                        kind=kind,
-                        media_bytes=media_bytes,
-                    )
-                )
+                chunk_id_bytes = mtmd_cpp.mtmd_input_chunk_get_id(chunk)
+                chunk_id = chunk_id_bytes.decode("utf-8") if chunk_id_bytes else ""
+                media = loaded_media_by_key.get(chunk_id)
+                video_frame_index: Optional[int] = None
+                if media is None and chunk_kind == "image" and video_media:
+                    while (
+                        video_index < len(video_media)
+                        and video_media[video_index].video_frame_count > 0
+                        and video_media[video_index].video_frames_used
+                        >= video_media[video_index].video_frame_count
+                    ):
+                        video_index += 1
+                    if video_index >= len(video_media):
+                        raise CompletionRequestValidationError("MTMD video frame count mismatch")
+                    media = video_media[video_index]
+                    video_frame_index = media.video_frames_used
+                    media.video_frames_used += 1
+                if media is None:
+                    raise CompletionRequestValidationError("MTMD media chunk identity mismatch")
+                if media.media.kind == "video":
+                    if chunk_kind != "image":
+                        raise CompletionRequestValidationError("MTMD video chunk modality mismatch")
+                    kind: Literal["image", "audio", "video"] = "video"
+                    if video_frame_index is None:
+                        video_frame_index = media.video_frames_used
+                        media.video_frames_used += 1
+                    key = hashlib.sha256(
+                        f"{media.key}:frame:{video_frame_index}".encode("utf-8")
+                    ).hexdigest()
+                else:
+                    if media.media.kind != chunk_kind:
+                        raise CompletionRequestValidationError("MTMD media chunk modality mismatch")
+                    kind = media.media.kind
+                    key = media.key
+                used_media_keys.add(media.key)
                 decode_n_pos = int(mtmd_cpp.mtmd_input_chunk_get_n_pos(chunk))
                 if decode_n_pos <= 0:
                     raise CompletionRequestValidationError("MTMD media chunk has no decoder positions")
@@ -10698,7 +10905,7 @@ def _build_prompt_plan_locked(
                 identity_tokens.extend(segment_identity)
                 identity_pos += n_tokens
                 decode_pos += decode_n_pos
-            if media_index != len(media_bytes_by_index):
+            if used_media_keys != {media.key for media in loaded_media}:
                 raise CompletionRequestValidationError("not all media inputs were consumed by MTMD")
             return PromptPlan(
                 text=prompt,
@@ -10711,8 +10918,10 @@ def _build_prompt_plan_locked(
         finally:
             if chunks is not None:
                 mtmd_cpp.mtmd_input_chunks_free(chunks)
-            for bitmap in bitmaps:
-                mtmd_cpp.mtmd_bitmap_free(bitmap)
+            for media in loaded_media:
+                mtmd_cpp.mtmd_bitmap_free(media.bitmap)
+                if media.video_ctx:
+                    mtmd_cpp.mtmd_helper_video_free(media.video_ctx)
 
 
 class Model:
@@ -15927,6 +16136,7 @@ def main() -> None:
             allowed_local_media_path=config.model.mtmd.allowed_local_media_path,
             image_max_bytes=config.model.mtmd.image_max_bytes,
             audio_max_bytes=config.model.mtmd.audio_max_bytes,
+            video_max_bytes=config.model.mtmd.video_max_bytes,
             image_timeout_seconds=config.model.mtmd.image_timeout_seconds,
         )
     sequence_cache: Optional[SequenceCache] = None

From e6a6dc93b961b9d1f7399afb363b195b1e1fc2a2 Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Mon, 8 Jun 2026 20:54:24 -0700
Subject: [PATCH 2/4] fix(example): pass media count to MTMD tokenizer

---
 examples/server/server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/server/server.py b/examples/server/server.py
index dc25f4644..b0ea8382c 100644
--- a/examples/server/server.py
+++ b/examples/server/server.py
@@ -10776,7 +10776,7 @@ def _build_prompt_plan_locked(
                     chunks,
                     ctypes.byref(input_text),
                     bitmap_array,
-                    len(bitmaps),
+                    len(loaded_media),
                 )
             )
             if result != 0:

From 486987fb8538978c4aaf4579bdd7979efa25b90b Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Mon, 8 Jun 2026 21:00:38 -0700
Subject: [PATCH 3/4] fix(example): avoid MTMD video stdin double close

---
 examples/server/server.py | 77 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 77 insertions(+)

diff --git a/examples/server/server.py b/examples/server/server.py
index b0ea8382c..99268a225 100644
--- a/examples/server/server.py
+++ b/examples/server/server.py
@@ -37,6 +37,7 @@
 import shutil
 import inspect
 import sys
+import tempfile
 import urllib.error
 import urllib.parse
 import urllib.request
@@ -10399,6 +10400,8 @@ class MTMDLoadedMedia:
     key: str
     bitmap: Any
     video_ctx: Optional[Any] = None
+    video_temp_path: Optional[Path] = None
+    video_callback: Optional[Any] = None
     video_frame_count: int = 0
     video_frames_used: int = 0
 
@@ -10593,6 +10596,8 @@ def _create_loaded_media(
                 media_bytes=media_bytes,
             )
         )
+        if media.kind == "video":
+            return self._create_loaded_video_media(media, media_bytes, key)
         buffer = (ctypes.c_uint8 * len(media_bytes)).from_buffer_copy(media_bytes)
         wrapper = mtmd_cpp.mtmd_helper_bitmap_init_from_buf_wrapper(
             self.ctx,
@@ -10618,6 +10623,73 @@ def _create_loaded_media(
             video_frame_count=video_frame_count,
         )
 
+    @staticmethod
+    def _video_temp_suffix(media: MediaInput) -> str:
+        extension = (media.format or "mp4").lstrip(".").lower()
+        if not extension or any(not char.isalnum() for char in extension):
+            extension = "video"
+        return f".{extension}"
+
+    def _create_loaded_video_media(
+        self,
+        media: MediaInput,
+        media_bytes: bytes,
+        key: str,
+    ) -> MTMDLoadedMedia:
+        temp_file = tempfile.NamedTemporaryFile(
+            prefix="llama-cpp-python-mtmd-",
+            suffix=self._video_temp_suffix(media),
+            delete=False,
+        )
+        temp_path = Path(temp_file.name)
+        try:
+            with temp_file:
+                temp_file.write(media_bytes)
+            params = mtmd_cpp.mtmd_helper_video_init_params_default()
+            video_ctx = mtmd_cpp.mtmd_helper_video_init(
+                self.ctx,
+                str(temp_path).encode("utf-8"),
+                params,
+            )
+            if video_ctx is None:
+                raise CompletionRequestValidationError("failed to create MTMD video context")
+
+            def read_next(
+                _chunk_index: int,
+                _user_data: Any,
+                out_bitmap: Any,
+                out_text: Any,
+            ) -> int:
+                return int(mtmd_cpp.mtmd_helper_video_read_next(video_ctx, out_bitmap, out_text))
+
+            callback = mtmd_cpp.mtmd_bitmap_lazy_callback(read_next)
+            bitmap = mtmd_cpp.mtmd_bitmap_init_lazy(
+                self.ctx,
+                key.encode("utf-8"),
+                ctypes.c_void_p(),
+                callback,
+            )
+            if bitmap is None:
+                mtmd_cpp.mtmd_helper_video_free(video_ctx)
+                raise CompletionRequestValidationError("failed to create MTMD video bitmap")
+            video_info = mtmd_cpp.mtmd_helper_video_get_info(video_ctx)
+            return MTMDLoadedMedia(
+                media=media,
+                media_bytes=media_bytes,
+                key=key,
+                bitmap=bitmap,
+                video_ctx=video_ctx,
+                video_temp_path=temp_path,
+                video_callback=callback,
+                video_frame_count=max(0, int(video_info.n_frames)),
+            )
+        except Exception:
+            try:
+                temp_path.unlink()
+            except OSError:
+                pass
+            raise
+
     def _media_identity_tokens(
         self,
         kind: Literal["image", "audio", "video"],
@@ -10922,6 +10994,11 @@ def _build_prompt_plan_locked(
                 mtmd_cpp.mtmd_bitmap_free(media.bitmap)
                 if media.video_ctx:
                     mtmd_cpp.mtmd_helper_video_free(media.video_ctx)
+                if media.video_temp_path is not None:
+                    try:
+                        media.video_temp_path.unlink()
+                    except OSError:
+                        pass
 
 
 class Model:

From de514d773acd5776cd14120544338bcc99d24911 Mon Sep 17 00:00:00 2001
From: abetlen <abetlen@gmail.com>
Date: Mon, 8 Jun 2026 21:10:34 -0700
Subject: [PATCH 4/4] fix(example): parse Gemma text tool bracket calls

---
 examples/server/configs/gemma-4-12b-it-qat.json | 6 +++---
 examples/server/server.py                       | 3 +++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/server/configs/gemma-4-12b-it-qat.json b/examples/server/configs/gemma-4-12b-it-qat.json
index fefeda8da..60ffd2a98 100644
--- a/examples/server/configs/gemma-4-12b-it-qat.json
+++ b/examples/server/configs/gemma-4-12b-it-qat.json
@@ -44,7 +44,7 @@
         },
         "tool_calls": {
           "type": "array",
-          "x-regex-iterator": "<\\|tool_call>(call:[^\\{]+\\{.*?\\})<tool_call\\|>",
+          "x-regex-iterator": "<\\|tool_call>(call:[^\\{\\[]+[\\{\\[].*?[\\}\\]])<tool_call\\|>",
           "items": {
             "type": "object",
             "properties": {
@@ -56,11 +56,11 @@
                 "properties": {
                   "name": {
                     "type": "string",
-                    "x-regex": "^call:(\\w+)\\{"
+                    "x-regex": "^call:(\\w+)[\\{\\[]"
                   },
                   "arguments": {
                     "type": "object",
-                    "x-regex": "^call:\\w+(\\{.*\\})$",
+                    "x-regex": "^call:\\w+(\\{.*\\}|\\[.*\\])$",
                     "x-parser": "gemma4-tool-call",
                     "additionalProperties": true
                   }
diff --git a/examples/server/server.py b/examples/server/server.py
index 99268a225..e8034a214 100644
--- a/examples/server/server.py
+++ b/examples/server/server.py
@@ -4389,6 +4389,9 @@ def capture(match: re.Match[str]) -> str:
             strings.append(match.group(1))
             return f"\x00{len(strings) - 1}\x00"
 
+        stripped = text.strip()
+        if stripped.startswith("[") and stripped.endswith("]"):
+            text = "{" + stripped[1:-1] + "}"
         text = re.sub(r'<\|"\|>(.*?)<\|"\|>', capture, text, flags=re.S)
         text = re.sub(r"(?<=[{,])(\w+):", r'"\1":', text)
         for index, value in enumerate(strings):