mtmd: model: unlimited-ocr: converter + parity test (#24969)

2026-06-27 23:50:20 -05:00 · 2026-06-24 18:20:22 +02:00 · 2026-06-24 18:20:22 +02:00 · 894bb27af3
commit 894bb27af3
parent fb401045cc
3 changed files with 61 additions and 8 deletions
--- a/conversion/init.py
+++ b/conversion/init.py
@ -46,6 +46,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "DbrxForCausalLM": "dbrx",
    "DeciLMForCausalLM": "deci",
    "DeepseekForCausalLM": "deepseek",
+    "DeepseekOCRForCausalLM": "deepseek",
    "DeepseekV2ForCausalLM": "deepseek",
    "DeepseekV3ForCausalLM": "deepseek",
    "DeepseekV32ForCausalLM": "deepseek",
@ -233,6 +234,7 @@ TEXT_MODEL_MAP: dict[str, str] = {
    "UMT5ForConditionalGeneration": "t5",
    "UMT5Model": "t5",
    "UltravoxModel": "ultravox",
+    "UnlimitedOCRForCausalLM": "deepseek",
    "VLlama3ForCausalLM": "llama",
    "VoxtralForConditionalGeneration": "llama",
    "WavTokenizerDec": "wavtokenizer",
@ -299,6 +301,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = {
    "StepVLForConditionalGeneration": "step3",
    "Step3p7ForConditionalGeneration": "step3",
    "UltravoxModel": "ultravox",
+    "UnlimitedOCRForCausalLM": "deepseek",
    "VoxtralForConditionalGeneration": "ultravox",
    "YoutuVLForConditionalGeneration": "youtuvl",
 }
--- a/conversion/deepseek.py
+++ b/conversion/deepseek.py
@ -14,7 +14,7 @@ from .base import MmprojModel, ModelBase, TextModel, gguf, logger
 from .qwen import QwenModel


-@ModelBase.register("DeepseekOCRForCausalLM")
+@ModelBase.register("DeepseekOCRForCausalLM", "UnlimitedOCRForCausalLM")
 class DeepseekOCRVisionModel(MmprojModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
@ -205,6 +205,8 @@ class DeepseekModel(TextModel):
@ModelBase.register(
    "DeepseekV2ForCausalLM",
    "DeepseekV3ForCausalLM",
+    "DeepseekOCRForCausalLM",
+    "UnlimitedOCRForCausalLM",
    "KimiVLForConditionalGeneration",
    "KimiK25ForConditionalGeneration",
    "YoutuForCausalLM",
@ -224,7 +226,7 @@ class DeepseekV2Model(TextModel):
        self.origin_hf_arch = hparams.get('architectures', [None])[0]

        # special handling for Deepseek OCR
-        if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"):
+        if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM", "UnlimitedOCRForCausalLM"):
            self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR
            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch]
            self.gguf_writer.add_architecture()
@ -350,6 +352,12 @@ class DeepseekV2Model(TextModel):

        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

+        # Unlimited-OCR sliding window; written for metadata, the decoder ignores it (full MHA)
+        if is_ocr:
+            sliding_window = hparams.get("sliding_window_size") or hparams.get("sliding_window")
+            if sliding_window:
+                self.gguf_writer.add_sliding_window(sliding_window)
+
        if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None:
            # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX]
            # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul
--- a/tools/mtmd/tests/test-deepseek-ocr.py
+++ b/tools/mtmd/tests/test-deepseek-ocr.py
@ -9,6 +9,7 @@ its output, and holds them against the HF model's scores.

 import argparse
 import logging
+import re
 import subprocess
 import sys
 import unicodedata
@ -28,6 +29,12 @@ class ModelSpec:
    mmproj_arg: str
    model_default: str
    mmproj_default: str
+    prompt: str = "Free OCR. "
+    n_predict: int = 512
+    n_ctx: int | None = None
+    # Unlimited-OCR's "document parsing" prompt emits <|det|> grounding markup that
+    # the HF reference strips in result.md; drop it before scoring to match.
+    strip_grounding: bool = False


@dataclass
@ -63,6 +70,20 @@ MODELS = {
        model_default="gguf_models/deepseek-ai/deepseek-ocr-2-bf16.gguf",
        mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-2-bf16.gguf",
    ),
+    "unlimited": ModelSpec(
+        key="unlimited", label="Unlimited-OCR",
+        model_arg="--llama-model-unlimited", mmproj_arg="--mmproj-unlimited",
+        model_default="gguf_models/baidu/unlimited-ocr-bf16.gguf",
+        mmproj_default="gguf_models/baidu/mmproj-unlimited-ocr-bf16.gguf",
+        # "Free OCR." immediately emits EOS on this checkpoint; the HF reference
+        # (demo/unlimited_ocr_scores.py) uses "document parsing.", which grounds.
+        prompt="document parsing.",
+        # Grounding emits ~3x the tokens of plain OCR, so it needs a larger budget
+        # and context to reach the article body the ground truth covers.
+        n_predict=4096,
+        n_ctx=16384,
+        strip_grounding=True,
+    ),
 }

 CASES = [
@ -82,9 +103,26 @@ CASES = [
        # is one pixel off and lands at ~0.69 instead.
        hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0,
    ),
+    TestCase(
+        model_key="unlimited", label="single-view scan",
+        image="tools/mtmd/test-1.jpeg",
+        ground_truth="tools/mtmd/tests/test-1-ground-truth.txt",
+        # HF reference: Unlimited-OCR scoring (gundam, bf16) on this image/ground-truth.
+        # Decoder runs full MHA, not R-SWA; the band absorbs that gap + bf16 variance.
+        hf_cer=0.1869, hf_chrf=75.23, cer_tol=0.06, chrf_tol=6.0,
+    ),
 ]


+GROUNDING_TAG_RE = re.compile(r"<\|(ref|det)\|>.*?<\|/\1\|>", re.DOTALL)
+
+
+def strip_grounding(text: str) -> str:
+    """Drop <|ref|>..<|/ref|> / <|det|>..<|/det|> grounding markup, matching the
+    cleaned result.md the HF reference scores against."""
+    return GROUNDING_TAG_RE.sub("", text)
+
+
 def arg_dest(flag: str) -> str:
    return flag.lstrip("-").replace("-", "_")

@ -129,19 +167,19 @@ def compute_chrf(expected: str, ocr_out: str) -> float:
    return CHRF().sentence_score(ocr_out, [expected]).score


-def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
+def run_mtmd_cli(spec: "ModelSpec", model_path, mmproj_path, image_path, bin_path) -> str:
    """Run mtmd-cli on the image and return its output."""
    cmd = [
        str(bin_path),
        "-m", str(model_path),
        "--mmproj", str(mmproj_path),
        "--image", str(image_path),
-        "-p", "Free OCR. ",
+        "-p", spec.prompt,
        "--chat-template", "deepseek-ocr",
        "--temp", "0",
        "--flash-attn", "off",  # match the HF "eager" attention reference
        "--no-warmup",
-        "-n", "512",  # cap loops on hard images (KV would otherwise fill)
+        "-n", str(spec.n_predict),  # cap loops on hard images (KV would otherwise fill)
        # HF decodes with no_repeat_ngram_size; llama.cpp's analog is DRY.
        # Default DRY breakers include "\n", so they are cleared below.
        "--dry-multiplier", "0.8",
@ -150,6 +188,8 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
        "--dry-penalty-last-n", "-1",
        "--dry-sequence-breaker", "none",
    ]
+    if spec.n_ctx is not None:
+        cmd += ["-c", str(spec.n_ctx)]
    logger.debug(f"  command: {' '.join(cmd)}")

    try:
@ -164,6 +204,8 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str:
        raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}")

    output = result.stdout.decode("utf-8", errors="replace").strip()
+    if spec.strip_grounding:
+        output = strip_grounding(output)
    if not output:
        raise RuntimeError("llama-mtmd-cli produced no output on stdout")
    logger.info(f"  output: {len(output)} chars")
@ -193,7 +235,7 @@ def evaluate(case: "TestCase", expected: str, ocr_out: str) -> bool:

    logger.info("")
    logger.info("=" * 60)
-    logger.info("Free OCR evaluation:")
+    logger.info("OCR evaluation:")
    logger.info("=" * 60)
    logger.info(f"  CER               {cer:>7.4f}    (HF {case.hf_cer:.4f}, <= {case.cer_max:>7.4f}  -> {verdict(cer_pass)})")
    logger.info(f"  chrF (0-100)      {chrf:>7.2f}    (HF {case.hf_chrf:.2f}, >= {case.chrf_min:>7.2f}  -> {verdict(chrf_pass)})")
@ -269,9 +311,9 @@ def main() -> int:
        expected = read_expected_text(ground_truth)
        logger.info(f"  Image: {case.image}")
        logger.info(f"  Expected text: {len(expected)} chars")
-        logger.info("  Running llama.cpp 'Free OCR'")
+        logger.info(f"  Running llama.cpp prompt {model_spec.prompt!r}")
        try:
-            ocr_out = run_mtmd_cli(model, mmproj, image, binary)
+            ocr_out = run_mtmd_cli(model_spec, model, mmproj, image, binary)
        except RuntimeError as e:
            logger.error(f"  Error: {e}")
            results[title] = False