diff --git a/conversion/__init__.py b/conversion/__init__.py index 2bce1bbd7c..a587b1c37e 100644 --- a/conversion/__init__.py +++ b/conversion/__init__.py @@ -46,6 +46,7 @@ TEXT_MODEL_MAP: dict[str, str] = { "DbrxForCausalLM": "dbrx", "DeciLMForCausalLM": "deci", "DeepseekForCausalLM": "deepseek", + "DeepseekOCRForCausalLM": "deepseek", "DeepseekV2ForCausalLM": "deepseek", "DeepseekV3ForCausalLM": "deepseek", "DeepseekV32ForCausalLM": "deepseek", @@ -233,6 +234,7 @@ TEXT_MODEL_MAP: dict[str, str] = { "UMT5ForConditionalGeneration": "t5", "UMT5Model": "t5", "UltravoxModel": "ultravox", + "UnlimitedOCRForCausalLM": "deepseek", "VLlama3ForCausalLM": "llama", "VoxtralForConditionalGeneration": "llama", "WavTokenizerDec": "wavtokenizer", @@ -299,6 +301,7 @@ MMPROJ_MODEL_MAP: dict[str, str] = { "StepVLForConditionalGeneration": "step3", "Step3p7ForConditionalGeneration": "step3", "UltravoxModel": "ultravox", + "UnlimitedOCRForCausalLM": "deepseek", "VoxtralForConditionalGeneration": "ultravox", "YoutuVLForConditionalGeneration": "youtuvl", } diff --git a/conversion/deepseek.py b/conversion/deepseek.py index 72520cc9f6..4c93fb66df 100644 --- a/conversion/deepseek.py +++ b/conversion/deepseek.py @@ -14,7 +14,7 @@ from .base import MmprojModel, ModelBase, TextModel, gguf, logger from .qwen import QwenModel -@ModelBase.register("DeepseekOCRForCausalLM") +@ModelBase.register("DeepseekOCRForCausalLM", "UnlimitedOCRForCausalLM") class DeepseekOCRVisionModel(MmprojModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -205,6 +205,8 @@ class DeepseekModel(TextModel): @ModelBase.register( "DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM", + "DeepseekOCRForCausalLM", + "UnlimitedOCRForCausalLM", "KimiVLForConditionalGeneration", "KimiK25ForConditionalGeneration", "YoutuForCausalLM", @@ -224,7 +226,7 @@ class DeepseekV2Model(TextModel): self.origin_hf_arch = hparams.get('architectures', [None])[0] # special handling for Deepseek OCR - if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"): + if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM", "UnlimitedOCRForCausalLM"): self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] self.gguf_writer.add_architecture() @@ -350,6 +352,12 @@ class DeepseekV2Model(TextModel): self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + # Unlimited-OCR sliding window; written for metadata, the decoder ignores it (full MHA) + if is_ocr: + sliding_window = hparams.get("sliding_window_size") or hparams.get("sliding_window") + if sliding_window: + self.gguf_writer.add_sliding_window(sliding_window) + if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None: # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul diff --git a/tools/mtmd/tests/test-deepseek-ocr.py b/tools/mtmd/tests/test-deepseek-ocr.py index 5f5fef765a..ec0b4523be 100644 --- a/tools/mtmd/tests/test-deepseek-ocr.py +++ b/tools/mtmd/tests/test-deepseek-ocr.py @@ -9,6 +9,7 @@ its output, and holds them against the HF model's scores. import argparse import logging +import re import subprocess import sys import unicodedata @@ -28,6 +29,12 @@ class ModelSpec: mmproj_arg: str model_default: str mmproj_default: str + prompt: str = "Free OCR. " + n_predict: int = 512 + n_ctx: int | None = None + # Unlimited-OCR's "document parsing" prompt emits <|det|> grounding markup that + # the HF reference strips in result.md; drop it before scoring to match. + strip_grounding: bool = False @dataclass @@ -63,6 +70,20 @@ MODELS = { model_default="gguf_models/deepseek-ai/deepseek-ocr-2-bf16.gguf", mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-2-bf16.gguf", ), + "unlimited": ModelSpec( + key="unlimited", label="Unlimited-OCR", + model_arg="--llama-model-unlimited", mmproj_arg="--mmproj-unlimited", + model_default="gguf_models/baidu/unlimited-ocr-bf16.gguf", + mmproj_default="gguf_models/baidu/mmproj-unlimited-ocr-bf16.gguf", + # "Free OCR." immediately emits EOS on this checkpoint; the HF reference + # (demo/unlimited_ocr_scores.py) uses "document parsing.", which grounds. + prompt="document parsing.", + # Grounding emits ~3x the tokens of plain OCR, so it needs a larger budget + # and context to reach the article body the ground truth covers. + n_predict=4096, + n_ctx=16384, + strip_grounding=True, + ), } CASES = [ @@ -82,9 +103,26 @@ CASES = [ # is one pixel off and lands at ~0.69 instead. hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0, ), + TestCase( + model_key="unlimited", label="single-view scan", + image="tools/mtmd/test-1.jpeg", + ground_truth="tools/mtmd/tests/test-1-ground-truth.txt", + # HF reference: Unlimited-OCR scoring (gundam, bf16) on this image/ground-truth. + # Decoder runs full MHA, not R-SWA; the band absorbs that gap + bf16 variance. + hf_cer=0.1869, hf_chrf=75.23, cer_tol=0.06, chrf_tol=6.0, + ), ] +GROUNDING_TAG_RE = re.compile(r"<\|(ref|det)\|>.*?<\|/\1\|>", re.DOTALL) + + +def strip_grounding(text: str) -> str: + """Drop <|ref|>..<|/ref|> / <|det|>..<|/det|> grounding markup, matching the + cleaned result.md the HF reference scores against.""" + return GROUNDING_TAG_RE.sub("", text) + + def arg_dest(flag: str) -> str: return flag.lstrip("-").replace("-", "_") @@ -129,19 +167,19 @@ def compute_chrf(expected: str, ocr_out: str) -> float: return CHRF().sentence_score(ocr_out, [expected]).score -def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str: +def run_mtmd_cli(spec: "ModelSpec", model_path, mmproj_path, image_path, bin_path) -> str: """Run mtmd-cli on the image and return its output.""" cmd = [ str(bin_path), "-m", str(model_path), "--mmproj", str(mmproj_path), "--image", str(image_path), - "-p", "Free OCR. ", + "-p", spec.prompt, "--chat-template", "deepseek-ocr", "--temp", "0", "--flash-attn", "off", # match the HF "eager" attention reference "--no-warmup", - "-n", "512", # cap loops on hard images (KV would otherwise fill) + "-n", str(spec.n_predict), # cap loops on hard images (KV would otherwise fill) # HF decodes with no_repeat_ngram_size; llama.cpp's analog is DRY. # Default DRY breakers include "\n", so they are cleared below. "--dry-multiplier", "0.8", @@ -150,6 +188,8 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str: "--dry-penalty-last-n", "-1", "--dry-sequence-breaker", "none", ] + if spec.n_ctx is not None: + cmd += ["-c", str(spec.n_ctx)] logger.debug(f" command: {' '.join(cmd)}") try: @@ -164,6 +204,8 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str: raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}") output = result.stdout.decode("utf-8", errors="replace").strip() + if spec.strip_grounding: + output = strip_grounding(output) if not output: raise RuntimeError("llama-mtmd-cli produced no output on stdout") logger.info(f" output: {len(output)} chars") @@ -193,7 +235,7 @@ def evaluate(case: "TestCase", expected: str, ocr_out: str) -> bool: logger.info("") logger.info("=" * 60) - logger.info("Free OCR evaluation:") + logger.info("OCR evaluation:") logger.info("=" * 60) logger.info(f" CER {cer:>7.4f} (HF {case.hf_cer:.4f}, <= {case.cer_max:>7.4f} -> {verdict(cer_pass)})") logger.info(f" chrF (0-100) {chrf:>7.2f} (HF {case.hf_chrf:.2f}, >= {case.chrf_min:>7.2f} -> {verdict(chrf_pass)})") @@ -269,9 +311,9 @@ def main() -> int: expected = read_expected_text(ground_truth) logger.info(f" Image: {case.image}") logger.info(f" Expected text: {len(expected)} chars") - logger.info(" Running llama.cpp 'Free OCR'") + logger.info(f" Running llama.cpp prompt {model_spec.prompt!r}") try: - ocr_out = run_mtmd_cli(model, mmproj, image, binary) + ocr_out = run_mtmd_cli(model_spec, model, mmproj, image, binary) except RuntimeError as e: logger.error(f" Error: {e}") results[title] = False