feat: add --threads-mtmd for independent multimodal thread count (#1797)

Add `-tm` / `--threads-mtmd` to control the CPU thread count used during multimodal image/audio processing (mmproj encoding), separately from the main LLM thread count. This allows running the LLM on GPU with minimal CPU threads (e.g. `-t 1`) to reduce sync overhead, while using many threads (e.g. `-tm 16`) for CPU-bound mmproj encoding with `--no-mmproj-offload`.

Fallback chain when `-tm` is not specified:
1. `--threads-batch` (`-tb`) — multimodal encoding is a batch/prefill-like operation, so it makes sense for it to track the batch thread count.
2. `--threads` (`-t`) — final default.

Works with both mtmd-cli and llama-server.

AI: ubergarm/Qwen3.6-27B-GGUF MTP IQ4_KS 15.113 GiB (4.752 BPW) + pi.dev
2026-06-28 04:30:15 -05:00 · 2026-05-13 10:44:43 -04:00 · 2026-05-13 10:44:43 -04:00 · ca52a825db
commit ca52a825db
parent 8a0f912cb2
4 changed files with 20 additions and 3 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -618,6 +618,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        }
        return true;
    }
+    if (arg == "-tm" || arg == "--threads-mtmd") {
+        CHECK_ARG
+        params.n_threads_mtmd = std::stoi(argv[i]);
+        if (params.n_threads_mtmd <= 0) {
+            params.n_threads_mtmd = std::thread::hardware_concurrency();
+        }
+        return true;
+    }
    if (arg == "-td" || arg == "--threads-draft") {
        CHECK_ARG
        params.speculative.n_threads = std::stoi(argv[i]);
@ -2461,6 +2469,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
    options.push_back({ "*",           "-s,    --seed SEED",            "RNG seed (default: %d, use random seed for < 0)", params.seed });
    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during generation (default: %d)", params.n_threads });
    options.push_back({ "*",           "-tb,   --threads-batch N",      "number of threads to use during batch and prompt processing (default: same as --threads)" });
+    options.push_back({ "multi-modality", "-tm,   --threads-mtmd N",    "number of threads to use during multimodal image processing (default: same as --threads-batch)" });
    options.push_back({ "speculative", "-td,   --threads-draft N",      "number of threads to use during generation (default: same as --threads)" });
    options.push_back({ "speculative", "-tbd,  --threads-batch-draft N",
                                                                        "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
@ -2896,6 +2905,9 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
    if (params.n_threads_batch != -1) {
        os << " (n_threads_batch = " << params.n_threads_batch << ")";
    }
+    if (params.n_threads_mtmd != -1) {
+        os << " (n_threads_mtmd = " << params.n_threads_mtmd << ")";
+    }
    os << " / " << std::thread::hardware_concurrency() << " | " << llama_print_system_info();

    return os.str();
@ -3021,7 +3033,7 @@ std::string string_lower(const std::string& str) {
    std::string result = str;
    for (char& c : result) {
        if (c >= 'A' && c <= 'Z') {
-            c = static_cast<char>(c + ('a' - 'A')); 
+            c = static_cast<char>(c + ('a' - 'A'));
        }
    }
    return result;
--- a/common/common.h
+++ b/common/common.h
@ -410,6 +410,7 @@ struct gpt_params {
    int image_min_tokens = -1;
    int image_max_tokens = -1;
    std::string mtmd_kq_type = "f32";
+    int32_t n_threads_mtmd = -1; // number of threads to use for multimodal processing (-1 = use n_threads_batch, then n_threads)

    // embedding
    bool embedding         = false; // get only sentence embedding
--- a/examples/mtmd/mtmd-cli.cpp
+++ b/examples/mtmd/mtmd-cli.cpp
@ -182,7 +182,9 @@ struct mtmd_cli_context {
        mtmd_context_params mparams = mtmd_context_params_default();
        mparams.use_gpu          = params.mmproj_use_gpu;
        mparams.print_timings    = true;
-        mparams.n_threads        = params.n_threads;
+        mparams.n_threads        = params.n_threads_mtmd != -1 ? params.n_threads_mtmd
+                                   : params.n_threads_batch != -1 ? params.n_threads_batch
+                                                                  : params.n_threads;
        mparams.verbosity        = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
        mparams.flash_attn_type = params.flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED;
        mparams.image_min_tokens = params.image_min_tokens;
--- a/examples/server/server-context.cpp
+++ b/examples/server/server-context.cpp
@ -292,7 +292,9 @@ bool server_context::load_model(const gpt_params& params_) {
        mtmd_context_params mparams = mtmd_context_params_default();
        mparams.use_gpu = params_base.mmproj_use_gpu;
        mparams.print_timings = false;
-        mparams.n_threads = params_base.n_threads;
+        mparams.n_threads = params_base.n_threads_mtmd != -1 ? params_base.n_threads_mtmd
+                             : params_base.n_threads_batch != -1 ? params_base.n_threads_batch
+                                                                 : params_base.n_threads;
        mparams.flash_attn_type = params_base.flash_attn ? LLAMA_FLASH_ATTN_TYPE_ENABLED : LLAMA_FLASH_ATTN_TYPE_DISABLED;
        mparams.verbosity = params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
        mparams.image_min_tokens = params_base.image_min_tokens;