Merge pull request #1972 from ikawrakow/ik/minimaxm3_smgraph

Split mode graph for MiniMax-M3
2026-06-28 04:30:15 -05:00 · 2026-06-15 13:44:19 +02:00 · 2026-06-15 13:44:19 +02:00 · f81673c7db
commit f81673c7db
parent e927adc4ad c24d50dd88
8 changed files with 43 additions and 18 deletions
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@ -3488,9 +3488,16 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
        }
    }

-    //printf("%s: using limit = %g\n", __func__, limit);
-    ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(dst),
-                    (const float *)dst->data, dst_up.get(), (float *)dst->data, limit);
+    auto unary_op = (ggml_unary_op)dst->op_params[0];
+    if (unary_op == GGML_UNARY_OP_SWIGLU_OAI) {
+        ggml_swiglu_oai_cuda_f32((const float *)dst->data, (const float *)dst_up.get(),
+                (float *)dst->data, ggml_nelements(dst), dst->ne[0],  dst->ne[0],  dst->ne[0],
+                1.702f, 7.0f, stream);
+    } else {
+        //printf("%s: using limit = %g\n", __func__, limit);
+        ggml_fused_mul_unary(ctx, unary_op, ggml_nelements(dst),
+                (const float *)dst->data, dst_up.get(), (float *)dst->data, limit);
+    }
    CUDA_CHECK(cudaGetLastError());

 }
--- a/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh
+++ b/ggml/src/ggml-cuda/iqk_mmvq_templates.cuh
@ -190,6 +190,13 @@ static __device__ void iqk_fused_mul_mat_vec_q_kernel(
            float u = tmp_u[j][threadIdx.x];
            float g = tmp_g[j][threadIdx.x];
            float r;
+            if (unary_op == GGML_UNARY_OP_SWIGLU_OAI && !bias_u) {
+                constexpr float alpha = 1.702f;
+                constexpr float limit = 7.0f;
+                g = fminf(g, limit);
+                u = fmaxf(fminf(u, limit), -limit);
+                r = g / (1.0f + expf(-g * alpha)) * (1.0f + u);
+            } else {
            switch (unary_op) {
                case GGML_UNARY_OP_SILU:
                    {
@ -214,6 +221,7 @@ static __device__ void iqk_fused_mul_mat_vec_q_kernel(
                    r = g / (1.0f + expf(-g * alpha)) * (1.0f + u);
                } break;
            }
+            }
            dst[j*nrows_dst + row0 + threadIdx.x] = r;
        }
    }
--- a/ggml/src/ggml-cuda/mmvq-templates.cuh
+++ b/ggml/src/ggml-cuda/mmvq-templates.cuh
@ -242,6 +242,13 @@ static __device__ void k_fused_mul_mat_vec_q(
            float u = tmp_u[j][threadIdx.x];
            float g = tmp_g[j][threadIdx.x];
            float r;
+            if (unary_op == GGML_UNARY_OP_SWIGLU_OAI && !bias_u) {
+                constexpr float alpha = 1.702f;
+                constexpr float limit = 7.0f;
+                g = fminf(g, limit);
+                u = fmaxf(fminf(u, limit), -limit);
+                r = g / (1.0f + expf(-g * alpha)) * (1.0f + u);
+            } else {
            switch (unary_op) {
                case GGML_UNARY_OP_SILU:
                    {
@ -266,6 +273,7 @@ static __device__ void k_fused_mul_mat_vec_q(
                  r = g / (1.0f + expf(-g * alpha)) * (1.0f + u);
                } break;
            }
+            }
            dst[j*nrows_dst + row0 + threadIdx.x] = r;
        }
    }
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@ -262,7 +262,8 @@ void ggml_cuda_op_fused_mul_mat_vec_q_id(ggml_backend_cuda_context & ctx,
    if (!bias_u && !bias_g) {
        GGML_ASSERT(unary_op == GGML_UNARY_OP_SILU ||
                    unary_op == GGML_UNARY_OP_RELU ||
-                    unary_op == GGML_UNARY_OP_GELU);
+                    unary_op == GGML_UNARY_OP_GELU ||
+                    unary_op == GGML_UNARY_OP_SWIGLU_OAI);
    } else {
        GGML_ASSERT(unary_op == GGML_UNARY_OP_SWIGLU_OAI);
        GGML_ASSERT(bias_u && bias_g);
--- a/src/graphs/build_minimaxm3.cpp
+++ b/src/graphs/build_minimaxm3.cpp
@ -47,7 +47,7 @@ ggml_cgraph* llm_build_context::build_minimaxm3() {
                    model.layers[il].ffn_down_shexp,
                    nullptr,
                    n_expert, n_expert_used,
-                    LLM_FFN_SWIGLU_OAI_MOE,
+                    LLM_FFN_SWIGLU_OAI,
                    hparams.expert_weights_norm,
                    hparams.expert_weights_scale != 0.0f, hparams.expert_weights_scale,
                    (llm_expert_gating_func_type) hparams.expert_gating_func,
--- a/src/graphs/build_openai.cpp
+++ b/src/graphs/build_openai.cpp
@ -43,9 +43,9 @@ ggml_cgraph * llm_build_context::build_openai_moe() {
                nullptr,
                nullptr,  nullptr, nullptr,  nullptr, nullptr,  nullptr, // no shared experts
                n_expert, n_expert_used,
-                LLM_FFN_SWIGLU_OAI_MOE, false, false, 0.0f,
+                LLM_FFN_SWIGLU_OAI, false, false, 0.0f,
                LLM_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
-                LLM_FFN_SWIGLU_OAI_MOE, cb, il, gf, true,
+                LLM_FFN_SWIGLU_OAI, cb, il, gf, true,
                model.layers[il].ffn_up_gate_exps, model.layers[il].ffn_up_gate_exps_b);

        cur = lctx.cvec.apply_to(ctx0, cur, il);
--- a/src/llama-build-context.cpp
+++ b/src/llama-build-context.cpp
@ -755,9 +755,10 @@ ggml_tensor * llm_build_context::llm_build_ffn(

    if (!up_b && !up_s && !gate_b && !gate_s && !down_b && !down_s &&
        up->extra && gate->extra && down->extra && type_gate == LLM_FFN_PAR &&
-        (type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) {
+        (type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || type_op == LLM_FFN_SWIGLU_OAI || (type_op == LLM_FFN_GELU && !act_scales))) {
        auto unary_op = type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
-                        type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU;
+                        type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU :
+                        type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI;
        auto u = (ggml_split_tensor_t *)up->extra;
        auto g = (ggml_split_tensor_t *)gate->extra;
        auto d = (ggml_split_tensor_t *)down->extra;
@ -833,9 +834,10 @@ ggml_tensor * llm_build_context::llm_build_ffn(

    if (lctx.cparams.fused_up_gate &&
        up && gate && !up_b && !up_s && !gate_b && !gate_s && type_gate == LLM_FFN_PAR &&
-        (type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) {
+        (type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || type_op == LLM_FFN_SWIGLU_OAI || (type_op == LLM_FFN_GELU && !act_scales))) {
        auto unary_op = type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
-                        type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU;
+                        type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU :
+                        type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI;
        cur = ggml_fused_up_gate(ctx, up, gate, cur, unary_op);
        cb(cur, "ffn_up_gate", il);
        if (lctx.model.arch == LLM_ARCH_STEP35) {
@ -1153,16 +1155,16 @@ llm_expert_gating_func_type   gating_op,
    // Hence, if we have biases, we cannot use fmoe.
    //
    //bool can_use_fmoe = !up_exps_b && !gate_exps_b && (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU);
-    bool can_use_fmoe = (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU || type_op == LLM_FFN_SWIGLU_OAI_MOE);
+    bool can_use_fmoe = (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU || type_op == LLM_FFN_SWIGLU_OAI);

    ggml_tensor * par;
    if (can_use_fmoe && up_gate_exps) {
-        if (up_gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI_MOE) {
+        if (up_gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI) {
            par = ggml_moe_up_gate_ext(ctx, up_gate_exps, nullptr, cur, selected_experts, up_gate_exps_b, nullptr,
                    type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
                    type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI);
        } else {
-            GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI_MOE);
+            GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI);
            par = ggml_moe_up_gate(ctx, up_gate_exps, nullptr, cur, selected_experts,
                    type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
        }
@ -1173,12 +1175,12 @@ llm_expert_gating_func_type   gating_op,
    GGML_ASSERT(!up_gate_exps && !up_gate_exps_b);

    if (can_use_fmoe && lctx.cparams.fused_moe_up_gate && up_exps->type == gate_exps->type) {
-        if (up_exps_b || gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI_MOE) {
+        if (up_exps_b || gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI) {
            par = ggml_moe_up_gate_ext(ctx, up_exps, gate_exps, cur, selected_experts, up_exps_b, gate_exps_b,
                    type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
                    type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI);
        } else {
-            GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI_MOE);
+            GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI);
            par = ggml_moe_up_gate(ctx, up_exps, gate_exps, cur, selected_experts,
                    type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
        }
@ -1213,7 +1215,7 @@ llm_expert_gating_func_type   gating_op,
            if (lctx.model.arch == LLM_ARCH_STEP35) {
                *((float *)(par->op_params + 1)) = lctx.model.hparams.swiglu_limits[il];
            }
-        } else if (type_op == LLM_FFN_SWIGLU_OAI_MOE) {
+        } else if (type_op == LLM_FFN_SWIGLU_OAI) {
            constexpr float alpha = 1.702f;
            constexpr float limit = 7.0f;
            par = ggml_swiglu_oai(ctx, gate, up, alpha, limit);
--- a/src/llama-build-context.h
+++ b/src/llama-build-context.h
@ -25,7 +25,6 @@ enum llm_ffn_op_type {
    LLM_FFN_RELU_SQR,
    LLM_FFN_SWIGLU,
    LLM_FFN_SWIGLU_OAI,
-    LLM_FFN_SWIGLU_OAI_MOE,
 };

 enum llm_ffn_gate_type {