Merge pull request #1972 from ikawrakow/ik/minimaxm3_smgraph

Split mode graph for MiniMax-M3
This commit is contained in:
Kawrakow 2026-06-15 13:44:19 +02:00 committed by GitHub
commit f81673c7db
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 43 additions and 18 deletions

View File

@ -3488,9 +3488,16 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
}
}
//printf("%s: using limit = %g\n", __func__, limit);
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(dst),
(const float *)dst->data, dst_up.get(), (float *)dst->data, limit);
auto unary_op = (ggml_unary_op)dst->op_params[0];
if (unary_op == GGML_UNARY_OP_SWIGLU_OAI) {
ggml_swiglu_oai_cuda_f32((const float *)dst->data, (const float *)dst_up.get(),
(float *)dst->data, ggml_nelements(dst), dst->ne[0], dst->ne[0], dst->ne[0],
1.702f, 7.0f, stream);
} else {
//printf("%s: using limit = %g\n", __func__, limit);
ggml_fused_mul_unary(ctx, unary_op, ggml_nelements(dst),
(const float *)dst->data, dst_up.get(), (float *)dst->data, limit);
}
CUDA_CHECK(cudaGetLastError());
}

View File

@ -190,6 +190,13 @@ static __device__ void iqk_fused_mul_mat_vec_q_kernel(
float u = tmp_u[j][threadIdx.x];
float g = tmp_g[j][threadIdx.x];
float r;
if (unary_op == GGML_UNARY_OP_SWIGLU_OAI && !bias_u) {
constexpr float alpha = 1.702f;
constexpr float limit = 7.0f;
g = fminf(g, limit);
u = fmaxf(fminf(u, limit), -limit);
r = g / (1.0f + expf(-g * alpha)) * (1.0f + u);
} else {
switch (unary_op) {
case GGML_UNARY_OP_SILU:
{
@ -214,6 +221,7 @@ static __device__ void iqk_fused_mul_mat_vec_q_kernel(
r = g / (1.0f + expf(-g * alpha)) * (1.0f + u);
} break;
}
}
dst[j*nrows_dst + row0 + threadIdx.x] = r;
}
}

View File

@ -242,6 +242,13 @@ static __device__ void k_fused_mul_mat_vec_q(
float u = tmp_u[j][threadIdx.x];
float g = tmp_g[j][threadIdx.x];
float r;
if (unary_op == GGML_UNARY_OP_SWIGLU_OAI && !bias_u) {
constexpr float alpha = 1.702f;
constexpr float limit = 7.0f;
g = fminf(g, limit);
u = fmaxf(fminf(u, limit), -limit);
r = g / (1.0f + expf(-g * alpha)) * (1.0f + u);
} else {
switch (unary_op) {
case GGML_UNARY_OP_SILU:
{
@ -266,6 +273,7 @@ static __device__ void k_fused_mul_mat_vec_q(
r = g / (1.0f + expf(-g * alpha)) * (1.0f + u);
} break;
}
}
dst[j*nrows_dst + row0 + threadIdx.x] = r;
}
}

View File

@ -262,7 +262,8 @@ void ggml_cuda_op_fused_mul_mat_vec_q_id(ggml_backend_cuda_context & ctx,
if (!bias_u && !bias_g) {
GGML_ASSERT(unary_op == GGML_UNARY_OP_SILU ||
unary_op == GGML_UNARY_OP_RELU ||
unary_op == GGML_UNARY_OP_GELU);
unary_op == GGML_UNARY_OP_GELU ||
unary_op == GGML_UNARY_OP_SWIGLU_OAI);
} else {
GGML_ASSERT(unary_op == GGML_UNARY_OP_SWIGLU_OAI);
GGML_ASSERT(bias_u && bias_g);

View File

@ -47,7 +47,7 @@ ggml_cgraph* llm_build_context::build_minimaxm3() {
model.layers[il].ffn_down_shexp,
nullptr,
n_expert, n_expert_used,
LLM_FFN_SWIGLU_OAI_MOE,
LLM_FFN_SWIGLU_OAI,
hparams.expert_weights_norm,
hparams.expert_weights_scale != 0.0f, hparams.expert_weights_scale,
(llm_expert_gating_func_type) hparams.expert_gating_func,

View File

@ -43,9 +43,9 @@ ggml_cgraph * llm_build_context::build_openai_moe() {
nullptr,
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // no shared experts
n_expert, n_expert_used,
LLM_FFN_SWIGLU_OAI_MOE, false, false, 0.0f,
LLM_FFN_SWIGLU_OAI, false, false, 0.0f,
LLM_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
LLM_FFN_SWIGLU_OAI_MOE, cb, il, gf, true,
LLM_FFN_SWIGLU_OAI, cb, il, gf, true,
model.layers[il].ffn_up_gate_exps, model.layers[il].ffn_up_gate_exps_b);
cur = lctx.cvec.apply_to(ctx0, cur, il);

View File

@ -755,9 +755,10 @@ ggml_tensor * llm_build_context::llm_build_ffn(
if (!up_b && !up_s && !gate_b && !gate_s && !down_b && !down_s &&
up->extra && gate->extra && down->extra && type_gate == LLM_FFN_PAR &&
(type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) {
(type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || type_op == LLM_FFN_SWIGLU_OAI || (type_op == LLM_FFN_GELU && !act_scales))) {
auto unary_op = type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU;
type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU :
type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI;
auto u = (ggml_split_tensor_t *)up->extra;
auto g = (ggml_split_tensor_t *)gate->extra;
auto d = (ggml_split_tensor_t *)down->extra;
@ -833,9 +834,10 @@ ggml_tensor * llm_build_context::llm_build_ffn(
if (lctx.cparams.fused_up_gate &&
up && gate && !up_b && !up_s && !gate_b && !gate_s && type_gate == LLM_FFN_PAR &&
(type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) {
(type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || type_op == LLM_FFN_SWIGLU_OAI || (type_op == LLM_FFN_GELU && !act_scales))) {
auto unary_op = type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU;
type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU :
type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI;
cur = ggml_fused_up_gate(ctx, up, gate, cur, unary_op);
cb(cur, "ffn_up_gate", il);
if (lctx.model.arch == LLM_ARCH_STEP35) {
@ -1153,16 +1155,16 @@ llm_expert_gating_func_type gating_op,
// Hence, if we have biases, we cannot use fmoe.
//
//bool can_use_fmoe = !up_exps_b && !gate_exps_b && (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU);
bool can_use_fmoe = (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU || type_op == LLM_FFN_SWIGLU_OAI_MOE);
bool can_use_fmoe = (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU || type_op == LLM_FFN_SWIGLU_OAI);
ggml_tensor * par;
if (can_use_fmoe && up_gate_exps) {
if (up_gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI_MOE) {
if (up_gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI) {
par = ggml_moe_up_gate_ext(ctx, up_gate_exps, nullptr, cur, selected_experts, up_gate_exps_b, nullptr,
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI);
} else {
GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI_MOE);
GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI);
par = ggml_moe_up_gate(ctx, up_gate_exps, nullptr, cur, selected_experts,
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
}
@ -1173,12 +1175,12 @@ llm_expert_gating_func_type gating_op,
GGML_ASSERT(!up_gate_exps && !up_gate_exps_b);
if (can_use_fmoe && lctx.cparams.fused_moe_up_gate && up_exps->type == gate_exps->type) {
if (up_exps_b || gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI_MOE) {
if (up_exps_b || gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI) {
par = ggml_moe_up_gate_ext(ctx, up_exps, gate_exps, cur, selected_experts, up_exps_b, gate_exps_b,
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI);
} else {
GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI_MOE);
GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI);
par = ggml_moe_up_gate(ctx, up_exps, gate_exps, cur, selected_experts,
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
}
@ -1213,7 +1215,7 @@ llm_expert_gating_func_type gating_op,
if (lctx.model.arch == LLM_ARCH_STEP35) {
*((float *)(par->op_params + 1)) = lctx.model.hparams.swiglu_limits[il];
}
} else if (type_op == LLM_FFN_SWIGLU_OAI_MOE) {
} else if (type_op == LLM_FFN_SWIGLU_OAI) {
constexpr float alpha = 1.702f;
constexpr float limit = 7.0f;
par = ggml_swiglu_oai(ctx, gate, up, alpha, limit);

View File

@ -25,7 +25,6 @@ enum llm_ffn_op_type {
LLM_FFN_RELU_SQR,
LLM_FFN_SWIGLU,
LLM_FFN_SWIGLU_OAI,
LLM_FFN_SWIGLU_OAI_MOE,
};
enum llm_ffn_gate_type {