mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
Merge pull request #1972 from ikawrakow/ik/minimaxm3_smgraph
Split mode graph for MiniMax-M3
This commit is contained in:
commit
f81673c7db
@ -3488,9 +3488,16 @@ static void ggml_cuda_up_gate_unary(ggml_backend_cuda_context & ctx, ggml_tensor
|
||||
}
|
||||
}
|
||||
|
||||
//printf("%s: using limit = %g\n", __func__, limit);
|
||||
ggml_fused_mul_unary(ctx, (ggml_unary_op)dst->op_params[0], ggml_nelements(dst),
|
||||
(const float *)dst->data, dst_up.get(), (float *)dst->data, limit);
|
||||
auto unary_op = (ggml_unary_op)dst->op_params[0];
|
||||
if (unary_op == GGML_UNARY_OP_SWIGLU_OAI) {
|
||||
ggml_swiglu_oai_cuda_f32((const float *)dst->data, (const float *)dst_up.get(),
|
||||
(float *)dst->data, ggml_nelements(dst), dst->ne[0], dst->ne[0], dst->ne[0],
|
||||
1.702f, 7.0f, stream);
|
||||
} else {
|
||||
//printf("%s: using limit = %g\n", __func__, limit);
|
||||
ggml_fused_mul_unary(ctx, unary_op, ggml_nelements(dst),
|
||||
(const float *)dst->data, dst_up.get(), (float *)dst->data, limit);
|
||||
}
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
}
|
||||
|
||||
@ -190,6 +190,13 @@ static __device__ void iqk_fused_mul_mat_vec_q_kernel(
|
||||
float u = tmp_u[j][threadIdx.x];
|
||||
float g = tmp_g[j][threadIdx.x];
|
||||
float r;
|
||||
if (unary_op == GGML_UNARY_OP_SWIGLU_OAI && !bias_u) {
|
||||
constexpr float alpha = 1.702f;
|
||||
constexpr float limit = 7.0f;
|
||||
g = fminf(g, limit);
|
||||
u = fmaxf(fminf(u, limit), -limit);
|
||||
r = g / (1.0f + expf(-g * alpha)) * (1.0f + u);
|
||||
} else {
|
||||
switch (unary_op) {
|
||||
case GGML_UNARY_OP_SILU:
|
||||
{
|
||||
@ -214,6 +221,7 @@ static __device__ void iqk_fused_mul_mat_vec_q_kernel(
|
||||
r = g / (1.0f + expf(-g * alpha)) * (1.0f + u);
|
||||
} break;
|
||||
}
|
||||
}
|
||||
dst[j*nrows_dst + row0 + threadIdx.x] = r;
|
||||
}
|
||||
}
|
||||
|
||||
@ -242,6 +242,13 @@ static __device__ void k_fused_mul_mat_vec_q(
|
||||
float u = tmp_u[j][threadIdx.x];
|
||||
float g = tmp_g[j][threadIdx.x];
|
||||
float r;
|
||||
if (unary_op == GGML_UNARY_OP_SWIGLU_OAI && !bias_u) {
|
||||
constexpr float alpha = 1.702f;
|
||||
constexpr float limit = 7.0f;
|
||||
g = fminf(g, limit);
|
||||
u = fmaxf(fminf(u, limit), -limit);
|
||||
r = g / (1.0f + expf(-g * alpha)) * (1.0f + u);
|
||||
} else {
|
||||
switch (unary_op) {
|
||||
case GGML_UNARY_OP_SILU:
|
||||
{
|
||||
@ -266,6 +273,7 @@ static __device__ void k_fused_mul_mat_vec_q(
|
||||
r = g / (1.0f + expf(-g * alpha)) * (1.0f + u);
|
||||
} break;
|
||||
}
|
||||
}
|
||||
dst[j*nrows_dst + row0 + threadIdx.x] = r;
|
||||
}
|
||||
}
|
||||
|
||||
@ -262,7 +262,8 @@ void ggml_cuda_op_fused_mul_mat_vec_q_id(ggml_backend_cuda_context & ctx,
|
||||
if (!bias_u && !bias_g) {
|
||||
GGML_ASSERT(unary_op == GGML_UNARY_OP_SILU ||
|
||||
unary_op == GGML_UNARY_OP_RELU ||
|
||||
unary_op == GGML_UNARY_OP_GELU);
|
||||
unary_op == GGML_UNARY_OP_GELU ||
|
||||
unary_op == GGML_UNARY_OP_SWIGLU_OAI);
|
||||
} else {
|
||||
GGML_ASSERT(unary_op == GGML_UNARY_OP_SWIGLU_OAI);
|
||||
GGML_ASSERT(bias_u && bias_g);
|
||||
|
||||
@ -47,7 +47,7 @@ ggml_cgraph* llm_build_context::build_minimaxm3() {
|
||||
model.layers[il].ffn_down_shexp,
|
||||
nullptr,
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SWIGLU_OAI_MOE,
|
||||
LLM_FFN_SWIGLU_OAI,
|
||||
hparams.expert_weights_norm,
|
||||
hparams.expert_weights_scale != 0.0f, hparams.expert_weights_scale,
|
||||
(llm_expert_gating_func_type) hparams.expert_gating_func,
|
||||
|
||||
@ -43,9 +43,9 @@ ggml_cgraph * llm_build_context::build_openai_moe() {
|
||||
nullptr,
|
||||
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // no shared experts
|
||||
n_expert, n_expert_used,
|
||||
LLM_FFN_SWIGLU_OAI_MOE, false, false, 0.0f,
|
||||
LLM_FFN_SWIGLU_OAI, false, false, 0.0f,
|
||||
LLM_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT,
|
||||
LLM_FFN_SWIGLU_OAI_MOE, cb, il, gf, true,
|
||||
LLM_FFN_SWIGLU_OAI, cb, il, gf, true,
|
||||
model.layers[il].ffn_up_gate_exps, model.layers[il].ffn_up_gate_exps_b);
|
||||
|
||||
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
||||
|
||||
@ -755,9 +755,10 @@ ggml_tensor * llm_build_context::llm_build_ffn(
|
||||
|
||||
if (!up_b && !up_s && !gate_b && !gate_s && !down_b && !down_s &&
|
||||
up->extra && gate->extra && down->extra && type_gate == LLM_FFN_PAR &&
|
||||
(type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) {
|
||||
(type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || type_op == LLM_FFN_SWIGLU_OAI || (type_op == LLM_FFN_GELU && !act_scales))) {
|
||||
auto unary_op = type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
|
||||
type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU;
|
||||
type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU :
|
||||
type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI;
|
||||
auto u = (ggml_split_tensor_t *)up->extra;
|
||||
auto g = (ggml_split_tensor_t *)gate->extra;
|
||||
auto d = (ggml_split_tensor_t *)down->extra;
|
||||
@ -833,9 +834,10 @@ ggml_tensor * llm_build_context::llm_build_ffn(
|
||||
|
||||
if (lctx.cparams.fused_up_gate &&
|
||||
up && gate && !up_b && !up_s && !gate_b && !gate_s && type_gate == LLM_FFN_PAR &&
|
||||
(type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) {
|
||||
(type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || type_op == LLM_FFN_SWIGLU_OAI || (type_op == LLM_FFN_GELU && !act_scales))) {
|
||||
auto unary_op = type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
|
||||
type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU;
|
||||
type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU :
|
||||
type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI;
|
||||
cur = ggml_fused_up_gate(ctx, up, gate, cur, unary_op);
|
||||
cb(cur, "ffn_up_gate", il);
|
||||
if (lctx.model.arch == LLM_ARCH_STEP35) {
|
||||
@ -1153,16 +1155,16 @@ llm_expert_gating_func_type gating_op,
|
||||
// Hence, if we have biases, we cannot use fmoe.
|
||||
//
|
||||
//bool can_use_fmoe = !up_exps_b && !gate_exps_b && (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU);
|
||||
bool can_use_fmoe = (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU || type_op == LLM_FFN_SWIGLU_OAI_MOE);
|
||||
bool can_use_fmoe = (type_op == LLM_FFN_SILU || type_op == LLM_FFN_GELU || type_op == LLM_FFN_SWIGLU_OAI);
|
||||
|
||||
ggml_tensor * par;
|
||||
if (can_use_fmoe && up_gate_exps) {
|
||||
if (up_gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI_MOE) {
|
||||
if (up_gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI) {
|
||||
par = ggml_moe_up_gate_ext(ctx, up_gate_exps, nullptr, cur, selected_experts, up_gate_exps_b, nullptr,
|
||||
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
|
||||
type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI);
|
||||
} else {
|
||||
GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI_MOE);
|
||||
GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI);
|
||||
par = ggml_moe_up_gate(ctx, up_gate_exps, nullptr, cur, selected_experts,
|
||||
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
|
||||
}
|
||||
@ -1173,12 +1175,12 @@ llm_expert_gating_func_type gating_op,
|
||||
GGML_ASSERT(!up_gate_exps && !up_gate_exps_b);
|
||||
|
||||
if (can_use_fmoe && lctx.cparams.fused_moe_up_gate && up_exps->type == gate_exps->type) {
|
||||
if (up_exps_b || gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI_MOE) {
|
||||
if (up_exps_b || gate_exps_b || type_op == LLM_FFN_SWIGLU_OAI) {
|
||||
par = ggml_moe_up_gate_ext(ctx, up_exps, gate_exps, cur, selected_experts, up_exps_b, gate_exps_b,
|
||||
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
|
||||
type_op == LLM_FFN_GELU ? GGML_UNARY_OP_GELU : GGML_UNARY_OP_SWIGLU_OAI);
|
||||
} else {
|
||||
GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI_MOE);
|
||||
GGML_ASSERT(type_op != LLM_FFN_SWIGLU_OAI);
|
||||
par = ggml_moe_up_gate(ctx, up_exps, gate_exps, cur, selected_experts,
|
||||
type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU : GGML_UNARY_OP_GELU);
|
||||
}
|
||||
@ -1213,7 +1215,7 @@ llm_expert_gating_func_type gating_op,
|
||||
if (lctx.model.arch == LLM_ARCH_STEP35) {
|
||||
*((float *)(par->op_params + 1)) = lctx.model.hparams.swiglu_limits[il];
|
||||
}
|
||||
} else if (type_op == LLM_FFN_SWIGLU_OAI_MOE) {
|
||||
} else if (type_op == LLM_FFN_SWIGLU_OAI) {
|
||||
constexpr float alpha = 1.702f;
|
||||
constexpr float limit = 7.0f;
|
||||
par = ggml_swiglu_oai(ctx, gate, up, alpha, limit);
|
||||
|
||||
@ -25,7 +25,6 @@ enum llm_ffn_op_type {
|
||||
LLM_FFN_RELU_SQR,
|
||||
LLM_FFN_SWIGLU,
|
||||
LLM_FFN_SWIGLU_OAI,
|
||||
LLM_FFN_SWIGLU_OAI_MOE,
|
||||
};
|
||||
|
||||
enum llm_ffn_gate_type {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user