Fix and restrict NVFP4 edge-cases in llama-graph (#24331)

* Move the post-GEMM MUL required for dequantization before the LoRA and bias add

see https://github.com/ggml-org/llama.cpp/pull/23484 :
1. For LoRA, I presume we want fully dequantized values before
   adding the residuals, but this depends on how the LoRAs were
   generated. The literature indicates that LoRA is applied post-mul
   but pre-bias-add; see https://github.com/ggml-org/llama.cpp/pull/8332
2. For ModelOPT, bias-add should happen on fully-dequantized values
   (see modelopt/torch/quantization/backends/nvfp4_gemm.py, lines 59-64,
   at revision b49f9b9e2d)

* Restrict build_ffn for NVFP4 to supported combinations
This commit is contained in:
Oliver Simons 2026-06-16 11:52:38 +02:00 committed by GitHub
parent a1824902b5
commit 02810c7aa8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 61 additions and 47 deletions

View File

@ -1088,6 +1088,10 @@ ggml_tensor * llm_graph_context::build_lora_mm(
ggml_tensor * w_s) const {
ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
if (w_s) {
res = ggml_mul(ctx0, res, w_s);
}
for (const auto & lora : *loras) {
llama_adapter_lora_weight * lw = lora.first->get_weight(w);
if (lw == nullptr) {
@ -1106,18 +1110,24 @@ ggml_tensor * llm_graph_context::build_lora_mm(
res = ggml_add(ctx0, res, ab_cur);
}
if (w_s) {
res = ggml_mul(ctx0, res, w_s);
}
return res;
}
ggml_tensor * llm_graph_context::build_lora_mm_id(
ggml_tensor * w, // ggml_tensor * as
ggml_tensor * cur, // ggml_tensor * b
ggml_tensor * ids) const {
ggml_tensor * ids,
ggml_tensor * w_s) const {
ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids);
if (w_s) {
const int64_t n_expert = w_s->ne[0];
const int64_t n_tokens = cur->ne[2];
ggml_tensor * s = ggml_reshape_3d(ctx0, w_s, 1, n_expert, 1);
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
s = ggml_get_rows(ctx0, s, ids);
res = ggml_mul(ctx0, res, s);
}
for (const auto & lora : *loras) {
llama_adapter_lora_weight * lw = lora.first->get_weight(w);
if (lw == nullptr) {
@ -1269,6 +1279,29 @@ ggml_tensor * llm_graph_context::build_ffn(
llm_ffn_op_type type_op,
llm_ffn_gate_type type_gate,
int il) const {
// NVFP4 support is currently restricted to
// 1) LORA absence (*_s would be applied after LORA residual, which is incorrect)
// 2) bias absence (*_s would be applied after bias addition, which is incorrect)
// TODO: disambiguate LLM-architectural scales (which use *_s) from NVFP4 scale_2 (which also uses *_s currently)
auto has_lora = [this](ggml_tensor * w) {
if (!w) {
return false;
}
for (const auto & lora : *loras) {
if (lora.first->get_weight(w) != nullptr) {
return true;
}
}
return false;
};
GGML_ASSERT(!up_s || !up_b || !up || up->type != GGML_TYPE_NVFP4);
GGML_ASSERT(!gate_s || !gate_b || !gate || gate->type != GGML_TYPE_NVFP4);
GGML_ASSERT(!down_s || !down_b || !down || down->type != GGML_TYPE_NVFP4);
GGML_ASSERT(!up_s || !up || up->type != GGML_TYPE_NVFP4 || !has_lora(up));
GGML_ASSERT(!gate_s || !gate || gate->type != GGML_TYPE_NVFP4 || !has_lora(gate));
GGML_ASSERT(!down_s || !down || down->type != GGML_TYPE_NVFP4 || !has_lora(down));
ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur;
cb(tmp, "ffn_up", il);
@ -1627,23 +1660,18 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
if (gate_up_exps) {
// merged gate_up path: one mul_mat_id, then split into gate and up views
ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts); // [n_ff*2, n_expert_used, n_tokens]
ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts, up_exps_s); // [n_ff*2, n_expert_used, n_tokens]
cb(gate_up, "ffn_moe_gate_up", il);
if (up_exps_s) {
cb(gate_up, "ffn_moe_gate_up_scaled", il);
}
if (gate_up_exps_b) {
gate_up = ggml_add_id(ctx0, gate_up, gate_up_exps_b, selected_experts);
cb(gate_up, "ffn_moe_gate_up_biased", il);
}
// apply per-expert scale2 to merged gate_up (use up_exps_s since gate and up are fused)
if (up_exps_s) {
ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
gate_up = ggml_mul(ctx0, gate_up, s);
cb(gate_up, "ffn_moe_gate_up_scaled", il);
}
const int64_t n_ff = gate_up->ne[0] / 2;
cur = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], 0);
cb(cur, "ffn_moe_gate", il);
@ -1651,43 +1679,33 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cb(up, "ffn_moe_up", il);
} else {
// separate gate and up path
up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
up = build_lora_mm_id(up_exps, cur, selected_experts, up_exps_s); // [n_ff, n_expert_used, n_tokens]
cb(up, "ffn_moe_up", il);
if (up_exps_s) {
cb(up, "ffn_moe_up_scaled", il);
}
if (up_exps_b) {
up = ggml_add_id(ctx0, up, up_exps_b, selected_experts);
cb(up, "ffn_moe_up_biased", il);
}
// apply per-expert scale2 to up
if (up_exps_s) {
ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1);
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
up = ggml_mul(ctx0, up, s);
cb(up, "ffn_moe_up_scaled", il);
}
if (gate_exps) {
cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
cur = build_lora_mm_id(gate_exps, cur, selected_experts, gate_exps_s); // [n_ff, n_expert_used, n_tokens]
cb(cur, "ffn_moe_gate", il);
} else {
cur = up;
}
if (gate_exps_s) {
cb(cur, "ffn_moe_gate_scaled", il);
}
if (gate_exps_b) {
cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts);
cb(cur, "ffn_moe_gate_biased", il);
}
// apply per-expert scale2 to gate
if (gate_exps_s) {
ggml_tensor * s = ggml_reshape_3d(ctx0, gate_exps_s, 1, n_expert, 1);
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
cur = ggml_mul(ctx0, cur, s);
cb(cur, "ffn_moe_gate_scaled", il);
}
}
const bool has_gate = gate_exps || gate_up_exps;
@ -1759,23 +1777,18 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
GGML_ABORT("fatal error");
}
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
experts = build_lora_mm_id(down_exps, cur, selected_experts, down_exps_s); // [n_embd, n_expert_used, n_tokens]
cb(experts, "ffn_moe_down", il);
if (down_exps_s) {
cb(experts, "ffn_moe_down_scaled", il);
}
if (down_exps_b) {
experts = ggml_add_id(ctx0, experts, down_exps_b, selected_experts);
cb(experts, "ffn_moe_down_biased", il);
}
// apply per-expert scale2 to down
if (down_exps_s) {
ggml_tensor * s = ggml_reshape_3d(ctx0, down_exps_s, 1, n_expert, 1);
s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1);
s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens]
experts = ggml_mul(ctx0, experts, s);
cb(experts, "ffn_moe_down_scaled", il);
}
if (!weight_before_ffn) {
experts = ggml_mul(ctx0, experts, weights);
cb(experts, "ffn_moe_weighted", il);

View File

@ -853,11 +853,12 @@ struct llm_graph_context {
ggml_tensor * cur,
ggml_tensor * w_s = nullptr) const;
// do mat_mul_id, while optionally apply lora
// do mat_mul_id, while optionally apply lora and per-expert scale
ggml_tensor * build_lora_mm_id(
ggml_tensor * w, // ggml_tensor * as
ggml_tensor * cur, // ggml_tensor * b
ggml_tensor * ids) const;
ggml_tensor * ids,
ggml_tensor * w_s = nullptr) const;
ggml_tensor * build_norm(
ggml_tensor * cur,