diff --git a/src/graphs/build_dflash.cpp b/src/graphs/build_dflash.cpp index 273372e6..725d08e5 100644 --- a/src/graphs/build_dflash.cpp +++ b/src/graphs/build_dflash.cpp @@ -315,9 +315,11 @@ ggml_cgraph * llm_build_context::build_dflash() { hparams.attn_soft_cap ? hparams.f_attn_logit_softcapping : 0.0f); cb(cur, "flash_attn", il); ggml_build_forward_expand(gf, cur); - if (use_swa) { - cur->op_params[4] = hparams.n_swa; - } + // Something goes wrong with this optimization. + // I guess the cross context does not mingle well with it. + //if (use_swa) { + // cur->op_params[4] = hparams.n_swa; + //} cur = ggml_reshape_2d(ctx0, cur, model.layers[il].wo->ne[0], n_tokens); cb(cur, "flash_attn_reshaped", il);