diff --git a/src/llama-quantize.cpp b/src/llama-quantize.cpp index 1f538882..1e292e48 100644 --- a/src/llama-quantize.cpp +++ b/src/llama-quantize.cpp @@ -1250,8 +1250,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // - qs.n_attention_wv == 3 * model.hparams.n_layer for Encoder-Decoder models // - model.arch == LLM_ARCH_DECI for Deci-Nemotron models // - GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer || qs.n_attention_wv == 3 * (int)model.hparams.n_layer || - model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_UNKNOWN) && "n_attention_wv is unexpected"); + GGML_ASSERT((qs.n_attention_wv == 0 || + qs.n_attention_wv == (int)model.hparams.n_layer || + qs.n_attention_wv == 3 * (int)model.hparams.n_layer || + model.arch == LLM_ARCH_DECI || + model.arch == LLM_ARCH_GEMMA4 || + model.arch == LLM_ARCH_UNKNOWN) && "n_attention_wv is unexpected"); size_t total_size_org = 0; size_t total_size_new = 0;