mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2026-06-27 23:50:20 -05:00
cont : avoid computations on the CPU
This commit is contained in:
parent
d78a3864f0
commit
f0438b1b15
@ -767,8 +767,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
|
||||
{LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_NEXTN_PRE_PROJ, {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_NEXTN_POST_PROJ, {LLM_TENSOR_LAYER_INPUT, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_NEXTN_PRE_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
|
||||
{LLM_TENSOR_NEXTN_POST_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
|
||||
// NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the
|
||||
// last nextn_predict_layers blocks carry them. Classify as LAYER_REPEATING so
|
||||
// the model loader doesn't fault on the block index.
|
||||
|
||||
@ -557,8 +557,8 @@ enum llm_tensor {
|
||||
LLM_TENSOR_INDEXER_PROJ,
|
||||
LLM_TENSOR_INDEXER_ATTN_K,
|
||||
LLM_TENSOR_INDEXER_ATTN_Q_B,
|
||||
LLM_TENSOR_NEXTN_PRE_PROJ,
|
||||
LLM_TENSOR_NEXTN_POST_PROJ,
|
||||
LLM_TENSOR_NEXTN_PRE_PROJ, // TODO: rename to PROJ_PRE
|
||||
LLM_TENSOR_NEXTN_POST_PROJ, // TODO: rename to PROJ_POST
|
||||
LLM_TENSOR_NEXTN_EH_PROJ,
|
||||
LLM_TENSOR_NEXTN_EMBED_TOKENS,
|
||||
LLM_TENSOR_NEXTN_ENORM,
|
||||
|
||||
@ -39,7 +39,6 @@ void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) {
|
||||
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
|
||||
|
||||
const int64_t n_embd_backbone = hparams.n_embd_out();
|
||||
nextn_pre_proj = create_tensor(tn(LLM_TENSOR_NEXTN_PRE_PROJ, "weight"), { 2*n_embd_backbone, n_embd }, 0);
|
||||
nextn_post_proj = create_tensor(tn(LLM_TENSOR_NEXTN_POST_PROJ, "weight"), { n_embd, n_embd_backbone }, 0);
|
||||
|
||||
int rope_freqs_flag = 0;
|
||||
@ -51,6 +50,10 @@ void llama_model_gemma4_assistant::load_arch_tensors(llama_model_loader &) {
|
||||
const int64_t n_embd_head = hparams.n_embd_head_k(i);
|
||||
const int64_t n_ff = hparams.n_ff(i);
|
||||
|
||||
if (i == 0) {
|
||||
nextn_pre_proj = create_tensor(tn(LLM_TENSOR_NEXTN_PRE_PROJ, "weight", i), { 2*n_embd_backbone, n_embd }, 0);
|
||||
}
|
||||
|
||||
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head*n_head }, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head*n_head, n_embd }, 0);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user