diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 19b4d99f..76c81d9e 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1764,89 +1764,89 @@ class ExpertGatingFuncType(IntEnum): # ALL VALUES SHOULD BE THE SAME HERE AS THEY ARE OVER THERE. class LlamaFileType(IntEnum): ALL_F32 = 0 - MOSTLY_F16 = 1 # except 1d tensors - MOSTLY_Q4_0 = 2 # except 1d tensors - MOSTLY_Q4_1 = 3 # except 1d tensors - MOSTLY_Q8_0 = 7 # except 1d tensors - MOSTLY_Q5_0 = 8 # except 1d tensors - MOSTLY_Q5_1 = 9 # except 1d tensors - MOSTLY_Q2_K = 10 # except 1d tensors - MOSTLY_Q3_K_S = 11 # except 1d tensors - MOSTLY_Q3_K_M = 12 # except 1d tensors - MOSTLY_Q3_K_L = 13 # except 1d tensors - MOSTLY_Q4_K_S = 14 # except 1d tensors - MOSTLY_Q4_K_M = 15 # except 1d tensors - MOSTLY_Q5_K_S = 16 # except 1d tensors - MOSTLY_Q5_K_M = 17 # except 1d tensors - MOSTLY_Q6_K = 18 # except 1d tensors - MOSTLY_IQ2_XXS = 19 # except 1d tensors - MOSTLY_IQ2_XS = 20 # except 1d tensors - MOSTLY_Q2_K_S = 21 # except 1d tensors - MOSTLY_IQ3_XS = 22 # except 1d tensors - MOSTLY_IQ3_XXS = 23 # except 1d tensors - MOSTLY_IQ1_S = 24 # except 1d tensors - MOSTLY_IQ4_NL = 25 # except 1d tensors - MOSTLY_IQ3_S = 26 # except 1d tensors - MOSTLY_IQ3_M = 27 # except 1d tensors - MOSTLY_IQ2_S = 28 # except 1d tensors - MOSTLY_IQ2_M = 29 # except 1d tensors - MOSTLY_IQ4_XS = 30 # except 1d tensors - MOSTLY_IQ1_M = 31 # except 1d tensors - MOSTLY_BF16 = 32 # except 1d tensors - MOSTLY_Q4_0_4_4 = 33 # except 1d tensors - MOSTLY_Q4_0_4_8 = 34 # except 1d tensors - MOSTLY_Q4_0_8_8 = 35 # except 1d tensors - MOSTLY_MXFP4 = 38 # except 1d tensors, 38 to be compatible with mainline + MOSTLY_F16 = 1 #except 1d tensors + MOSTLY_Q4_0 = 2 #except 1d tensors + MOSTLY_Q4_1 = 3 #except 1d tensors + MOSTLY_Q8_0 = 7 #except 1d tensors + MOSTLY_Q5_0 = 8 #except 1d tensors + MOSTLY_Q5_1 = 9 #except 1d tensors + MOSTLY_Q2_K = 10 #except 1d tensors + MOSTLY_Q3_K_S = 11 #except 1d tensors + MOSTLY_Q3_K_M = 12 #except 1d tensors + MOSTLY_Q3_K_L = 13 #except 1d tensors + MOSTLY_Q4_K_S = 14 #except 1d tensors + MOSTLY_Q4_K_M = 15 #except 1d tensors + MOSTLY_Q5_K_S = 16 #except 1d tensors + MOSTLY_Q5_K_M = 17 #except 1d tensors + MOSTLY_Q6_K = 18 #except 1d tensors + MOSTLY_IQ2_XXS = 19 #except 1d tensors + MOSTLY_IQ2_XS = 20 #except 1d tensors + MOSTLY_Q2_K_S = 21 #except 1d tensors + MOSTLY_IQ3_XS = 22 #except 1d tensors + MOSTLY_IQ3_XXS = 23 #except 1d tensors + MOSTLY_IQ1_S = 24 #except 1d tensors + MOSTLY_IQ4_NL = 25 #except 1d tensors + MOSTLY_IQ3_S = 26 #except 1d tensors + MOSTLY_IQ3_M = 27 #except 1d tensors + MOSTLY_IQ2_S = 28 #except 1d tensors + MOSTLY_IQ2_M = 29 #except 1d tensors + MOSTLY_IQ4_XS = 30 #except 1d tensors + MOSTLY_IQ1_M = 31 #except 1d tensors + MOSTLY_BF16 = 32 #except 1d tensors + MOSTLY_Q4_0_4_4 = 33 #except 1d tensors + MOSTLY_Q4_0_4_8 = 34 #except 1d tensors + MOSTLY_Q4_0_8_8 = 35 #except 1d tensors + MOSTLY_MXFP4 = 38 #except 1d tensors, 38 to be compatible with mainline - MOSTLY_Q6_0 = 135 # except 1d tensors - MOSTLY_IQ1_BN = 136 # except 1d tensors - MOSTLY_IQ2_BN = 137 # except 1d tensors - MOSTLY_IQ2_K = 138 # except 1d tensors - MOSTLY_IQ3_K = 139 # except 1d tensors - MOSTLY_IQ4_K = 140 # except 1d tensors - MOSTLY_IQ5_K = 141 # except 1d tensors - MOSTLY_IQ6_K = 142 # except 1d tensors - MOSTLY_IQ4_KS = 145 # except 1d tensors - MOSTLY_IQ3_KL = 146 # except 1d tensors - MOSTLY_IQ2_KS = 147 # except 1d tensors - MOSTLY_IQ4_KSS = 148 # except 1d tensors - MOSTLY_Q8_KV = 149 # except 1d tensors - MOSTLY_IQ5_KS = 150 # except 1d tensors - MOSTLY_IQ2_KT = 151 # except 1d tensors - MOSTLY_IQ3_KT = 152 # except 1d tensors - MOSTLY_IQ4_KT = 153 # except 1d tensors - MOSTLY_IQ3_KS = 154 # except 1d tensors - MOSTLY_IQ2_KL = 155 # except 1d tensors - MOSTLY_IQ1_KT = 156 # except 1d tensors + MOSTLY_Q6_0 = 135 #except 1d tensors + MOSTLY_IQ1_BN = 136 #except 1d tensors + MOSTLY_IQ2_BN = 137 #except 1d tensors + MOSTLY_IQ2_K = 138 #except 1d tensors + MOSTLY_IQ3_K = 139 #except 1d tensors + MOSTLY_IQ4_K = 140 #except 1d tensors + MOSTLY_IQ5_K = 141 #except 1d tensors + MOSTLY_IQ6_K = 142 #except 1d tensors + MOSTLY_IQ4_KS = 145 #except 1d tensors + MOSTLY_IQ3_KL = 146 #except 1d tensors + MOSTLY_IQ2_KS = 147 #except 1d tensors + MOSTLY_IQ4_KSS = 148 #except 1d tensors + MOSTLY_Q8_KV = 149 #except 1d tensors + MOSTLY_IQ5_KS = 150 #except 1d tensors + MOSTLY_IQ2_KT = 151 #except 1d tensors + MOSTLY_IQ3_KT = 152 #except 1d tensors + MOSTLY_IQ4_KT = 153 #except 1d tensors + MOSTLY_IQ3_KS = 154 #except 1d tensors + MOSTLY_IQ2_KL = 155 #except 1d tensors + MOSTLY_IQ1_KT = 156 #except 1d tensors - MOSTLY_Q4_0_R8 = 202 # except 1d tensors - MOSTLY_Q8_0_R8 = 207 # except 1d tensors - MOSTLY_Q5_0_R4 = 208 # except 1d tensors - MOSTLY_Q2_K_R4 = 210 # except 1d tensors - MOSTLY_Q3_K_R4 = 211 # except 1d tensors - MOSTLY_Q4_K_R4 = 214 # except 1d tensors - MOSTLY_Q5_K_R4 = 216 # except 1d tensors - MOSTLY_Q6_K_R4 = 218 # except 1d tensors - MOSTLY_IQ2_XXS_R4 = 219 # except 1d tensors - MOSTLY_IQ2_XS_R4 = 220 # except 1d tensors - MOSTLY_IQ3_XXS_R4 = 223 # except 1d tensors - MOSTLY_IQ1_S_R4 = 224 # except 1d tensors - MOSTLY_IQ4_NL_R4 = 225 # except 1d tensors - MOSTLY_IQ3_S_R4 = 226 # except 1d tensors - MOSTLY_IQ2_M_R4 = 229 # except 1d tensors - MOSTLY_IQ4_XS_R8 = 230 # except 1d tensors - MOSTLY_IQ1_M_R4 = 231 # except 1d tensors - MOSTLY_Q6_0_R4 = 335 # except 1d tensors - MOSTLY_BF16_R16 = 232 # except 1d tensors - MOSTLY_IQ2_BN_R4 = 337 # except 1d tensors - MOSTLY_IQ2_K_R4 = 338 # except 1d tensors - MOSTLY_IQ3_K_R4 = 339 # except 1d tensors - MOSTLY_IQ4_K_R4 = 340 # except 1d tensors - MOSTLY_IQ5_K_R4 = 341 # except 1d tensors - MOSTLY_IQ4_KS_R4 = 345 # except 1d tensors - MOSTLY_IQ5_KS_R4 = 350 # except 1d tensors - MOSTLY_Q8_KV_R8 = 398 # except 1d tensors - MOSTLY_Q8_K_R8 = 399 # except 1d tensors + MOSTLY_Q4_0_R8 = 202 #except 1d tensors + MOSTLY_Q8_0_R8 = 207 #except 1d tensors + MOSTLY_Q5_0_R4 = 208 #except 1d tensors + MOSTLY_Q2_K_R4 = 210 #except 1d tensors + MOSTLY_Q3_K_R4 = 211 #except 1d tensors + MOSTLY_Q4_K_R4 = 214 #except 1d tensors + MOSTLY_Q5_K_R4 = 216 #except 1d tensors + MOSTLY_Q6_K_R4 = 218 #except 1d tensors + MOSTLY_IQ2_XXS_R4 = 219 #except 1d tensors + MOSTLY_IQ2_XS_R4 = 220 #except 1d tensors + MOSTLY_IQ3_XXS_R4 = 223 #except 1d tensors + MOSTLY_IQ1_S_R4 = 224 #except 1d tensors + MOSTLY_IQ4_NL_R4 = 225 #except 1d tensors + MOSTLY_IQ3_S_R4 = 226 #except 1d tensors + MOSTLY_IQ2_M_R4 = 229 #except 1d tensors + MOSTLY_IQ4_XS_R8 = 230 #except 1d tensors + MOSTLY_IQ1_M_R4 = 231 #except 1d tensors + MOSTLY_Q6_0_R4 = 335 #except 1d tensors + MOSTLY_BF16_R16 = 232 #except 1d tensors + MOSTLY_IQ2_BN_R4 = 337 #except 1d tensors + MOSTLY_IQ2_K_R4 = 338 #except 1d tensors + MOSTLY_IQ3_K_R4 = 339 #except 1d tensors + MOSTLY_IQ4_K_R4 = 340 #except 1d tensors + MOSTLY_IQ5_K_R4 = 341 #except 1d tensors + MOSTLY_IQ4_KS_R4 = 345 #except 1d tensors + MOSTLY_IQ5_KS_R4 = 350 #except 1d tensors + MOSTLY_Q8_KV_R8 = 398 #except 1d tensors + MOSTLY_Q8_K_R8 = 399 #except 1d tensors GUESSED = 1024 # not specified in the model file @@ -1891,7 +1891,7 @@ class GGUFValueType(IntEnum): # Items here are (block size, type size) QK_K = 256 -# Values generated programatically +#Values generated programatically GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = { GGMLQuantizationType.F32 : ( 1, 4), GGMLQuantizationType.F16 : ( 1, 2), diff --git a/src/llama-dflash.cpp b/src/llama-dflash.cpp index bfb4595b..277a6ffd 100644 --- a/src/llama-dflash.cpp +++ b/src/llama-dflash.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include void llama_sync_dflash_workspace_if_pending(struct llama_context & lctx) { @@ -70,15 +71,15 @@ bool llama_context::ensure_dflash_kv_cache_tensors(int32_t cross_ctx) { const int64_t n_embd_head_v = model.hparams.n_embd_head_v(0); const int64_t n_head_kv = model.hparams.n_head_kv(); - if (dflash.kv.cache_ctx != nullptr && !dflash.kv.k_ctx_cache.empty()) { - const bool cache_matches = (int32_t) dflash.kv.k_ctx_cache.size() == n_layer && - dflash.kv.k_ctx_cache.front() != nullptr && - (int32_t) dflash.kv.k_ctx_cache.front()->ne[2] == target_cross_ctx; - const bool workspace_matches = (int32_t) dflash.kv.k_ctx_workspace.size() == n_layer && - dflash.kv.k_ctx_workspace.front() != nullptr && - (int32_t) dflash.kv.k_ctx_workspace.front()->ne[1] == target_workspace_n_kv_total; + if (dflash.kv.cache_ctx != nullptr && + (int32_t) dflash.kv.k_ctx_cache.size() == n_layer && + (int32_t) dflash.kv.k_ctx_workspace.size() == n_layer) { + const bool cache_matches = + (int32_t) dflash.kv.k_ctx_cache.front()->ne[2] == target_cross_ctx; + const bool workspace_matches = + (int32_t) dflash.kv.k_ctx_workspace.front()->ne[1] == target_workspace_n_kv_total; - if (cache_matches && workspace_matches) { + if (cache_matches && workspace_matches) { return true; } @@ -98,8 +99,6 @@ bool llama_context::ensure_dflash_kv_cache_tensors(int32_t cross_ctx) { dflash.kv.workspace_graph_rows = 0; dflash.kv.workspace_graph_write_pos = 0; dflash.kv.workspace_reserved_rows = 0; - dflash.kv.cache_compute_meta.clear(); - dflash.kv.workspace_compute_meta.clear(); } ggml_init_params params = { @@ -110,6 +109,7 @@ bool llama_context::ensure_dflash_kv_cache_tensors(int32_t cross_ctx) { dflash.kv.cache_ctx = ggml_init(params); if (dflash.kv.cache_ctx == nullptr) { + LLAMA_LOG_ERROR("%s: failed to allocate DFlash K/V cache context\n", __func__); return false; } @@ -123,74 +123,44 @@ bool llama_context::ensure_dflash_kv_cache_tensors(int32_t cross_ctx) { dflash.kv.cache_bufs.reserve((size_t) std::max(1, n_layer) * 4); for (int32_t il = 0; il < n_layer; ++il) { ggml_backend_buffer_type_t layer_buft = llama_dflash_kv_cache_layer_buft(*this, il); + auto alloc_kv_input = [&](ggml_tensor *& tensor, const char * tensor_tag, const char * tensor_name, + int64_t ne0, int64_t ne1, int64_t ne2) -> bool { + tensor = ggml_new_tensor_3d(dflash.kv.cache_ctx, GGML_TYPE_F32, ne0, ne1, ne2); + if (tensor == nullptr) { + LLAMA_LOG_ERROR("%s: failed to create %s for layer %d\n", __func__, tensor_tag, il); + return false; + } - dflash.kv.k_ctx_cache[(size_t) il] = ggml_new_tensor_3d(dflash.kv.cache_ctx, GGML_TYPE_F32, n_embd_head_k, n_head_kv, target_cross_ctx); - dflash.kv.v_ctx_cache[(size_t) il] = ggml_new_tensor_3d(dflash.kv.cache_ctx, GGML_TYPE_F32, n_embd_head_v, n_head_kv, target_cross_ctx); - if (dflash.kv.k_ctx_cache[(size_t) il] == nullptr || dflash.kv.v_ctx_cache[(size_t) il] == nullptr) { + ggml_set_input(tensor); + ggml_format_name(tensor, tensor_name, il); + + const size_t tensor_bytes = ggml_backend_buft_get_alloc_size(layer_buft, tensor); + ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(layer_buft, tensor_bytes); + if (buf == nullptr) { + LLAMA_LOG_ERROR("%s: failed to allocate %s buffer for layer %d (%zu bytes)\n", + __func__, tensor_tag, il, tensor_bytes); + return false; + } + + ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_COMPUTE); + ggml_backend_tensor_alloc(buf, tensor, ggml_backend_buffer_get_base(buf)); + ggml_backend_buffer_clear(buf, 0); + dflash.kv.cache_bufs.push_back(buf); + + return true; + }; + + if (!alloc_kv_input(dflash.kv.k_ctx_cache[(size_t) il], "dflash_k_ctx_cache", "dflash_k_ctx_cache_%d", + n_embd_head_k, n_head_kv, target_cross_ctx) || + !alloc_kv_input(dflash.kv.v_ctx_cache[(size_t) il], "dflash_v_ctx_cache", "dflash_v_ctx_cache_%d", + n_embd_head_v, n_head_kv, target_cross_ctx) || + !alloc_kv_input(dflash.kv.k_ctx_workspace[(size_t) il], "dflash_k_ctx_workspace", "dflash_k_ctx_workspace_%d", + n_embd_head_k, target_workspace_n_kv_total, n_head_kv) || + !alloc_kv_input(dflash.kv.v_ctx_workspace[(size_t) il], "dflash_v_ctx_workspace", "dflash_v_ctx_workspace_%d", + n_embd_head_v, target_workspace_n_kv_total, n_head_kv)) { free_dflash_kv_cache_tensors(); return false; } - - ggml_set_input(dflash.kv.k_ctx_cache[(size_t) il]); - ggml_set_input(dflash.kv.v_ctx_cache[(size_t) il]); - ggml_format_name(dflash.kv.k_ctx_cache[(size_t) il], "dflash_k_ctx_cache_%d", il); - ggml_format_name(dflash.kv.v_ctx_cache[(size_t) il], "dflash_v_ctx_cache_%d", il); - - const size_t k_bytes = ggml_backend_buft_get_alloc_size(layer_buft, dflash.kv.k_ctx_cache[(size_t) il]); - ggml_backend_buffer_t k_buf = ggml_backend_buft_alloc_buffer(layer_buft, k_bytes); - if (k_buf == nullptr) { - free_dflash_kv_cache_tensors(); - return false; - } - ggml_backend_buffer_set_usage(k_buf, GGML_BACKEND_BUFFER_USAGE_COMPUTE); - ggml_backend_tensor_alloc(k_buf, dflash.kv.k_ctx_cache[(size_t) il], ggml_backend_buffer_get_base(k_buf)); - ggml_backend_buffer_clear(k_buf, 0); - dflash.kv.cache_bufs.push_back(k_buf); - - const size_t v_bytes = ggml_backend_buft_get_alloc_size(layer_buft, dflash.kv.v_ctx_cache[(size_t) il]); - ggml_backend_buffer_t v_buf = ggml_backend_buft_alloc_buffer(layer_buft, v_bytes); - if (v_buf == nullptr) { - free_dflash_kv_cache_tensors(); - return false; - } - ggml_backend_buffer_set_usage(v_buf, GGML_BACKEND_BUFFER_USAGE_COMPUTE); - ggml_backend_tensor_alloc(v_buf, dflash.kv.v_ctx_cache[(size_t) il], ggml_backend_buffer_get_base(v_buf)); - ggml_backend_buffer_clear(v_buf, 0); - dflash.kv.cache_bufs.push_back(v_buf); - - dflash.kv.k_ctx_workspace[(size_t) il] = ggml_new_tensor_3d(dflash.kv.cache_ctx, GGML_TYPE_F32, n_embd_head_k, target_workspace_n_kv_total, n_head_kv); - dflash.kv.v_ctx_workspace[(size_t) il] = ggml_new_tensor_3d(dflash.kv.cache_ctx, GGML_TYPE_F32, n_embd_head_v, target_workspace_n_kv_total, n_head_kv); - if (dflash.kv.k_ctx_workspace[(size_t) il] == nullptr || dflash.kv.v_ctx_workspace[(size_t) il] == nullptr) { - free_dflash_kv_cache_tensors(); - return false; - } - - ggml_set_input(dflash.kv.k_ctx_workspace[(size_t) il]); - ggml_set_input(dflash.kv.v_ctx_workspace[(size_t) il]); - ggml_format_name(dflash.kv.k_ctx_workspace[(size_t) il], "dflash_k_ctx_workspace_%d", il); - ggml_format_name(dflash.kv.v_ctx_workspace[(size_t) il], "dflash_v_ctx_workspace_%d", il); - - const size_t k_workspace_bytes = ggml_backend_buft_get_alloc_size(layer_buft, dflash.kv.k_ctx_workspace[(size_t) il]); - ggml_backend_buffer_t k_workspace_buf = ggml_backend_buft_alloc_buffer(layer_buft, k_workspace_bytes); - if (k_workspace_buf == nullptr) { - free_dflash_kv_cache_tensors(); - return false; - } - ggml_backend_buffer_set_usage(k_workspace_buf, GGML_BACKEND_BUFFER_USAGE_COMPUTE); - ggml_backend_tensor_alloc(k_workspace_buf, dflash.kv.k_ctx_workspace[(size_t) il], ggml_backend_buffer_get_base(k_workspace_buf)); - ggml_backend_buffer_clear(k_workspace_buf, 0); - dflash.kv.cache_bufs.push_back(k_workspace_buf); - - const size_t v_workspace_bytes = ggml_backend_buft_get_alloc_size(layer_buft, dflash.kv.v_ctx_workspace[(size_t) il]); - ggml_backend_buffer_t v_workspace_buf = ggml_backend_buft_alloc_buffer(layer_buft, v_workspace_bytes); - if (v_workspace_buf == nullptr) { - free_dflash_kv_cache_tensors(); - return false; - } - ggml_backend_buffer_set_usage(v_workspace_buf, GGML_BACKEND_BUFFER_USAGE_COMPUTE); - ggml_backend_tensor_alloc(v_workspace_buf, dflash.kv.v_ctx_workspace[(size_t) il], ggml_backend_buffer_get_base(v_workspace_buf)); - ggml_backend_buffer_clear(v_workspace_buf, 0); - dflash.kv.cache_bufs.push_back(v_workspace_buf); } dflash.kv.workspace_token_capacity = target_token_capacity; @@ -201,10 +171,15 @@ bool llama_context::ensure_dflash_kv_cache_tensors(int32_t cross_ctx) { } void llama_context::free_dflash_kv_cache_tensors() { - dflash.kv.k_ctx_cache.clear(); - dflash.kv.v_ctx_cache.clear(); - dflash.kv.k_ctx_workspace.clear(); - dflash.kv.v_ctx_workspace.clear(); + auto release_vector = [](auto & v) { + using vec_type = std::decay_t; + vec_type().swap(v); + }; + + release_vector(dflash.kv.k_ctx_cache); + release_vector(dflash.kv.v_ctx_cache); + release_vector(dflash.kv.k_ctx_workspace); + release_vector(dflash.kv.v_ctx_workspace); dflash.kv.cache_write_pos = 0; dflash.kv.cache_n_filled = 0; dflash.kv.cache_update_rows = 0; @@ -244,7 +219,9 @@ void llama_context::free_dflash_kv_cache_tensors() { ggml_backend_buffer_free(buf); } } - dflash.kv.cache_bufs.clear(); + release_vector(dflash.kv.cache_bufs); + release_vector(dflash.kv.cache_compute_meta); + release_vector(dflash.kv.workspace_compute_meta); if (dflash.kv.cache_ctx != nullptr) { ggml_free(dflash.kv.cache_ctx); dflash.kv.cache_ctx = nullptr; diff --git a/src/llama-load-tensors.cpp b/src/llama-load-tensors.cpp index 88e8816b..0585f0ba 100644 --- a/src/llama-load-tensors.cpp +++ b/src/llama-load-tensors.cpp @@ -2257,10 +2257,14 @@ bool create_tensors_helper::create_dflash_tensors(const LLM_TN & tn) { model.tok_embd = create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); model.output_norm = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); model.output = create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); - model.output_mtp = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + auto output_extra = create_tensor(ctx_output, "output_extra.weight", {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + if (output_extra != nullptr) { + model.output = output_extra; + } if (model.output == nullptr && model.tok_embd != nullptr) { model.output = create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); } + model.output_mtp = model.output; model.dflash_fc = create_tensor(ctx_output, tn(LLM_TENSOR_DFLASH_FC, "weight"), {(int64_t) hparams.dflash_n_target_features, n_embd}, 0); model.dflash_hidden_norm = create_tensor(ctx_output, tn(LLM_TENSOR_DFLASH_HIDDEN_NORM, "weight"), {n_embd}, 0);