Bug fixes (#1940)

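* Bug fixes
* More

Fixes in this change:
- common: dump `params.input_suffix` (not `params.input_prefix`) under the `in_suffix` YAML key
- speculative: store the target/draft vocab types as `auto` instead of `bool`, so the `vocab_type_tgt != vocab_type_dft` check compares the actual type values
- iqk_quantize: `quantize_row_q2_k_r4_ref` now calls `quantize_q2_k_r4` instead of `quantize_q3_k_r4`
- llama-grammar: update `smpl->t_sample_us` only when `smpl` is non-null (the check was inverted)
- llama-sampling: clamp `min_keep` to at least 1 in tail-free sampling; rewrite the typical-sampling check as `i + 1 >= min_keep` so it no longer evaluates `min_keep - 1`
- llama: subtract `layer_sizes[n_layer]` rather than `layer_sizes[id]` from `device_mem[id]` when offloading the output layer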
2026-06-28 04:30:15 -05:00 · 2026-06-10 07:45:49 +02:00 · 2026-06-10 07:45:49 +02:00 · 366e478cb6
commit 366e478cb6
parent 2768b62515
6 changed files with 8 additions and 7 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -5145,7 +5145,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l

    yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
    fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
-    yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
+    yaml_dump_string_multiline(stream, "in_suffix", params.input_suffix.c_str());
    fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
    fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
    fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -64,10 +64,10 @@ static bool common_speculative_are_compatible(
    const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
    const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);

-    const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
+    const auto vocab_type_tgt = llama_vocab_type(vocab_tgt);
    LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);

-    const bool vocab_type_dft = llama_vocab_type(vocab_dft);
+    const auto vocab_type_dft = llama_vocab_type(vocab_dft);
    LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);

    if (vocab_type_tgt != vocab_type_dft) {
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -6426,7 +6426,7 @@ void vec_dot_q3_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t b
 //

 void quantize_row_q2_k_r4_ref(const float * x, block_q2_k_r4 * y, int64_t k) {
-    quantize_q3_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr);
+    quantize_q2_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr);
 }

 void quantize_row_q2_k_r4(const float * x, void * y, int64_t k) {
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@@ -1416,7 +1416,7 @@ void llama_grammar_sample_impl(const struct llama_grammar * grammar, const struc
    for (const auto & reject : rejects) {
        candidates->data[reject.index].logit = -INFINITY;
    }
-    if (!smpl) {
+    if (smpl) {
        smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
    }
 }
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -277,6 +277,7 @@ void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_
        }
    }

+    if (min_keep < 1) min_keep = 1;
    float cum_sum = 0.0f;
    size_t last_idx = candidates->size;
    for (size_t i = 0; i < second_derivatives.size(); ++i) {
@@ -337,7 +338,7 @@ void llama_sample_typical_impl(struct llama_sampling * smpl, llama_token_data_ar
        cum_sum += candidates->data[idx].p;

        // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
-        if (cum_sum > p && i >= min_keep - 1) {
+        if (cum_sum > p && i + 1 >= min_keep) {
            last_idx = i + 1;
            break;
        }
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3616,7 +3616,7 @@ static bool llm_load_tensors(
                    LLAMA_LOG_ERROR("Not enough memory in device %d to offload the output layer\n", id);
                    throw std::runtime_error("Unable to auto-fit model");
                }
-                device_mem[id] -= layer_sizes[id];
+                device_mem[id] -= layer_sizes[n_layer];
                if (!tensor_split) {
                    float sum = 0;
                    for (int id = 0; id < int(model.splits.size()); ++id) {