diff --git a/common/common.cpp b/common/common.cpp
index 224f3e0d..09126756 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -5145,7 +5145,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
 
     yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
     fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
-    yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
+    yaml_dump_string_multiline(stream, "in_suffix", params.input_suffix.c_str());
     fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
     fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
     fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
diff --git a/common/speculative.cpp b/common/speculative.cpp
index 758202ac..c84573e9 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -64,10 +64,10 @@ static bool common_speculative_are_compatible(
     const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt);
     const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft);
 
-    const bool vocab_type_tgt = llama_vocab_type(vocab_tgt);
+    const auto vocab_type_tgt = llama_vocab_type(vocab_tgt);
     LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
 
-    const bool vocab_type_dft = llama_vocab_type(vocab_dft);
+    const auto vocab_type_dft = llama_vocab_type(vocab_dft);
     LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
 
     if (vocab_type_tgt != vocab_type_dft) {
diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp
index 40df1ea6..b049aa58 100644
--- a/ggml/src/iqk/iqk_quantize.cpp
+++ b/ggml/src/iqk/iqk_quantize.cpp
@@ -6426,7 +6426,7 @@ void vec_dot_q3_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t b
 //
 
 void quantize_row_q2_k_r4_ref(const float * x, block_q2_k_r4 * y, int64_t k) {
-    quantize_q3_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr);
+    quantize_q2_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr);
 }
 
 void quantize_row_q2_k_r4(const float * x, void * y, int64_t k) {
diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp
index e0593ef5..eaa75424 100644
--- a/src/llama-grammar.cpp
+++ b/src/llama-grammar.cpp
@@ -1416,7 +1416,7 @@ void llama_grammar_sample_impl(const struct llama_grammar * grammar, const struc
     for (const auto & reject : rejects) {
         candidates->data[reject.index].logit = -INFINITY;
     }
-    if (!smpl) {
+    if (smpl) {
         smpl->t_sample_us += ggml_time_us() - t_start_sample_us;
     }
 }
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 8312a4e8..db0aa6b2 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -277,6 +277,7 @@ void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_
         }
     }
 
+    if (min_keep < 1) min_keep = 1;
     float cum_sum = 0.0f;
     size_t last_idx = candidates->size;
     for (size_t i = 0; i < second_derivatives.size(); ++i) {
@@ -337,7 +338,7 @@ void llama_sample_typical_impl(struct llama_sampling * smpl, llama_token_data_ar
         cum_sum += candidates->data[idx].p;
 
         // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
-        if (cum_sum > p && i >= min_keep - 1) {
+        if (cum_sum > p && i + 1 >= min_keep) {
             last_idx = i + 1;
             break;
         }
diff --git a/src/llama.cpp b/src/llama.cpp
index fa0f8c1c..7391255e 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3616,7 +3616,7 @@ static bool llm_load_tensors(
                     LLAMA_LOG_ERROR("Not enough memory in device %d to offload the output layer\n", id);
                     throw std::runtime_error("Unable to auto-fit model");
                 }
-                device_mem[id] -= layer_sizes[id];
+                device_mem[id] -= layer_sizes[n_layer];
                 if (!tensor_split) {
                     float sum = 0;
                     for (int id = 0; id < int(model.splits.size()); ++id) {