diff --git a/common/common.cpp b/common/common.cpp index 224f3e0d..09126756 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -5145,7 +5145,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str()); fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false"); - yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str()); + yaml_dump_string_multiline(stream, "in_suffix", params.input_suffix.c_str()); fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false"); fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false"); fprintf(stream, "keep: %d # default: 0\n", params.n_keep); diff --git a/common/speculative.cpp b/common/speculative.cpp index 758202ac..c84573e9 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -64,10 +64,10 @@ static bool common_speculative_are_compatible( const llama_vocab * vocab_tgt = llama_model_get_vocab(model_tgt); const llama_vocab * vocab_dft = llama_model_get_vocab(model_dft); - const bool vocab_type_tgt = llama_vocab_type(vocab_tgt); + const auto vocab_type_tgt = llama_vocab_type(vocab_tgt); LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt); - const bool vocab_type_dft = llama_vocab_type(vocab_dft); + const auto vocab_type_dft = llama_vocab_type(vocab_dft); LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft); if (vocab_type_tgt != vocab_type_dft) { diff --git a/ggml/src/iqk/iqk_quantize.cpp b/ggml/src/iqk/iqk_quantize.cpp index 40df1ea6..b049aa58 100644 --- a/ggml/src/iqk/iqk_quantize.cpp +++ b/ggml/src/iqk/iqk_quantize.cpp @@ -6426,7 +6426,7 @@ void vec_dot_q3_k_r4_q8_k(int n, float * s, size_t bs, const void * vx, size_t b // void quantize_row_q2_k_r4_ref(const float * x, block_q2_k_r4 * y, int64_t k) { - quantize_q3_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr); + quantize_q2_k_r4(x, (void *)y, 4, k/4, nullptr, nullptr); } void quantize_row_q2_k_r4(const float * x, void * y, int64_t k) { diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp index e0593ef5..eaa75424 100644 --- a/src/llama-grammar.cpp +++ b/src/llama-grammar.cpp @@ -1416,7 +1416,7 @@ void llama_grammar_sample_impl(const struct llama_grammar * grammar, const struc for (const auto & reject : rejects) { candidates->data[reject.index].logit = -INFINITY; } - if (!smpl) { + if (smpl) { smpl->t_sample_us += ggml_time_us() - t_start_sample_us; } } diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index 8312a4e8..db0aa6b2 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -277,6 +277,7 @@ void llama_sample_tail_free_impl(struct llama_sampling * smpl, llama_token_data_ } } + if (min_keep < 1) min_keep = 1; float cum_sum = 0.0f; size_t last_idx = candidates->size; for (size_t i = 0; i < second_derivatives.size(); ++i) { @@ -337,7 +338,7 @@ void llama_sample_typical_impl(struct llama_sampling * smpl, llama_token_data_ar cum_sum += candidates->data[idx].p; // Check if the running sum is greater than typical or if we have kept at least min_keep tokens - if (cum_sum > p && i >= min_keep - 1) { + if (cum_sum > p && i + 1 >= min_keep) { last_idx = i + 1; break; } diff --git a/src/llama.cpp b/src/llama.cpp index fa0f8c1c..7391255e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -3616,7 +3616,7 @@ static bool llm_load_tensors( LLAMA_LOG_ERROR("Not enough memory in device %d to offload the output layer\n", id); throw std::runtime_error("Unable to auto-fit model"); } - device_mem[id] -= layer_sizes[id]; + device_mem[id] -= layer_sizes[n_layer]; if (!tensor_split) { float sum = 0; for (int id = 0; id < int(model.splits.size()); ++id) {