mirror of
https://github.com/ikawrakow/ik_llama.cpp.git
synced 2026-06-28 04:30:15 -05:00
clean logs
This commit is contained in:
parent
0d75eee35a
commit
6cae8c7ba2
@ -1,11 +1,8 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <atomic>
|
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <cstdlib>
|
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <sstream>
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
static bool common_speculative_are_dflash_compatible(
|
static bool common_speculative_are_dflash_compatible(
|
||||||
@ -71,102 +68,7 @@ static bool common_speculative_are_dflash_compatible(
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool dflash_contract_log_enabled() {
|
|
||||||
const char * env = std::getenv("IK_DFLASH_CONTRACT_LOG");
|
|
||||||
if (env == nullptr || *env == '\0') {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return std::strcmp(env, "0") != 0 &&
|
|
||||||
std::strcmp(env, "false") != 0 &&
|
|
||||||
std::strcmp(env, "off") != 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool dflash_stats_log_enabled() {
|
|
||||||
const char * env = std::getenv("IK_DFLASH_STATS_LOG");
|
|
||||||
if (env == nullptr || *env == '\0') {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return std::strcmp(env, "0") != 0 &&
|
|
||||||
std::strcmp(env, "false") != 0 &&
|
|
||||||
std::strcmp(env, "off") != 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
static std::string dflash_contract_format_values(
|
|
||||||
const std::vector<T> & values,
|
|
||||||
size_t edge_count = 4) {
|
|
||||||
std::ostringstream oss;
|
|
||||||
oss << '[';
|
|
||||||
if (values.empty()) {
|
|
||||||
oss << ']';
|
|
||||||
return oss.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
const size_t head = std::min(edge_count, values.size());
|
|
||||||
for (size_t i = 0; i < head; ++i) {
|
|
||||||
if (i > 0) {
|
|
||||||
oss << ',';
|
|
||||||
}
|
|
||||||
oss << values[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
if (values.size() > edge_count * 2) {
|
|
||||||
oss << ",...,";
|
|
||||||
for (size_t i = values.size() - edge_count; i < values.size(); ++i) {
|
|
||||||
if (i > values.size() - edge_count) {
|
|
||||||
oss << ',';
|
|
||||||
}
|
|
||||||
oss << values[i];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (size_t i = head; i < values.size(); ++i) {
|
|
||||||
oss << ',' << values[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
oss << ']';
|
|
||||||
return oss.str();
|
|
||||||
}
|
|
||||||
|
|
||||||
struct dflash_contract_pos_summary {
|
|
||||||
llama_pos first = -1;
|
|
||||||
llama_pos last = -1;
|
|
||||||
int32_t gap_count = 0;
|
|
||||||
int32_t nonmono_count = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
static dflash_contract_pos_summary dflash_contract_summarize_positions(
|
|
||||||
const std::vector<llama_pos> & positions) {
|
|
||||||
dflash_contract_pos_summary summary;
|
|
||||||
if (positions.empty()) {
|
|
||||||
return summary;
|
|
||||||
}
|
|
||||||
|
|
||||||
summary.first = positions.front();
|
|
||||||
summary.last = positions.back();
|
|
||||||
for (size_t i = 1; i < positions.size(); ++i) {
|
|
||||||
if (positions[i] <= positions[i - 1]) {
|
|
||||||
summary.nonmono_count++;
|
|
||||||
} else if (positions[i] != positions[i - 1] + 1) {
|
|
||||||
summary.gap_count++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return summary;
|
|
||||||
}
|
|
||||||
|
|
||||||
struct common_speculative_state_dflash;
|
struct common_speculative_state_dflash;
|
||||||
|
|
||||||
static void dflash_contract_log_append(
|
|
||||||
const common_speculative_state_dflash & state,
|
|
||||||
llama_seq_id seq_id,
|
|
||||||
const std::vector<llama_pos> & new_positions);
|
|
||||||
static void dflash_contract_log_draft(
|
|
||||||
const common_speculative_state_dflash & state,
|
|
||||||
int32_t n_keep,
|
|
||||||
size_t result_size);
|
|
||||||
static void dflash_materialize_target_window_features(common_speculative_state_dflash & state);
|
static void dflash_materialize_target_window_features(common_speculative_state_dflash & state);
|
||||||
|
|
||||||
// DFlash runtime state and draft path.
|
// DFlash runtime state and draft path.
|
||||||
@ -198,41 +100,6 @@ struct common_speculative_state_dflash : public common_speculative_state {
|
|||||||
bool target_window_replace = false;
|
bool target_window_replace = false;
|
||||||
bool target_window_materialized = false;
|
bool target_window_materialized = false;
|
||||||
llama_pos last_target_pos = -1;
|
llama_pos last_target_pos = -1;
|
||||||
size_t n_window_updates = 0;
|
|
||||||
size_t n_rows_seen = 0;
|
|
||||||
size_t n_rows_dropped = 0;
|
|
||||||
size_t n_context_shifts = 0;
|
|
||||||
size_t n_draft_empty = 0;
|
|
||||||
size_t n_set_target_fail = 0;
|
|
||||||
size_t n_decode_fail = 0;
|
|
||||||
llama_pos last_draft_pos_base = -1;
|
|
||||||
|
|
||||||
uint64_t t_draft_decode_us = 0;
|
|
||||||
uint64_t t_draft_sample_us = 0;
|
|
||||||
uint64_t t_warmup_collect_us = 0;
|
|
||||||
uint64_t t_warmup_append_us = 0;
|
|
||||||
uint64_t t_accept_output_copy_us = 0;
|
|
||||||
uint64_t t_accept_commit_us = 0;
|
|
||||||
uint64_t t_accept_append_us = 0;
|
|
||||||
uint64_t t_accept_append_filter_us = 0;
|
|
||||||
uint64_t t_accept_append_window_alloc_us = 0;
|
|
||||||
uint64_t t_accept_append_replace_us = 0;
|
|
||||||
uint64_t t_accept_append_keep_old_us = 0;
|
|
||||||
uint64_t t_accept_append_new_rows_us = 0;
|
|
||||||
uint64_t t_accept_append_commit_detail_us = 0;
|
|
||||||
uint64_t t_accept_append_log_us = 0;
|
|
||||||
size_t n_warmup_collect_calls = 0;
|
|
||||||
size_t n_warmup_collect_rows = 0;
|
|
||||||
size_t n_warmup_append_calls = 0;
|
|
||||||
size_t n_warmup_append_rows = 0;
|
|
||||||
size_t n_accept_output_copy_calls = 0;
|
|
||||||
size_t n_accept_output_copy_rows = 0;
|
|
||||||
size_t n_accept_commit_calls = 0;
|
|
||||||
size_t n_accept_commit_rows = 0;
|
|
||||||
size_t n_accept_append_calls = 0;
|
|
||||||
size_t n_accept_append_rows = 0;
|
|
||||||
size_t n_accept_append_replace_calls = 0;
|
|
||||||
size_t n_accept_append_slide_calls = 0;
|
|
||||||
|
|
||||||
common_speculative_state_dflash(
|
common_speculative_state_dflash(
|
||||||
enum common_speculative_type type,
|
enum common_speculative_type type,
|
||||||
@ -271,9 +138,7 @@ struct common_speculative_state_dflash : public common_speculative_state {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const auto * vocab_tgt = llama_model_get_vocab(model_tgt);
|
const auto * vocab_tgt = llama_model_get_vocab(model_tgt);
|
||||||
const auto * vocab_dft = llama_model_get_vocab(model_dft);
|
|
||||||
const int32_t target_vocab_size = llama_vocab_n_tokens(vocab_tgt);
|
const int32_t target_vocab_size = llama_vocab_n_tokens(vocab_tgt);
|
||||||
const int32_t draft_vocab_size = llama_vocab_n_tokens(vocab_dft);
|
|
||||||
const int32_t target_hidden_size = llama_model_n_embd(model_tgt);
|
const int32_t target_hidden_size = llama_model_n_embd(model_tgt);
|
||||||
const int32_t draft_hidden_size = llama_model_n_embd(model_dft);
|
const int32_t draft_hidden_size = llama_model_n_embd(model_dft);
|
||||||
const int32_t target_mask_token_id = llama_model_dflash_target_mask_token_id(model_tgt);
|
const int32_t target_mask_token_id = llama_model_dflash_target_mask_token_id(model_tgt);
|
||||||
@ -349,22 +214,8 @@ struct common_speculative_state_dflash : public common_speculative_state {
|
|||||||
ready = true;
|
ready = true;
|
||||||
|
|
||||||
llama_set_dflash_visible_cross_ctx(ctx_dft, this->cross_ctx);
|
llama_set_dflash_visible_cross_ctx(ctx_dft, this->cross_ctx);
|
||||||
llama_dflash_profile_reset(ctx_tgt);
|
LOG_INF("%s: DFlash context ready (n_ctx=%d, block_size=%d, cross_ctx=%d, n_target_features=%d, n_target_layers=%d)\n",
|
||||||
llama_dflash_profile_reset(ctx_dft);
|
__func__, llama_n_ctx(ctx_dft), block_size, this->cross_ctx, n_target_features, n_target_layers);
|
||||||
|
|
||||||
std::ostringstream layers_oss;
|
|
||||||
for (size_t i = 0; i < target_layer_ids.size(); ++i) {
|
|
||||||
if (i > 0) {
|
|
||||||
layers_oss << ",";
|
|
||||||
}
|
|
||||||
layers_oss << target_layer_ids[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
const char * io_mode_name = io_mode == LLAMA_DFLASH_IO_MODE_SHARED ? "shared" : "self-contained";
|
|
||||||
LOG_INF("%s: DFlash context ready (n_ctx=%d, block_size=%d, cross_ctx=%d, n_target_features=%d, target_layer_ids=[%s])\n",
|
|
||||||
__func__, llama_n_ctx(ctx_dft), block_size, this->cross_ctx, n_target_features, layers_oss.str().c_str());
|
|
||||||
LOG_INF("%s: DFlash artifact io=%s draft_vocab=%d target_vocab=%d draft_hidden=%d target_hidden=%d mask_token_id=%d target_mask_token_id=%d\n",
|
|
||||||
__func__, io_mode_name, draft_vocab_size, target_vocab_size, draft_hidden_size, target_hidden_size, mask_token_id, target_mask_token_id);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
~common_speculative_state_dflash() override {
|
~common_speculative_state_dflash() override {
|
||||||
@ -381,42 +232,6 @@ struct common_speculative_state_dflash : public common_speculative_state {
|
|||||||
GGML_UNUSED(prompt);
|
GGML_UNUSED(prompt);
|
||||||
llama_kv_cache_clear(ctx_dft);
|
llama_kv_cache_clear(ctx_dft);
|
||||||
llama_reset_dflash_kv_cache_state(ctx_dft);
|
llama_reset_dflash_kv_cache_state(ctx_dft);
|
||||||
n_window_updates = 0;
|
|
||||||
n_rows_seen = 0;
|
|
||||||
n_rows_dropped = 0;
|
|
||||||
n_context_shifts = 0;
|
|
||||||
n_draft_empty = 0;
|
|
||||||
n_set_target_fail = 0;
|
|
||||||
n_decode_fail = 0;
|
|
||||||
last_draft_pos_base = -1;
|
|
||||||
t_draft_decode_us = 0;
|
|
||||||
t_draft_sample_us = 0;
|
|
||||||
t_warmup_collect_us = 0;
|
|
||||||
t_warmup_append_us = 0;
|
|
||||||
t_accept_output_copy_us = 0;
|
|
||||||
t_accept_commit_us = 0;
|
|
||||||
t_accept_append_us = 0;
|
|
||||||
t_accept_append_filter_us = 0;
|
|
||||||
t_accept_append_window_alloc_us = 0;
|
|
||||||
t_accept_append_replace_us = 0;
|
|
||||||
t_accept_append_keep_old_us = 0;
|
|
||||||
t_accept_append_new_rows_us = 0;
|
|
||||||
t_accept_append_commit_detail_us = 0;
|
|
||||||
t_accept_append_log_us = 0;
|
|
||||||
n_warmup_collect_calls = 0;
|
|
||||||
n_warmup_collect_rows = 0;
|
|
||||||
n_warmup_append_calls = 0;
|
|
||||||
n_warmup_append_rows = 0;
|
|
||||||
n_accept_output_copy_calls = 0;
|
|
||||||
n_accept_output_copy_rows = 0;
|
|
||||||
n_accept_commit_calls = 0;
|
|
||||||
n_accept_commit_rows = 0;
|
|
||||||
n_accept_append_calls = 0;
|
|
||||||
n_accept_append_rows = 0;
|
|
||||||
n_accept_append_replace_calls = 0;
|
|
||||||
n_accept_append_slide_calls = 0;
|
|
||||||
llama_dflash_profile_reset(ctx_tgt);
|
|
||||||
llama_dflash_profile_reset(ctx_dft);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void draft(
|
void draft(
|
||||||
@ -428,7 +243,6 @@ struct common_speculative_state_dflash : public common_speculative_state {
|
|||||||
|
|
||||||
result.clear();
|
result.clear();
|
||||||
if (!ready || target_window_rows <= 0) {
|
if (!ready || target_window_rows <= 0) {
|
||||||
n_draft_empty++;
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -461,7 +275,6 @@ struct common_speculative_state_dflash : public common_speculative_state {
|
|||||||
|
|
||||||
if (!llama_set_dflash_target_features_view(ctx_dft, target_features, target_feature_floats, target_window_rows, target_window_pos.data(), &window_update)) {
|
if (!llama_set_dflash_target_features_view(ctx_dft, target_features, target_feature_floats, target_window_rows, target_window_pos.data(), &window_update)) {
|
||||||
LOG_ERR("%s: failed to set DFlash target features\n", __func__);
|
LOG_ERR("%s: failed to set DFlash target features\n", __func__);
|
||||||
n_set_target_fail++;
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -470,23 +283,18 @@ struct common_speculative_state_dflash : public common_speculative_state {
|
|||||||
const int32_t batch_len = n_keep + 1;
|
const int32_t batch_len = n_keep + 1;
|
||||||
const llama_pos draft_pos_base = last_target_pos >= 0 ? last_target_pos + 1 : (llama_pos) target_window_rows;
|
const llama_pos draft_pos_base = last_target_pos >= 0 ? last_target_pos + 1 : (llama_pos) target_window_rows;
|
||||||
const llama_pos seed_pos = last_target_pos >= 0 ? last_target_pos : draft_pos_base - 1;
|
const llama_pos seed_pos = last_target_pos >= 0 ? last_target_pos : draft_pos_base - 1;
|
||||||
last_draft_pos_base = draft_pos_base;
|
|
||||||
common_batch_add(batch, id_last, seed_pos, { 0 }, false);
|
common_batch_add(batch, id_last, seed_pos, { 0 }, false);
|
||||||
for (int32_t i = 1; i < batch_len; ++i) {
|
for (int32_t i = 1; i < batch_len; ++i) {
|
||||||
common_batch_add(batch, mask_token_id, draft_pos_base + (i - 1), { 0 }, i <= n_keep);
|
common_batch_add(batch, mask_token_id, draft_pos_base + (i - 1), { 0 }, i <= n_keep);
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t t_decode_us = ggml_time_us();
|
|
||||||
if (llama_decode(ctx_dft, batch) != 0) {
|
if (llama_decode(ctx_dft, batch) != 0) {
|
||||||
LOG_ERR("%s: llama_decode() failed for DFlash draft batch\n", __func__);
|
LOG_ERR("%s: llama_decode() failed for DFlash draft batch\n", __func__);
|
||||||
n_decode_fail++;
|
|
||||||
batch.n_tokens = 0;
|
batch.n_tokens = 0;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
t_draft_decode_us += (uint64_t) (ggml_time_us() - t_decode_us);
|
|
||||||
|
|
||||||
result.reserve((size_t) n_keep);
|
result.reserve((size_t) n_keep);
|
||||||
const int64_t t_sample_us = ggml_time_us();
|
|
||||||
for (int32_t i = 0; i < n_keep; ++i) {
|
for (int32_t i = 0; i < n_keep; ++i) {
|
||||||
llama_token id = llama_get_dflash_draft_token_ith(ctx_dft, i);
|
llama_token id = llama_get_dflash_draft_token_ith(ctx_dft, i);
|
||||||
if (id == LLAMA_TOKEN_NULL) {
|
if (id == LLAMA_TOKEN_NULL) {
|
||||||
@ -494,10 +302,8 @@ struct common_speculative_state_dflash : public common_speculative_state {
|
|||||||
}
|
}
|
||||||
result.push_back(id);
|
result.push_back(id);
|
||||||
}
|
}
|
||||||
t_draft_sample_us += (uint64_t) (ggml_time_us() - t_sample_us);
|
|
||||||
|
|
||||||
batch.n_tokens = 0;
|
batch.n_tokens = 0;
|
||||||
dflash_contract_log_draft(*this, n_keep, result.size());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void accept(uint16_t n_accepted) override {
|
void accept(uint16_t n_accepted) override {
|
||||||
@ -505,104 +311,6 @@ struct common_speculative_state_dflash : public common_speculative_state {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static void dflash_contract_log_append(
|
|
||||||
const common_speculative_state_dflash & state,
|
|
||||||
llama_seq_id seq_id,
|
|
||||||
const std::vector<llama_pos> & new_positions) {
|
|
||||||
if (!dflash_contract_log_enabled()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::atomic<uint64_t> counter = 0;
|
|
||||||
const uint64_t ordinal = counter.fetch_add(1, std::memory_order_relaxed);
|
|
||||||
if (ordinal >= 8) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const dflash_contract_pos_summary incoming = dflash_contract_summarize_positions(new_positions);
|
|
||||||
const dflash_contract_pos_summary window = dflash_contract_summarize_positions(state.target_window_pos);
|
|
||||||
|
|
||||||
LOG_INF("dflash contract append[%llu]: seq=%d incoming_rows=%zu incoming_pos=%s pos=[%d..%d] gaps=%d nonmono=%d window_rows=%d window_pos=%s pos=[%d..%d] gaps=%d nonmono=%d last_target_pos=%d\n",
|
|
||||||
(unsigned long long) (ordinal + 1),
|
|
||||||
(int) seq_id,
|
|
||||||
new_positions.size(),
|
|
||||||
dflash_contract_format_values(new_positions).c_str(),
|
|
||||||
(int) incoming.first,
|
|
||||||
(int) incoming.last,
|
|
||||||
incoming.gap_count,
|
|
||||||
incoming.nonmono_count,
|
|
||||||
state.target_window_rows,
|
|
||||||
dflash_contract_format_values(state.target_window_pos).c_str(),
|
|
||||||
(int) window.first,
|
|
||||||
(int) window.last,
|
|
||||||
window.gap_count,
|
|
||||||
window.nonmono_count,
|
|
||||||
(int) state.last_target_pos);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void dflash_contract_log_draft(
|
|
||||||
const common_speculative_state_dflash & state,
|
|
||||||
int32_t n_keep,
|
|
||||||
size_t result_size) {
|
|
||||||
if (!dflash_contract_log_enabled()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::atomic<uint64_t> counter = 0;
|
|
||||||
const uint64_t ordinal = counter.fetch_add(1, std::memory_order_relaxed);
|
|
||||||
if (ordinal >= 8) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const dflash_contract_pos_summary window = dflash_contract_summarize_positions(state.target_window_pos);
|
|
||||||
llama_dflash_profile_stats graph_stats = {};
|
|
||||||
llama_dflash_profile_get_stats(state.ctx_dft, &graph_stats);
|
|
||||||
const int draft_delta = (state.last_target_pos >= 0 && state.last_draft_pos_base >= 0)
|
|
||||||
? (int) (state.last_draft_pos_base - state.last_target_pos)
|
|
||||||
: -1;
|
|
||||||
const llama_pos seed_pos = state.last_target_pos;
|
|
||||||
const llama_pos mask_first_pos = state.last_draft_pos_base;
|
|
||||||
const llama_pos mask_last_pos = state.last_draft_pos_base >= 0
|
|
||||||
? state.last_draft_pos_base + n_keep - 1
|
|
||||||
: -1;
|
|
||||||
|
|
||||||
LOG_INF("dflash contract draft[%llu]: window_rows=%d window_pos=%s pos=[%d..%d] gaps=%d nonmono=%d last_target_pos=%d seed_pos=%d mask_pos=[%d..%d] sample_rows=[1..%d] output_rows=[1..%d] draft_pos_base=%d delta=%d n_keep=%d result=%zu set_target(missing/nonmono)=%llu/%llu graph(fallback/nonmono)=%llu/%llu graph_pos=[%d..%d]\n",
|
|
||||||
(unsigned long long) (ordinal + 1),
|
|
||||||
state.target_window_rows,
|
|
||||||
dflash_contract_format_values(state.target_window_pos).c_str(),
|
|
||||||
(int) window.first,
|
|
||||||
(int) window.last,
|
|
||||||
window.gap_count,
|
|
||||||
window.nonmono_count,
|
|
||||||
(int) state.last_target_pos,
|
|
||||||
(int) seed_pos,
|
|
||||||
(int) mask_first_pos,
|
|
||||||
(int) mask_last_pos,
|
|
||||||
n_keep,
|
|
||||||
n_keep,
|
|
||||||
(int) state.last_draft_pos_base,
|
|
||||||
draft_delta,
|
|
||||||
n_keep,
|
|
||||||
result_size,
|
|
||||||
(unsigned long long) graph_stats.set_target_missing_positions,
|
|
||||||
(unsigned long long) graph_stats.set_target_non_monotonic_positions,
|
|
||||||
(unsigned long long) graph_stats.graph_pos_fallbacks,
|
|
||||||
(unsigned long long) graph_stats.graph_pos_non_monotonic,
|
|
||||||
(int) graph_stats.last_pos_first,
|
|
||||||
(int) graph_stats.last_pos_last);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct dflash_append_breakdown {
|
|
||||||
uint64_t filter_us = 0;
|
|
||||||
uint64_t window_alloc_us = 0;
|
|
||||||
uint64_t replace_us = 0;
|
|
||||||
uint64_t keep_old_us = 0;
|
|
||||||
uint64_t new_rows_us = 0;
|
|
||||||
uint64_t commit_us = 0;
|
|
||||||
uint64_t log_us = 0;
|
|
||||||
bool replace_call = false;
|
|
||||||
};
|
|
||||||
|
|
||||||
static void dflash_record_window_update(
|
static void dflash_record_window_update(
|
||||||
common_speculative_state_dflash & state,
|
common_speculative_state_dflash & state,
|
||||||
int32_t keep_rows,
|
int32_t keep_rows,
|
||||||
@ -696,11 +404,7 @@ static void dflash_materialize_target_window_features(common_speculative_state_d
|
|||||||
static bool dflash_append_target_features(
|
static bool dflash_append_target_features(
|
||||||
common_speculative_state_dflash & state,
|
common_speculative_state_dflash & state,
|
||||||
const common_speculative_feature_view & features,
|
const common_speculative_feature_view & features,
|
||||||
const llama_batch & batch,
|
llama_seq_id seq_id) {
|
||||||
llama_seq_id seq_id,
|
|
||||||
dflash_append_breakdown * breakdown = nullptr) {
|
|
||||||
GGML_UNUSED(batch);
|
|
||||||
|
|
||||||
if (features.kind != COMMON_SPECULATIVE_FEATURE_HIDDEN_STATE ||
|
if (features.kind != COMMON_SPECULATIVE_FEATURE_HIDDEN_STATE ||
|
||||||
features.width != state.n_target_features ||
|
features.width != state.n_target_features ||
|
||||||
features.rows.empty() ||
|
features.rows.empty() ||
|
||||||
@ -714,7 +418,6 @@ static bool dflash_append_target_features(
|
|||||||
new_rows.reserve(features.rows.size() * row_width);
|
new_rows.reserve(features.rows.size() * row_width);
|
||||||
new_positions.reserve(features.rows.size());
|
new_positions.reserve(features.rows.size());
|
||||||
|
|
||||||
const int64_t t_filter_us = ggml_time_us();
|
|
||||||
for (const auto & row : features.rows) {
|
for (const auto & row : features.rows) {
|
||||||
if (row.seq_id != seq_id || row.data == nullptr) {
|
if (row.seq_id != seq_id || row.data == nullptr) {
|
||||||
continue;
|
continue;
|
||||||
@ -723,89 +426,45 @@ static bool dflash_append_target_features(
|
|||||||
new_positions.push_back(row.pos);
|
new_positions.push_back(row.pos);
|
||||||
new_rows.insert(new_rows.end(), row.data, row.data + row_width);
|
new_rows.insert(new_rows.end(), row.data, row.data + row_width);
|
||||||
}
|
}
|
||||||
if (breakdown != nullptr) {
|
|
||||||
breakdown->filter_us += (uint64_t) (ggml_time_us() - t_filter_us);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (new_positions.empty()) {
|
if (new_positions.empty()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int32_t n_rows = (int32_t) new_positions.size();
|
const int32_t n_rows = (int32_t) new_positions.size();
|
||||||
state.n_window_updates++;
|
|
||||||
state.n_rows_seen += (size_t) n_rows;
|
|
||||||
if (n_rows >= state.cross_ctx) {
|
if (n_rows >= state.cross_ctx) {
|
||||||
state.n_rows_dropped += (size_t) state.target_window_rows + (size_t) (n_rows - state.cross_ctx);
|
|
||||||
const int32_t keep_from = n_rows - state.cross_ctx;
|
const int32_t keep_from = n_rows - state.cross_ctx;
|
||||||
const int64_t t_replace_us = ggml_time_us();
|
|
||||||
state.target_window_pos.assign(new_positions.begin() + keep_from, new_positions.end());
|
state.target_window_pos.assign(new_positions.begin() + keep_from, new_positions.end());
|
||||||
state.target_window_append_features.assign(
|
state.target_window_append_features.assign(
|
||||||
new_rows.begin() + (ptrdiff_t) keep_from * (ptrdiff_t) row_width,
|
new_rows.begin() + (ptrdiff_t) keep_from * (ptrdiff_t) row_width,
|
||||||
new_rows.end());
|
new_rows.end());
|
||||||
dflash_ring_reset_rows(state, state.target_window_append_features.data(), state.cross_ctx);
|
dflash_ring_reset_rows(state, state.target_window_append_features.data(), state.cross_ctx);
|
||||||
if (breakdown != nullptr) {
|
|
||||||
breakdown->replace_us += (uint64_t) (ggml_time_us() - t_replace_us);
|
|
||||||
breakdown->replace_call = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int64_t t_commit_us = ggml_time_us();
|
|
||||||
state.target_window_rows = state.cross_ctx;
|
state.target_window_rows = state.cross_ctx;
|
||||||
state.target_window_ring_filled = state.target_window_rows;
|
state.target_window_ring_filled = state.target_window_rows;
|
||||||
state.last_target_pos = state.target_window_pos.empty() ? -1 : state.target_window_pos.back();
|
state.last_target_pos = state.target_window_pos.empty() ? -1 : state.target_window_pos.back();
|
||||||
dflash_record_window_update(state, 0, state.target_window_rows, true);
|
dflash_record_window_update(state, 0, state.target_window_rows, true);
|
||||||
if (breakdown != nullptr) {
|
|
||||||
breakdown->commit_us += (uint64_t) (ggml_time_us() - t_commit_us);
|
|
||||||
}
|
|
||||||
|
|
||||||
const int64_t t_log_us = ggml_time_us();
|
|
||||||
dflash_contract_log_append(state, seq_id, new_positions);
|
|
||||||
if (breakdown != nullptr) {
|
|
||||||
breakdown->log_us += (uint64_t) (ggml_time_us() - t_log_us);
|
|
||||||
}
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int32_t keep_old_rows = std::min<int32_t>(state.target_window_rows, state.cross_ctx - n_rows);
|
const int32_t keep_old_rows = std::min<int32_t>(state.target_window_rows, state.cross_ctx - n_rows);
|
||||||
state.n_rows_dropped += (size_t) std::max<int32_t>(0, state.target_window_rows - keep_old_rows);
|
|
||||||
const int64_t t_window_alloc_us = ggml_time_us();
|
|
||||||
std::vector<llama_pos> & next_window_pos = state.target_window_pos_stage;
|
std::vector<llama_pos> & next_window_pos = state.target_window_pos_stage;
|
||||||
next_window_pos.resize((size_t) (keep_old_rows + n_rows));
|
next_window_pos.resize((size_t) (keep_old_rows + n_rows));
|
||||||
if (breakdown != nullptr) {
|
|
||||||
breakdown->window_alloc_us += (uint64_t) (ggml_time_us() - t_window_alloc_us);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (keep_old_rows > 0) {
|
if (keep_old_rows > 0) {
|
||||||
const int64_t t_keep_old_us = ggml_time_us();
|
|
||||||
std::copy(state.target_window_pos.end() - keep_old_rows, state.target_window_pos.end(), next_window_pos.begin());
|
std::copy(state.target_window_pos.end() - keep_old_rows, state.target_window_pos.end(), next_window_pos.begin());
|
||||||
if (breakdown != nullptr) {
|
|
||||||
breakdown->keep_old_us += (uint64_t) (ggml_time_us() - t_keep_old_us);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t t_new_rows_us = ggml_time_us();
|
|
||||||
state.target_window_append_features.assign(new_rows.begin(), new_rows.end());
|
state.target_window_append_features.assign(new_rows.begin(), new_rows.end());
|
||||||
dflash_ring_append_rows(state, state.target_window_append_features.data(), n_rows);
|
dflash_ring_append_rows(state, state.target_window_append_features.data(), n_rows);
|
||||||
std::copy(new_positions.begin(), new_positions.end(), next_window_pos.begin() + keep_old_rows);
|
std::copy(new_positions.begin(), new_positions.end(), next_window_pos.begin() + keep_old_rows);
|
||||||
if (breakdown != nullptr) {
|
|
||||||
breakdown->new_rows_us += (uint64_t) (ggml_time_us() - t_new_rows_us);
|
|
||||||
}
|
|
||||||
|
|
||||||
const int64_t t_commit_us = ggml_time_us();
|
|
||||||
state.target_window_pos.swap(next_window_pos);
|
state.target_window_pos.swap(next_window_pos);
|
||||||
next_window_pos.clear();
|
next_window_pos.clear();
|
||||||
state.target_window_rows = keep_old_rows + n_rows;
|
state.target_window_rows = keep_old_rows + n_rows;
|
||||||
state.target_window_ring_filled = state.target_window_rows;
|
state.target_window_ring_filled = state.target_window_rows;
|
||||||
state.last_target_pos = state.target_window_pos.empty() ? -1 : state.target_window_pos.back();
|
state.last_target_pos = state.target_window_pos.empty() ? -1 : state.target_window_pos.back();
|
||||||
dflash_record_window_update(state, keep_old_rows, n_rows, false);
|
dflash_record_window_update(state, keep_old_rows, n_rows, false);
|
||||||
if (breakdown != nullptr) {
|
|
||||||
breakdown->commit_us += (uint64_t) (ggml_time_us() - t_commit_us);
|
|
||||||
}
|
|
||||||
|
|
||||||
const int64_t t_log_us = ggml_time_us();
|
|
||||||
dflash_contract_log_append(state, seq_id, new_positions);
|
|
||||||
if (breakdown != nullptr) {
|
|
||||||
breakdown->log_us += (uint64_t) (ggml_time_us() - t_log_us);
|
|
||||||
}
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -868,5 +527,4 @@ static void dflash_context_shift(
|
|||||||
state.last_target_pos = state.target_window_pos.empty() ? -1 : state.target_window_pos.back();
|
state.last_target_pos = state.target_window_pos.empty() ? -1 : state.target_window_pos.back();
|
||||||
dflash_record_window_update(state, 0, state.target_window_rows, true);
|
dflash_record_window_update(state, 0, state.target_window_rows, true);
|
||||||
llama_reset_dflash_kv_cache_state(state.ctx_dft);
|
llama_reset_dflash_kv_cache_state(state.ctx_dft);
|
||||||
state.n_context_shifts++;
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2039,8 +2039,6 @@ int32_t common_speculative_on_target_seq_batch(
|
|||||||
const llama_batch * batch_for_spec = &batch;
|
const llama_batch * batch_for_spec = &batch;
|
||||||
llama_batch seq_batch = {};
|
llama_batch seq_batch = {};
|
||||||
const bool needs_seq_split = is_prompt_warmup && !common_speculative_batch_is_exact_single_seq(batch, seq_id);
|
const bool needs_seq_split = is_prompt_warmup && !common_speculative_batch_is_exact_single_seq(batch, seq_id);
|
||||||
auto * dflash_state = common_speculative_get_dflash_state(spec);
|
|
||||||
const bool measure_dflash_warmup_collect = dflash_state != nullptr && is_prompt_warmup;
|
|
||||||
|
|
||||||
if (needs_seq_split) {
|
if (needs_seq_split) {
|
||||||
const int n_seq_tokens = common_speculative_copy_seq_batch(batch, seq_id, seq_batch);
|
const int n_seq_tokens = common_speculative_copy_seq_batch(batch, seq_id, seq_batch);
|
||||||
@ -2048,28 +2046,16 @@ int32_t common_speculative_on_target_seq_batch(
|
|||||||
return n_seq_tokens < 0 ? -1 : 0;
|
return n_seq_tokens < 0 ? -1 : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t t_collect_us = measure_dflash_warmup_collect ? ggml_time_us() : 0;
|
|
||||||
if (!common_speculative_collect_target_seq_batch_features(spec, ctx_tgt, batch, seq_id, feature_view)) {
|
if (!common_speculative_collect_target_seq_batch_features(spec, ctx_tgt, batch, seq_id, feature_view)) {
|
||||||
llama_batch_free(seq_batch);
|
llama_batch_free(seq_batch);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
if (measure_dflash_warmup_collect) {
|
|
||||||
dflash_state->t_warmup_collect_us += (uint64_t) (ggml_time_us() - t_collect_us);
|
|
||||||
dflash_state->n_warmup_collect_calls++;
|
|
||||||
dflash_state->n_warmup_collect_rows += (size_t) n_seq_tokens;
|
|
||||||
}
|
|
||||||
|
|
||||||
batch_for_spec = &seq_batch;
|
batch_for_spec = &seq_batch;
|
||||||
} else {
|
} else {
|
||||||
const int64_t t_collect_us = measure_dflash_warmup_collect ? ggml_time_us() : 0;
|
|
||||||
if (!common_speculative_collect_target_batch_features(spec, ctx_tgt, batch, feature_view)) {
|
if (!common_speculative_collect_target_batch_features(spec, ctx_tgt, batch, feature_view)) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
if (measure_dflash_warmup_collect) {
|
|
||||||
dflash_state->t_warmup_collect_us += (uint64_t) (ggml_time_us() - t_collect_us);
|
|
||||||
dflash_state->n_warmup_collect_calls++;
|
|
||||||
dflash_state->n_warmup_collect_rows += (size_t) batch.n_tokens;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const int32_t ret = common_speculative_on_target_batch(spec, *batch_for_spec, feature_view, is_prompt_warmup);
|
const int32_t ret = common_speculative_on_target_batch(spec, *batch_for_spec, feature_view, is_prompt_warmup);
|
||||||
@ -2170,16 +2156,7 @@ bool common_speculative_commit_accepted_hidden_rows(
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto * dflash_state = common_speculative_get_dflash_state(spec);
|
return common_speculative_apply_hidden_rows(spec, seq_id, pos_base, commit_tokens, hidden_rows);
|
||||||
const int64_t t_commit_us = dflash_state != nullptr ? ggml_time_us() : 0;
|
|
||||||
const bool ok = common_speculative_apply_hidden_rows(spec, seq_id, pos_base, commit_tokens, hidden_rows);
|
|
||||||
if (dflash_state != nullptr) {
|
|
||||||
dflash_state->t_accept_commit_us += (uint64_t) (ggml_time_us() - t_commit_us);
|
|
||||||
dflash_state->n_accept_commit_calls++;
|
|
||||||
dflash_state->n_accept_commit_rows += commit_tokens.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
return ok;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool common_speculative_commit_accepted_output(
|
bool common_speculative_commit_accepted_output(
|
||||||
@ -2196,16 +2173,9 @@ bool common_speculative_commit_accepted_output(
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<float> hidden_rows;
|
std::vector<float> hidden_rows;
|
||||||
auto * dflash_state = common_speculative_get_dflash_state(spec);
|
|
||||||
const int64_t t_copy_us = dflash_state != nullptr ? ggml_time_us() : 0;
|
|
||||||
if (!common_speculative_copy_output_hidden_rows(spec, ctx, output_indices, hidden_rows)) {
|
if (!common_speculative_copy_output_hidden_rows(spec, ctx, output_indices, hidden_rows)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (dflash_state != nullptr) {
|
|
||||||
dflash_state->t_accept_output_copy_us += (uint64_t) (ggml_time_us() - t_copy_us);
|
|
||||||
dflash_state->n_accept_output_copy_calls++;
|
|
||||||
dflash_state->n_accept_output_copy_rows += output_indices.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
return common_speculative_commit_accepted_hidden_rows(
|
return common_speculative_commit_accepted_hidden_rows(
|
||||||
spec,
|
spec,
|
||||||
@ -2471,341 +2441,6 @@ void common_speculative_print_stats(const common_speculative * spec, double slot
|
|||||||
impl->n_acc_tokens,
|
impl->n_acc_tokens,
|
||||||
str_perf.c_str());
|
str_perf.c_str());
|
||||||
|
|
||||||
if (impl->type == COMMON_SPECULATIVE_TYPE_DFLASH) {
|
|
||||||
const auto * dflash_state = dynamic_cast<const common_speculative_state_dflash *>(impl.get());
|
|
||||||
if (dflash_state != nullptr && dflash_stats_log_enabled()) {
|
|
||||||
llama_dflash_profile_stats capture_stats;
|
|
||||||
llama_dflash_profile_stats graph_stats;
|
|
||||||
const bool have_capture = llama_dflash_profile_get_stats(dflash_state->ctx_tgt, &capture_stats);
|
|
||||||
const bool have_graph = llama_dflash_profile_get_stats(dflash_state->ctx_dft, &graph_stats);
|
|
||||||
|
|
||||||
LOG_INF("statistics dflash detail: cross_ctx=%d, window_rows=%d, pos=[%d..%d], window_updates=%zu, rows_seen=%zu, rows_dropped=%zu, shifts=%zu, draft_fail(empty/set/decode)=%zu/%zu/%zu, next_draft_pos=%d\n",
|
|
||||||
dflash_state->cross_ctx,
|
|
||||||
dflash_state->target_window_rows,
|
|
||||||
dflash_state->target_window_pos.empty() ? -1 : (int) dflash_state->target_window_pos.front(),
|
|
||||||
dflash_state->target_window_pos.empty() ? -1 : (int) dflash_state->target_window_pos.back(),
|
|
||||||
dflash_state->n_window_updates,
|
|
||||||
dflash_state->n_rows_seen,
|
|
||||||
dflash_state->n_rows_dropped,
|
|
||||||
dflash_state->n_context_shifts,
|
|
||||||
dflash_state->n_draft_empty,
|
|
||||||
dflash_state->n_set_target_fail,
|
|
||||||
dflash_state->n_decode_fail,
|
|
||||||
(int) dflash_state->last_draft_pos_base);
|
|
||||||
|
|
||||||
if (have_capture || have_graph) {
|
|
||||||
const double kv_cache_total_ms = (double) (
|
|
||||||
graph_stats.graph_kv_cache_build_us +
|
|
||||||
graph_stats.graph_kv_cache_reserve_us +
|
|
||||||
graph_stats.graph_kv_cache_reset_us +
|
|
||||||
graph_stats.graph_kv_cache_alloc_us +
|
|
||||||
graph_stats.graph_kv_cache_feature_upload_us +
|
|
||||||
graph_stats.graph_kv_cache_pos_upload_us +
|
|
||||||
graph_stats.graph_kv_cache_compute_us +
|
|
||||||
graph_stats.graph_kv_cache_sync_us +
|
|
||||||
graph_stats.graph_kv_cache_read_concat_pad_us) / 1000.0;
|
|
||||||
const double kv_upload_feature_ms = (double) graph_stats.graph_kv_cache_feature_upload_us / 1000.0;
|
|
||||||
const double kv_upload_pos_ms = (double) graph_stats.graph_kv_cache_pos_upload_us / 1000.0;
|
|
||||||
const double kv_upload_total_ms = kv_upload_feature_ms + kv_upload_pos_ms;
|
|
||||||
const double kv_compute_ms = (double) graph_stats.graph_kv_cache_compute_us / 1000.0;
|
|
||||||
const double kv_sync_ms = (double) graph_stats.graph_kv_cache_sync_us / 1000.0;
|
|
||||||
const double kv_workspace_total_ms = (double) (
|
|
||||||
graph_stats.graph_kv_workspace_build_us +
|
|
||||||
graph_stats.graph_kv_workspace_reserve_us +
|
|
||||||
graph_stats.graph_kv_workspace_reset_us +
|
|
||||||
graph_stats.graph_kv_workspace_alloc_us +
|
|
||||||
graph_stats.graph_kv_workspace_compute_us +
|
|
||||||
graph_stats.graph_kv_workspace_sync_us) / 1000.0;
|
|
||||||
const double draft_kv_traffic_ms = (double) (
|
|
||||||
graph_stats.graph_main_node_k_ctx_view_us +
|
|
||||||
graph_stats.graph_main_node_v_ctx_view_us +
|
|
||||||
graph_stats.graph_main_node_k_concat_us +
|
|
||||||
graph_stats.graph_main_node_v_concat_us +
|
|
||||||
graph_stats.graph_main_node_k_pad_us +
|
|
||||||
graph_stats.graph_main_node_v_pad_us +
|
|
||||||
graph_stats.graph_main_node_k_perm_cont_us +
|
|
||||||
graph_stats.graph_main_node_v_perm_cont_us) / 1000.0;
|
|
||||||
const double draft_main_profiled_ms = (double) (
|
|
||||||
graph_stats.graph_main_node_qcur_us +
|
|
||||||
graph_stats.graph_main_node_k_draft_us +
|
|
||||||
graph_stats.graph_main_node_v_draft_us +
|
|
||||||
graph_stats.graph_main_node_flash_attn_us +
|
|
||||||
graph_stats.graph_main_node_attn_out_us +
|
|
||||||
graph_stats.graph_main_node_ffn_us +
|
|
||||||
graph_stats.graph_main_node_result_rows_us +
|
|
||||||
graph_stats.graph_main_node_result_norm_us +
|
|
||||||
graph_stats.graph_main_node_result_us) / 1000.0;
|
|
||||||
const double replay_append_ms = (double) dflash_state->t_accept_append_us / 1000.0;
|
|
||||||
const double feature_path_ms = (double) (
|
|
||||||
capture_stats.capture_prepare_sync_us +
|
|
||||||
capture_stats.capture_materialize_us +
|
|
||||||
graph_stats.set_target_copy_us +
|
|
||||||
graph_stats.graph_feature_copy_us +
|
|
||||||
graph_stats.graph_pos_copy_us +
|
|
||||||
graph_stats.graph_mask_build_us) / 1000.0;
|
|
||||||
const double decode_internal_ms = (double) (
|
|
||||||
graph_stats.decode_prelude_us +
|
|
||||||
graph_stats.decode_sched_reset_us +
|
|
||||||
graph_stats.decode_build_graph_us +
|
|
||||||
graph_stats.decode_sched_alloc_graph_us +
|
|
||||||
graph_stats.decode_prepare_us +
|
|
||||||
graph_stats.decode_set_inputs_us +
|
|
||||||
graph_stats.decode_graph_compute_us +
|
|
||||||
graph_stats.decode_result_us +
|
|
||||||
graph_stats.decode_embedding_us +
|
|
||||||
graph_stats.decode_final_sched_reset_us) / 1000.0;
|
|
||||||
|
|
||||||
LOG_INF("statistics dflash profile: capture(sync/materialize)=%.3f/%.3f ms calls=%llu/%llu bytes=%llu phase(prompt/verify batches changes)=%llu/%llu %llu/%llu, set_target=%.3f ms rows=%llu bytes=%llu, decode(llama_output_reserve/prepare)=%.3f/%.3f ms calls=%llu/%llu realloc(bytes)=%llu/%llu, prep(total/features/pos/mask)=%.3f/%.3f/%.3f/%.3f ms kv_cache(total/build/reserve/reset/alloc/up_f/up_p/compute/sync/read)=%.3f/%.3f/%.3f/%.3f/%.3f/%.3f/%.3f/%.3f/%.3f/%.3f ms calls(prepare/cache/read)=%llu/%llu/%llu bytes(feature/pos/mask/read)=%llu/%llu/%llu/%llu host_layers=%d, fallback_pos(copy/graph)=%llu/%llu, nonmono(copy/graph)=%llu/%llu, capture_fail=%llu/%llu decode_prepare_fail=%llu, visible_kv_max=%llu, last(rows=%d width=%d left_pad=%d n_tokens=%d n_kv=%d pos=[%d..%d])\n",
|
|
||||||
(double) capture_stats.capture_prepare_sync_us / 1000.0,
|
|
||||||
(double) capture_stats.capture_materialize_us / 1000.0,
|
|
||||||
(unsigned long long) capture_stats.capture_prepare_calls,
|
|
||||||
(unsigned long long) capture_stats.capture_materialize_calls,
|
|
||||||
(unsigned long long) capture_stats.capture_materialize_bytes,
|
|
||||||
(unsigned long long) capture_stats.capture_prompt_batches,
|
|
||||||
(unsigned long long) capture_stats.capture_prompt_shape_changes,
|
|
||||||
(unsigned long long) capture_stats.capture_verify_batches,
|
|
||||||
(unsigned long long) capture_stats.capture_verify_shape_changes,
|
|
||||||
(double) graph_stats.set_target_copy_us / 1000.0,
|
|
||||||
(unsigned long long) graph_stats.set_target_rows,
|
|
||||||
(unsigned long long) graph_stats.set_target_copy_bytes,
|
|
||||||
(double) graph_stats.decode_output_reserve_us / 1000.0,
|
|
||||||
(double) graph_stats.decode_prepare_us / 1000.0,
|
|
||||||
(unsigned long long) graph_stats.decode_output_reserve_calls,
|
|
||||||
(unsigned long long) graph_stats.decode_prepare_calls,
|
|
||||||
(unsigned long long) graph_stats.decode_output_reserve_reallocs,
|
|
||||||
(unsigned long long) graph_stats.decode_output_reserve_realloc_bytes,
|
|
||||||
(double) graph_stats.graph_prepare_total_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_feature_copy_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_pos_copy_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_mask_build_us / 1000.0,
|
|
||||||
kv_cache_total_ms,
|
|
||||||
(double) graph_stats.graph_kv_cache_build_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_cache_reserve_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_cache_reset_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_cache_alloc_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_cache_feature_upload_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_cache_pos_upload_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_cache_compute_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_cache_sync_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_cache_read_concat_pad_us / 1000.0,
|
|
||||||
(unsigned long long) graph_stats.graph_prepare_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_kv_cache_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_kv_cache_read_concat_pad_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_feature_bytes,
|
|
||||||
(unsigned long long) graph_stats.graph_pos_bytes,
|
|
||||||
(unsigned long long) graph_stats.graph_mask_bytes,
|
|
||||||
(unsigned long long) graph_stats.graph_kv_cache_cached_bytes,
|
|
||||||
graph_stats.last_kv_cache_host_layers,
|
|
||||||
(unsigned long long) graph_stats.set_target_missing_positions,
|
|
||||||
(unsigned long long) graph_stats.graph_pos_fallbacks,
|
|
||||||
(unsigned long long) graph_stats.set_target_non_monotonic_positions,
|
|
||||||
(unsigned long long) graph_stats.graph_pos_non_monotonic,
|
|
||||||
(unsigned long long) capture_stats.capture_prepare_failures,
|
|
||||||
(unsigned long long) capture_stats.capture_materialize_failures,
|
|
||||||
(unsigned long long) graph_stats.decode_prepare_failures,
|
|
||||||
(unsigned long long) graph_stats.graph_visible_kv_max,
|
|
||||||
graph_stats.last_n_rows,
|
|
||||||
graph_stats.last_width,
|
|
||||||
graph_stats.last_left_pad,
|
|
||||||
graph_stats.last_n_tokens,
|
|
||||||
graph_stats.last_n_kv_total,
|
|
||||||
(int) graph_stats.last_pos_first,
|
|
||||||
(int) graph_stats.last_pos_last);
|
|
||||||
|
|
||||||
LOG_INF("statistics dflash features: total=%.3f ms capture(sync/materialize)=%.3f/%.3f ms set_target=%.3f ms prep(feature/pos/mask)=%.3f/%.3f/%.3f ms rows(materialize/set_target)=%llu/%llu bytes(materialize/set_target/feature/pos/mask)=%llu/%llu/%llu/%llu/%llu\n",
|
|
||||||
feature_path_ms,
|
|
||||||
(double) capture_stats.capture_prepare_sync_us / 1000.0,
|
|
||||||
(double) capture_stats.capture_materialize_us / 1000.0,
|
|
||||||
(double) graph_stats.set_target_copy_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_feature_copy_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_pos_copy_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_mask_build_us / 1000.0,
|
|
||||||
(unsigned long long) capture_stats.capture_materialize_rows,
|
|
||||||
(unsigned long long) graph_stats.set_target_rows,
|
|
||||||
(unsigned long long) capture_stats.capture_materialize_bytes,
|
|
||||||
(unsigned long long) graph_stats.set_target_copy_bytes,
|
|
||||||
(unsigned long long) graph_stats.graph_feature_bytes,
|
|
||||||
(unsigned long long) graph_stats.graph_pos_bytes,
|
|
||||||
(unsigned long long) graph_stats.graph_mask_bytes);
|
|
||||||
|
|
||||||
LOG_INF("statistics dflash kv: total=%.3f ms build/reserve/reset/alloc/upload_f/upload_p/compute/sync/read=%.3f/%.3f/%.3f/%.3f/%.3f/%.3f/%.3f/%.3f/%.3f ms calls=%llu cached_bytes=%llu host_layers=%d\n",
|
|
||||||
kv_cache_total_ms,
|
|
||||||
(double) graph_stats.graph_kv_cache_build_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_cache_reserve_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_cache_reset_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_cache_alloc_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_cache_feature_upload_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_cache_pos_upload_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_cache_compute_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_cache_sync_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_cache_read_concat_pad_us / 1000.0,
|
|
||||||
(unsigned long long) graph_stats.graph_kv_cache_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_kv_cache_cached_bytes,
|
|
||||||
graph_stats.last_kv_cache_host_layers);
|
|
||||||
|
|
||||||
if (graph_stats.graph_kv_workspace_calls > 0) {
|
|
||||||
LOG_INF("statistics dflash kv workspace: total=%.3f ms build/reserve/reset/alloc/compute/sync=%.3f/%.3f/%.3f/%.3f/%.3f/%.3f ms calls=%llu\n",
|
|
||||||
kv_workspace_total_ms,
|
|
||||||
(double) graph_stats.graph_kv_workspace_build_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_workspace_reserve_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_workspace_reset_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_workspace_alloc_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_workspace_compute_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_workspace_sync_us / 1000.0,
|
|
||||||
(unsigned long long) graph_stats.graph_kv_workspace_calls);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (graph_stats.decode_internal_chunks > 0) {
|
|
||||||
LOG_INF("statistics dflash decode: llama_decode(total)=%.3f ms calls=%zu chunks=%llu rebuilds=%llu sync_points=%llu internal(total/prelude/sched_reset/build/alloc/prepare/set_inputs/compute/get_result/get_embedding/final_reset)=%.3f/%.3f/%.3f/%.3f/%.3f/%.3f/%.3f/%.3f/%.3f/%.3f/%.3f ms\n",
|
|
||||||
(double) dflash_state->t_draft_decode_us / 1000.0,
|
|
||||||
dflash_state->n_call_draft,
|
|
||||||
(unsigned long long) graph_stats.decode_internal_chunks,
|
|
||||||
(unsigned long long) graph_stats.decode_graph_rebuilds,
|
|
||||||
(unsigned long long) graph_stats.decode_sync_profile_points,
|
|
||||||
decode_internal_ms,
|
|
||||||
(double) graph_stats.decode_prelude_us / 1000.0,
|
|
||||||
(double) graph_stats.decode_sched_reset_us / 1000.0,
|
|
||||||
(double) graph_stats.decode_build_graph_us / 1000.0,
|
|
||||||
(double) graph_stats.decode_sched_alloc_graph_us / 1000.0,
|
|
||||||
(double) graph_stats.decode_prepare_us / 1000.0,
|
|
||||||
(double) graph_stats.decode_set_inputs_us / 1000.0,
|
|
||||||
(double) graph_stats.decode_graph_compute_us / 1000.0,
|
|
||||||
(double) graph_stats.decode_result_us / 1000.0,
|
|
||||||
(double) graph_stats.decode_embedding_us / 1000.0,
|
|
||||||
(double) graph_stats.decode_final_sched_reset_us / 1000.0);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (graph_stats.graph_kv_node_fused_target_calls > 0 ||
|
|
||||||
graph_stats.graph_kv_node_k_proj_calls > 0 ||
|
|
||||||
graph_stats.graph_kv_node_k_norm_calls > 0 ||
|
|
||||||
graph_stats.graph_kv_node_k_rope_calls > 0 ||
|
|
||||||
graph_stats.graph_kv_node_v_proj_calls > 0 ||
|
|
||||||
graph_stats.graph_kv_node_k_store_calls > 0 ||
|
|
||||||
graph_stats.graph_kv_node_v_store_calls > 0) {
|
|
||||||
LOG_INF("statistics dflash kv nodes: fused_target/k_proj/k_norm/k_rope/v_proj/k_store/v_store=%.3f/%.3f/%.3f/%.3f/%.3f/%.3f/%.3f ms calls=%llu/%llu/%llu/%llu/%llu/%llu/%llu\n",
|
|
||||||
(double) graph_stats.graph_kv_node_fused_target_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_node_k_proj_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_node_k_norm_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_node_k_rope_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_node_v_proj_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_node_k_store_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_kv_node_v_store_us / 1000.0,
|
|
||||||
(unsigned long long) graph_stats.graph_kv_node_fused_target_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_kv_node_k_proj_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_kv_node_k_norm_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_kv_node_k_rope_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_kv_node_v_proj_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_kv_node_k_store_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_kv_node_v_store_calls);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (graph_stats.graph_main_node_qcur_calls > 0 ||
|
|
||||||
graph_stats.graph_main_node_k_draft_calls > 0 ||
|
|
||||||
graph_stats.graph_main_node_v_draft_calls > 0 ||
|
|
||||||
graph_stats.graph_main_node_flash_attn_calls > 0 ||
|
|
||||||
graph_stats.graph_main_node_attn_out_calls > 0 ||
|
|
||||||
graph_stats.graph_main_node_ffn_calls > 0 ||
|
|
||||||
graph_stats.graph_main_node_result_rows_calls > 0 ||
|
|
||||||
graph_stats.graph_main_node_result_norm_calls > 0 ||
|
|
||||||
graph_stats.graph_main_node_result_calls > 0) {
|
|
||||||
LOG_INF("statistics dflash draft nodes: profiled=%.3f ms graph_compute=%.3f ms qcur/k_draft/v_draft/flash_attn/attn_out/ffn/result_rows/result_norm/result=%.3f/%.3f/%.3f/%.3f/%.3f/%.3f/%.3f/%.3f/%.3f ms calls=%llu/%llu/%llu/%llu/%llu/%llu/%llu/%llu/%llu\n",
|
|
||||||
draft_main_profiled_ms,
|
|
||||||
(double) graph_stats.decode_graph_compute_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_main_node_qcur_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_main_node_k_draft_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_main_node_v_draft_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_main_node_flash_attn_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_main_node_attn_out_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_main_node_ffn_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_main_node_result_rows_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_main_node_result_norm_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_main_node_result_us / 1000.0,
|
|
||||||
(unsigned long long) graph_stats.graph_main_node_qcur_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_main_node_k_draft_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_main_node_v_draft_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_main_node_flash_attn_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_main_node_attn_out_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_main_node_ffn_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_main_node_result_rows_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_main_node_result_norm_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_main_node_result_calls);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (graph_stats.graph_main_node_k_ctx_view_calls > 0 ||
|
|
||||||
graph_stats.graph_main_node_v_ctx_view_calls > 0 ||
|
|
||||||
graph_stats.graph_main_node_k_concat_calls > 0 ||
|
|
||||||
graph_stats.graph_main_node_v_concat_calls > 0 ||
|
|
||||||
graph_stats.graph_main_node_k_pad_calls > 0 ||
|
|
||||||
graph_stats.graph_main_node_v_pad_calls > 0 ||
|
|
||||||
graph_stats.graph_main_node_k_perm_cont_calls > 0 ||
|
|
||||||
graph_stats.graph_main_node_v_perm_cont_calls > 0) {
|
|
||||||
LOG_INF("statistics dflash draft kv traffic: total=%.3f ms graph_compute=%.3f ms k_ctx_view/v_ctx_view/k_concat/v_concat/k_pad/v_pad/k_perm_cont/v_perm_cont=%.3f/%.3f/%.3f/%.3f/%.3f/%.3f/%.3f/%.3f ms calls=%llu/%llu/%llu/%llu/%llu/%llu/%llu/%llu\n",
|
|
||||||
draft_kv_traffic_ms,
|
|
||||||
(double) graph_stats.decode_graph_compute_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_main_node_k_ctx_view_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_main_node_v_ctx_view_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_main_node_k_concat_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_main_node_v_concat_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_main_node_k_pad_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_main_node_v_pad_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_main_node_k_perm_cont_us / 1000.0,
|
|
||||||
(double) graph_stats.graph_main_node_v_perm_cont_us / 1000.0,
|
|
||||||
(unsigned long long) graph_stats.graph_main_node_k_ctx_view_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_main_node_v_ctx_view_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_main_node_k_concat_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_main_node_v_concat_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_main_node_k_pad_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_main_node_v_pad_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_main_node_k_perm_cont_calls,
|
|
||||||
(unsigned long long) graph_stats.graph_main_node_v_perm_cont_calls);
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG_INF("statistics dflash hot: kv(upload_f/upload_p/upload/compute/sync)=%.3f/%.3f/%.3f/%.3f/%.3f ms calls=%llu replay(accepted_prefix_append)=%.3f ms calls=%zu rows=%zu\n",
|
|
||||||
kv_upload_feature_ms,
|
|
||||||
kv_upload_pos_ms,
|
|
||||||
kv_upload_total_ms,
|
|
||||||
kv_compute_ms,
|
|
||||||
kv_sync_ms,
|
|
||||||
(unsigned long long) graph_stats.graph_kv_cache_calls,
|
|
||||||
replay_append_ms,
|
|
||||||
dflash_state->n_accept_append_calls,
|
|
||||||
dflash_state->n_accept_append_rows);
|
|
||||||
|
|
||||||
LOG_INF("statistics dflash stages: draft(decode/sample)=%.3f/%.3f ms warmup(collect/append)=%.3f/%.3f ms calls=%zu/%zu rows=%zu/%zu accept(total/output_copy/append)=%.3f/%.3f/%.3f ms calls=%zu/%zu/%zu rows=%zu/%zu/%zu\n",
|
|
||||||
(double) dflash_state->t_draft_decode_us / 1000.0,
|
|
||||||
(double) dflash_state->t_draft_sample_us / 1000.0,
|
|
||||||
(double) dflash_state->t_warmup_collect_us / 1000.0,
|
|
||||||
(double) dflash_state->t_warmup_append_us / 1000.0,
|
|
||||||
dflash_state->n_warmup_collect_calls,
|
|
||||||
dflash_state->n_warmup_append_calls,
|
|
||||||
dflash_state->n_warmup_collect_rows,
|
|
||||||
dflash_state->n_warmup_append_rows,
|
|
||||||
(double) dflash_state->t_accept_commit_us / 1000.0,
|
|
||||||
(double) dflash_state->t_accept_output_copy_us / 1000.0,
|
|
||||||
(double) dflash_state->t_accept_append_us / 1000.0,
|
|
||||||
dflash_state->n_accept_commit_calls,
|
|
||||||
dflash_state->n_accept_output_copy_calls,
|
|
||||||
dflash_state->n_accept_append_calls,
|
|
||||||
dflash_state->n_accept_commit_rows,
|
|
||||||
dflash_state->n_accept_output_copy_rows,
|
|
||||||
dflash_state->n_accept_append_rows);
|
|
||||||
|
|
||||||
if (dflash_state->n_accept_append_calls > 0) {
|
|
||||||
LOG_INF("statistics dflash replay: append(filter/window_alloc/replace/keep_old/new_rows/commit/log)=%.3f/%.3f/%.3f/%.3f/%.3f/%.3f/%.3f ms calls=%zu replace/slide=%zu/%zu\n",
|
|
||||||
(double) dflash_state->t_accept_append_filter_us / 1000.0,
|
|
||||||
(double) dflash_state->t_accept_append_window_alloc_us / 1000.0,
|
|
||||||
(double) dflash_state->t_accept_append_replace_us / 1000.0,
|
|
||||||
(double) dflash_state->t_accept_append_keep_old_us / 1000.0,
|
|
||||||
(double) dflash_state->t_accept_append_new_rows_us / 1000.0,
|
|
||||||
(double) dflash_state->t_accept_append_commit_detail_us / 1000.0,
|
|
||||||
(double) dflash_state->t_accept_append_log_us / 1000.0,
|
|
||||||
dflash_state->n_accept_append_calls,
|
|
||||||
dflash_state->n_accept_append_replace_calls,
|
|
||||||
dflash_state->n_accept_append_slide_calls);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (spec->tuner && spec->tuner->enabled && slot_tps > 0.0 && n_decoded > 0) {
|
if (spec->tuner && spec->tuner->enabled && slot_tps > 0.0 && n_decoded > 0) {
|
||||||
@ -3076,35 +2711,9 @@ int32_t common_speculative_on_target_batch(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
dflash_append_breakdown append_breakdown;
|
if (!dflash_append_target_features(*dflash_state, features, seq_id)) {
|
||||||
const int64_t t_append_us = ggml_time_us();
|
|
||||||
if (!dflash_append_target_features(*dflash_state, features, batch, seq_id, &append_breakdown)) {
|
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint64_t append_us = (uint64_t) (ggml_time_us() - t_append_us);
|
|
||||||
if (is_prompt_warmup) {
|
|
||||||
dflash_state->t_warmup_append_us += append_us;
|
|
||||||
dflash_state->n_warmup_append_calls++;
|
|
||||||
dflash_state->n_warmup_append_rows += (size_t) batch.n_tokens;
|
|
||||||
} else {
|
|
||||||
dflash_state->t_accept_append_us += append_us;
|
|
||||||
dflash_state->t_accept_append_filter_us += append_breakdown.filter_us;
|
|
||||||
dflash_state->t_accept_append_window_alloc_us += append_breakdown.window_alloc_us;
|
|
||||||
dflash_state->t_accept_append_replace_us += append_breakdown.replace_us;
|
|
||||||
dflash_state->t_accept_append_keep_old_us += append_breakdown.keep_old_us;
|
|
||||||
dflash_state->t_accept_append_new_rows_us += append_breakdown.new_rows_us;
|
|
||||||
dflash_state->t_accept_append_commit_detail_us += append_breakdown.commit_us;
|
|
||||||
dflash_state->t_accept_append_log_us += append_breakdown.log_us;
|
|
||||||
dflash_state->n_accept_append_calls++;
|
|
||||||
dflash_state->n_accept_append_rows += (size_t) batch.n_tokens;
|
|
||||||
if (append_breakdown.replace_call) {
|
|
||||||
dflash_state->n_accept_append_replace_calls++;
|
|
||||||
} else {
|
|
||||||
dflash_state->n_accept_append_slide_calls++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -4076,23 +4076,6 @@ void server_context::speculative_decoding_accept() {
|
|||||||
slot.sampled = ids.back(); // last accepted token
|
slot.sampled = ids.back(); // last accepted token
|
||||||
slot.n_past = slot.cache_tokens.n_tokens();
|
slot.n_past = slot.cache_tokens.n_tokens();
|
||||||
|
|
||||||
const common_speculative_type spec_type_used = common_speculative_current_type(slot.spec);
|
|
||||||
const bool any_rejected = (ids.size() - 1) < n_draft;
|
|
||||||
const common_speculative_checkpoint * ckpt = common_speculative_get_checkpoint(slot.spec);
|
|
||||||
const bool will_restore = any_rejected && ckpt != nullptr && ckpt->valid;
|
|
||||||
|
|
||||||
if (server_speculative_uses_target_features(slot.params.speculative) && !accepted_output_indices.empty()) {
|
|
||||||
llama_dflash_contract_log_accept(
|
|
||||||
slot.id,
|
|
||||||
spec_type_used == COMMON_SPECULATIVE_TYPE_DFLASH,
|
|
||||||
will_restore ? "restore" : "direct",
|
|
||||||
any_rejected,
|
|
||||||
n_draft,
|
|
||||||
ids.size(),
|
|
||||||
spec_pos_base,
|
|
||||||
accepted_output_indices);
|
|
||||||
}
|
|
||||||
|
|
||||||
common_speculative_commit(
|
common_speculative_commit(
|
||||||
slot.spec,
|
slot.spec,
|
||||||
ctx,
|
ctx,
|
||||||
|
|||||||
@ -7,23 +7,23 @@
|
|||||||
ggml_cgraph * llm_build_context::build_dflash_kv_workspace() {
|
ggml_cgraph * llm_build_context::build_dflash_kv_workspace() {
|
||||||
const int64_t n_embd_head_k = hparams.n_embd_head_k(0);
|
const int64_t n_embd_head_k = hparams.n_embd_head_k(0);
|
||||||
const int64_t n_embd_head_v = hparams.n_embd_head_v(0);
|
const int64_t n_embd_head_v = hparams.n_embd_head_v(0);
|
||||||
const int64_t ctx_len = lctx.dflash_visible_cross_ctx > 0
|
const int64_t ctx_len = lctx.dflash.visible_cross_ctx > 0
|
||||||
? (int64_t) lctx.dflash_visible_cross_ctx
|
? (int64_t) lctx.dflash.visible_cross_ctx
|
||||||
: std::max<int64_t>(1, (int64_t) cparams.n_ctx - (int64_t) hparams.dflash_block_size);
|
: std::max<int64_t>(1, (int64_t) cparams.n_ctx - (int64_t) hparams.dflash_block_size);
|
||||||
const int32_t cache_rows = std::clamp(lctx.dflash_kv_cache_view_n_filled, 0, (int32_t) ctx_len);
|
const int32_t cache_rows = std::clamp(lctx.dflash.kv.cache_view_n_filled, 0, (int32_t) ctx_len);
|
||||||
const int32_t cache_write_pos = ctx_len > 0
|
const int32_t cache_write_pos = ctx_len > 0
|
||||||
? ((lctx.dflash_kv_cache_view_write_pos % (int32_t) ctx_len) + (int32_t) ctx_len) % (int32_t) ctx_len
|
? ((lctx.dflash.kv.cache_view_write_pos % (int32_t) ctx_len) + (int32_t) ctx_len) % (int32_t) ctx_len
|
||||||
: 0;
|
: 0;
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head_k == n_embd_head_v);
|
GGML_ASSERT(n_embd_head_k == n_embd_head_v);
|
||||||
GGML_ASSERT(lctx.ensure_dflash_kv_cache_tensors((int32_t) ctx_len));
|
GGML_ASSERT(lctx.ensure_dflash_kv_cache_tensors((int32_t) ctx_len));
|
||||||
GGML_ASSERT((int32_t) lctx.dflash_k_ctx_workspace.size() == n_layer);
|
GGML_ASSERT((int32_t) lctx.dflash.kv.k_ctx_workspace.size() == n_layer);
|
||||||
GGML_ASSERT((int32_t) lctx.dflash_v_ctx_workspace.size() == n_layer);
|
GGML_ASSERT((int32_t) lctx.dflash.kv.v_ctx_workspace.size() == n_layer);
|
||||||
|
|
||||||
ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes((int) std::max<int64_t>(1, ctx_len)) + 16 * n_layer, false);
|
ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes((int) std::max<int64_t>(1, ctx_len)) + 16 * n_layer, false);
|
||||||
|
|
||||||
auto build_ordered_cache_view = [&](ggml_tensor * cache) -> ggml_tensor * {
|
auto build_ordered_cache_view = [&](ggml_tensor * cache) -> ggml_tensor * {
|
||||||
if (!lctx.dflash_kv_cache_view_valid || cache_rows <= 0) {
|
if (!lctx.dflash.kv.cache_view_valid || cache_rows <= 0) {
|
||||||
return cache;
|
return cache;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -67,11 +67,11 @@ ggml_cgraph * llm_build_context::build_dflash_kv_workspace() {
|
|||||||
};
|
};
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
GGML_ASSERT((size_t) il < lctx.dflash_k_ctx_cache.size());
|
GGML_ASSERT((size_t) il < lctx.dflash.kv.k_ctx_cache.size());
|
||||||
GGML_ASSERT((size_t) il < lctx.dflash_v_ctx_cache.size());
|
GGML_ASSERT((size_t) il < lctx.dflash.kv.v_ctx_cache.size());
|
||||||
|
|
||||||
ggml_tensor * Kordered = build_ordered_cache_view(lctx.dflash_k_ctx_cache[(size_t) il]);
|
ggml_tensor * Kordered = build_ordered_cache_view(lctx.dflash.kv.k_ctx_cache[(size_t) il]);
|
||||||
ggml_tensor * Vordered = build_ordered_cache_view(lctx.dflash_v_ctx_cache[(size_t) il]);
|
ggml_tensor * Vordered = build_ordered_cache_view(lctx.dflash.kv.v_ctx_cache[(size_t) il]);
|
||||||
cb(Kordered, "dflash_workspace_k_ctx_view", il);
|
cb(Kordered, "dflash_workspace_k_ctx_view", il);
|
||||||
cb(Vordered, "dflash_workspace_v_ctx_view", il);
|
cb(Vordered, "dflash_workspace_v_ctx_view", il);
|
||||||
|
|
||||||
@ -80,19 +80,19 @@ ggml_cgraph * llm_build_context::build_dflash_kv_workspace() {
|
|||||||
cb(Kworkspace, "dflash_workspace_k_perm_cont", il);
|
cb(Kworkspace, "dflash_workspace_k_perm_cont", il);
|
||||||
cb(Vworkspace, "dflash_workspace_v_perm_cont", il);
|
cb(Vworkspace, "dflash_workspace_v_perm_cont", il);
|
||||||
|
|
||||||
ggml_tensor * Kdst = ggml_view_3d(ctx0, lctx.dflash_k_ctx_workspace[(size_t) il],
|
ggml_tensor * Kdst = ggml_view_3d(ctx0, lctx.dflash.kv.k_ctx_workspace[(size_t) il],
|
||||||
lctx.dflash_k_ctx_workspace[(size_t) il]->ne[0],
|
lctx.dflash.kv.k_ctx_workspace[(size_t) il]->ne[0],
|
||||||
ctx_len,
|
ctx_len,
|
||||||
lctx.dflash_k_ctx_workspace[(size_t) il]->ne[2],
|
lctx.dflash.kv.k_ctx_workspace[(size_t) il]->ne[2],
|
||||||
lctx.dflash_k_ctx_workspace[(size_t) il]->nb[1],
|
lctx.dflash.kv.k_ctx_workspace[(size_t) il]->nb[1],
|
||||||
lctx.dflash_k_ctx_workspace[(size_t) il]->nb[2],
|
lctx.dflash.kv.k_ctx_workspace[(size_t) il]->nb[2],
|
||||||
0);
|
0);
|
||||||
ggml_tensor * Vdst = ggml_view_3d(ctx0, lctx.dflash_v_ctx_workspace[(size_t) il],
|
ggml_tensor * Vdst = ggml_view_3d(ctx0, lctx.dflash.kv.v_ctx_workspace[(size_t) il],
|
||||||
lctx.dflash_v_ctx_workspace[(size_t) il]->ne[0],
|
lctx.dflash.kv.v_ctx_workspace[(size_t) il]->ne[0],
|
||||||
ctx_len,
|
ctx_len,
|
||||||
lctx.dflash_v_ctx_workspace[(size_t) il]->ne[2],
|
lctx.dflash.kv.v_ctx_workspace[(size_t) il]->ne[2],
|
||||||
lctx.dflash_v_ctx_workspace[(size_t) il]->nb[1],
|
lctx.dflash.kv.v_ctx_workspace[(size_t) il]->nb[1],
|
||||||
lctx.dflash_v_ctx_workspace[(size_t) il]->nb[2],
|
lctx.dflash.kv.v_ctx_workspace[(size_t) il]->nb[2],
|
||||||
0);
|
0);
|
||||||
|
|
||||||
ggml_tensor * Kstore = ggml_cpy(ctx0, Kworkspace, Kdst);
|
ggml_tensor * Kstore = ggml_cpy(ctx0, Kworkspace, Kdst);
|
||||||
@ -110,11 +110,11 @@ ggml_cgraph * llm_build_context::build_dflash_kv_cache() {
|
|||||||
const int64_t n_embd_head_k = hparams.n_embd_head_k(0);
|
const int64_t n_embd_head_k = hparams.n_embd_head_k(0);
|
||||||
const int64_t n_embd_head_v = hparams.n_embd_head_v(0);
|
const int64_t n_embd_head_v = hparams.n_embd_head_v(0);
|
||||||
const int64_t n_target_features = hparams.dflash_n_target_features;
|
const int64_t n_target_features = hparams.dflash_n_target_features;
|
||||||
const int64_t ctx_len = lctx.dflash_visible_cross_ctx > 0
|
const int64_t ctx_len = lctx.dflash.visible_cross_ctx > 0
|
||||||
? (int64_t) lctx.dflash_visible_cross_ctx
|
? (int64_t) lctx.dflash.visible_cross_ctx
|
||||||
: std::max<int64_t>(1, (int64_t) cparams.n_ctx - (int64_t) hparams.dflash_block_size);
|
: std::max<int64_t>(1, (int64_t) cparams.n_ctx - (int64_t) hparams.dflash_block_size);
|
||||||
const int64_t update_rows = std::max<int64_t>(1, lctx.dflash_kv_cache_update_rows > 0 ? lctx.dflash_kv_cache_update_rows : ctx_len);
|
const int64_t update_rows = std::max<int64_t>(1, lctx.dflash.kv.cache_update_rows > 0 ? lctx.dflash.kv.cache_update_rows : ctx_len);
|
||||||
const int32_t write_pos = lctx.dflash_kv_cache_write_pos;
|
const int32_t write_pos = lctx.dflash.kv.cache_write_pos;
|
||||||
|
|
||||||
GGML_ASSERT(n_embd_head_k == n_embd_head_v);
|
GGML_ASSERT(n_embd_head_k == n_embd_head_v);
|
||||||
GGML_ASSERT(n_target_features > 0);
|
GGML_ASSERT(n_target_features > 0);
|
||||||
@ -124,21 +124,21 @@ ggml_cgraph * llm_build_context::build_dflash_kv_cache() {
|
|||||||
|
|
||||||
ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes((int) std::max<int64_t>(1, update_rows)) + 24 * n_layer, false);
|
ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes((int) std::max<int64_t>(1, update_rows)) + 24 * n_layer, false);
|
||||||
|
|
||||||
lctx.dflash_kv_input_target_features = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_target_features, update_rows);
|
lctx.dflash.kv.cache_input_target_features = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_target_features, update_rows);
|
||||||
ggml_set_input(lctx.dflash_kv_input_target_features);
|
ggml_set_input(lctx.dflash.kv.cache_input_target_features);
|
||||||
cb(lctx.dflash_kv_input_target_features, "dflash_kv_input_target_features", -1);
|
cb(lctx.dflash.kv.cache_input_target_features, "dflash_kv_input_target_features", -1);
|
||||||
|
|
||||||
lctx.dflash_kv_input_pos_ctx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, update_rows);
|
lctx.dflash.kv.cache_input_pos_ctx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, update_rows);
|
||||||
ggml_set_input(lctx.dflash_kv_input_pos_ctx);
|
ggml_set_input(lctx.dflash.kv.cache_input_pos_ctx);
|
||||||
cb(lctx.dflash_kv_input_pos_ctx, "dflash_kv_input_pos_ctx", -1);
|
cb(lctx.dflash.kv.cache_input_pos_ctx, "dflash_kv_input_pos_ctx", -1);
|
||||||
|
|
||||||
ggml_tensor * fused_target = llm_build_lora_mm(lctx, ctx0, model.dflash_fc, lctx.dflash_kv_input_target_features);
|
ggml_tensor * fused_target = llm_build_lora_mm(lctx, ctx0, model.dflash_fc, lctx.dflash.kv.cache_input_target_features);
|
||||||
fused_target = llm_build_norm(ctx0, fused_target, hparams, model.dflash_hidden_norm, nullptr, LLM_NORM_RMS, cb, -1);
|
fused_target = llm_build_norm(ctx0, fused_target, hparams, model.dflash_hidden_norm, nullptr, LLM_NORM_RMS, cb, -1);
|
||||||
cb(fused_target, "dflash_kv_fused_target", -1);
|
cb(fused_target, "dflash_kv_fused_target", -1);
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
GGML_ASSERT((size_t) il < lctx.dflash_k_ctx_cache.size());
|
GGML_ASSERT((size_t) il < lctx.dflash.kv.k_ctx_cache.size());
|
||||||
GGML_ASSERT((size_t) il < lctx.dflash_v_ctx_cache.size());
|
GGML_ASSERT((size_t) il < lctx.dflash.kv.v_ctx_cache.size());
|
||||||
|
|
||||||
ggml_tensor * Kcur_ctx_proj = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, fused_target);
|
ggml_tensor * Kcur_ctx_proj = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, fused_target);
|
||||||
cb(Kcur_ctx_proj, "dflash_kv_k_proj", il);
|
cb(Kcur_ctx_proj, "dflash_kv_k_proj", il);
|
||||||
@ -146,7 +146,7 @@ ggml_cgraph * llm_build_context::build_dflash_kv_cache() {
|
|||||||
ggml_tensor * Kcur_ctx = ggml_reshape_3d(ctx0, Kcur_ctx_proj, n_embd_head_k, n_head_kv, update_rows);
|
ggml_tensor * Kcur_ctx = ggml_reshape_3d(ctx0, Kcur_ctx_proj, n_embd_head_k, n_head_kv, update_rows);
|
||||||
Kcur_ctx = llm_build_norm(ctx0, Kcur_ctx, hparams, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, cb, il);
|
Kcur_ctx = llm_build_norm(ctx0, Kcur_ctx, hparams, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, cb, il);
|
||||||
cb(Kcur_ctx, "dflash_kv_k_norm", il);
|
cb(Kcur_ctx, "dflash_kv_k_norm", il);
|
||||||
Kcur_ctx = ggml_rope_ext(ctx0, Kcur_ctx, lctx.dflash_kv_input_pos_ctx, nullptr,
|
Kcur_ctx = ggml_rope_ext(ctx0, Kcur_ctx, lctx.dflash.kv.cache_input_pos_ctx, nullptr,
|
||||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||||
ext_factor, attn_factor, beta_fast, beta_slow);
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
||||||
cb(Kcur_ctx, "dflash_kv_k_rope", il);
|
cb(Kcur_ctx, "dflash_kv_k_rope", il);
|
||||||
@ -177,20 +177,20 @@ ggml_cgraph * llm_build_context::build_dflash_kv_cache() {
|
|||||||
Vcur_ctx->nb[1],
|
Vcur_ctx->nb[1],
|
||||||
Vcur_ctx->nb[2],
|
Vcur_ctx->nb[2],
|
||||||
0);
|
0);
|
||||||
ggml_tensor * Kdst_first = ggml_view_3d(ctx0, lctx.dflash_k_ctx_cache[(size_t) il],
|
ggml_tensor * Kdst_first = ggml_view_3d(ctx0, lctx.dflash.kv.k_ctx_cache[(size_t) il],
|
||||||
lctx.dflash_k_ctx_cache[(size_t) il]->ne[0],
|
lctx.dflash.kv.k_ctx_cache[(size_t) il]->ne[0],
|
||||||
lctx.dflash_k_ctx_cache[(size_t) il]->ne[1],
|
lctx.dflash.kv.k_ctx_cache[(size_t) il]->ne[1],
|
||||||
first_rows,
|
first_rows,
|
||||||
lctx.dflash_k_ctx_cache[(size_t) il]->nb[1],
|
lctx.dflash.kv.k_ctx_cache[(size_t) il]->nb[1],
|
||||||
lctx.dflash_k_ctx_cache[(size_t) il]->nb[2],
|
lctx.dflash.kv.k_ctx_cache[(size_t) il]->nb[2],
|
||||||
(size_t) write_pos * lctx.dflash_k_ctx_cache[(size_t) il]->nb[2]);
|
(size_t) write_pos * lctx.dflash.kv.k_ctx_cache[(size_t) il]->nb[2]);
|
||||||
ggml_tensor * Vdst_first = ggml_view_3d(ctx0, lctx.dflash_v_ctx_cache[(size_t) il],
|
ggml_tensor * Vdst_first = ggml_view_3d(ctx0, lctx.dflash.kv.v_ctx_cache[(size_t) il],
|
||||||
lctx.dflash_v_ctx_cache[(size_t) il]->ne[0],
|
lctx.dflash.kv.v_ctx_cache[(size_t) il]->ne[0],
|
||||||
lctx.dflash_v_ctx_cache[(size_t) il]->ne[1],
|
lctx.dflash.kv.v_ctx_cache[(size_t) il]->ne[1],
|
||||||
first_rows,
|
first_rows,
|
||||||
lctx.dflash_v_ctx_cache[(size_t) il]->nb[1],
|
lctx.dflash.kv.v_ctx_cache[(size_t) il]->nb[1],
|
||||||
lctx.dflash_v_ctx_cache[(size_t) il]->nb[2],
|
lctx.dflash.kv.v_ctx_cache[(size_t) il]->nb[2],
|
||||||
(size_t) write_pos * lctx.dflash_v_ctx_cache[(size_t) il]->nb[2]);
|
(size_t) write_pos * lctx.dflash.kv.v_ctx_cache[(size_t) il]->nb[2]);
|
||||||
|
|
||||||
ggml_tensor * Kstore_first = ggml_cpy(ctx0, Ksrc_first, Kdst_first);
|
ggml_tensor * Kstore_first = ggml_cpy(ctx0, Ksrc_first, Kdst_first);
|
||||||
cb(Kstore_first, "dflash_kv_k_store", il);
|
cb(Kstore_first, "dflash_kv_k_store", il);
|
||||||
@ -216,19 +216,19 @@ ggml_cgraph * llm_build_context::build_dflash_kv_cache() {
|
|||||||
Vcur_ctx->nb[1],
|
Vcur_ctx->nb[1],
|
||||||
Vcur_ctx->nb[2],
|
Vcur_ctx->nb[2],
|
||||||
(size_t) first_rows * Vcur_ctx->nb[2]);
|
(size_t) first_rows * Vcur_ctx->nb[2]);
|
||||||
ggml_tensor * Kdst_second = ggml_view_3d(ctx0, lctx.dflash_k_ctx_cache[(size_t) il],
|
ggml_tensor * Kdst_second = ggml_view_3d(ctx0, lctx.dflash.kv.k_ctx_cache[(size_t) il],
|
||||||
lctx.dflash_k_ctx_cache[(size_t) il]->ne[0],
|
lctx.dflash.kv.k_ctx_cache[(size_t) il]->ne[0],
|
||||||
lctx.dflash_k_ctx_cache[(size_t) il]->ne[1],
|
lctx.dflash.kv.k_ctx_cache[(size_t) il]->ne[1],
|
||||||
second_rows,
|
second_rows,
|
||||||
lctx.dflash_k_ctx_cache[(size_t) il]->nb[1],
|
lctx.dflash.kv.k_ctx_cache[(size_t) il]->nb[1],
|
||||||
lctx.dflash_k_ctx_cache[(size_t) il]->nb[2],
|
lctx.dflash.kv.k_ctx_cache[(size_t) il]->nb[2],
|
||||||
0);
|
0);
|
||||||
ggml_tensor * Vdst_second = ggml_view_3d(ctx0, lctx.dflash_v_ctx_cache[(size_t) il],
|
ggml_tensor * Vdst_second = ggml_view_3d(ctx0, lctx.dflash.kv.v_ctx_cache[(size_t) il],
|
||||||
lctx.dflash_v_ctx_cache[(size_t) il]->ne[0],
|
lctx.dflash.kv.v_ctx_cache[(size_t) il]->ne[0],
|
||||||
lctx.dflash_v_ctx_cache[(size_t) il]->ne[1],
|
lctx.dflash.kv.v_ctx_cache[(size_t) il]->ne[1],
|
||||||
second_rows,
|
second_rows,
|
||||||
lctx.dflash_v_ctx_cache[(size_t) il]->nb[1],
|
lctx.dflash.kv.v_ctx_cache[(size_t) il]->nb[1],
|
||||||
lctx.dflash_v_ctx_cache[(size_t) il]->nb[2],
|
lctx.dflash.kv.v_ctx_cache[(size_t) il]->nb[2],
|
||||||
0);
|
0);
|
||||||
|
|
||||||
ggml_tensor * Kstore_second = ggml_cpy(ctx0, Ksrc_second, Kdst_second);
|
ggml_tensor * Kstore_second = ggml_cpy(ctx0, Ksrc_second, Kdst_second);
|
||||||
@ -248,12 +248,11 @@ ggml_cgraph * llm_build_context::build_dflash() {
|
|||||||
const int64_t n_embd_head_k = hparams.n_embd_head_k(0);
|
const int64_t n_embd_head_k = hparams.n_embd_head_k(0);
|
||||||
const int64_t n_embd_head_v = hparams.n_embd_head_v(0);
|
const int64_t n_embd_head_v = hparams.n_embd_head_v(0);
|
||||||
const int64_t n_target_features = hparams.dflash_n_target_features;
|
const int64_t n_target_features = hparams.dflash_n_target_features;
|
||||||
auto & profile = lctx.dflash_profile;
|
const int64_t ctx_len = lctx.dflash.visible_cross_ctx > 0
|
||||||
const int64_t ctx_len = lctx.dflash_visible_cross_ctx > 0
|
? (int64_t) lctx.dflash.visible_cross_ctx
|
||||||
? (int64_t) lctx.dflash_visible_cross_ctx
|
|
||||||
: std::max<int64_t>(1, (int64_t) cparams.n_ctx - (int64_t) hparams.dflash_block_size);
|
: std::max<int64_t>(1, (int64_t) cparams.n_ctx - (int64_t) hparams.dflash_block_size);
|
||||||
const int32_t cache_write_pos = ctx_len > 0
|
const int32_t cache_write_pos = ctx_len > 0
|
||||||
? ((lctx.dflash_kv_cache_view_write_pos % (int32_t) ctx_len) + (int32_t) ctx_len) % (int32_t) ctx_len
|
? ((lctx.dflash.kv.cache_view_write_pos % (int32_t) ctx_len) + (int32_t) ctx_len) % (int32_t) ctx_len
|
||||||
: 0;
|
: 0;
|
||||||
const int64_t n_kv_total = GGML_PAD(ctx_len + n_tokens, flash_attn ? 256 : 32);
|
const int64_t n_kv_total = GGML_PAD(ctx_len + n_tokens, flash_attn ? 256 : 32);
|
||||||
const int64_t n_kv_pad = n_kv_total - (ctx_len + n_tokens);
|
const int64_t n_kv_pad = n_kv_total - (ctx_len + n_tokens);
|
||||||
@ -273,21 +272,21 @@ ggml_cgraph * llm_build_context::build_dflash() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
lctx.inp_dflash_kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv_total, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
|
lctx.dflash.inputs.kq_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv_total, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
|
||||||
lctx.dflash_kq_mask_tensor = lctx.inp_dflash_kq_mask;
|
lctx.dflash.kv.kq_mask_tensor = lctx.dflash.inputs.kq_mask;
|
||||||
ggml_set_input(lctx.inp_dflash_kq_mask);
|
ggml_set_input(lctx.dflash.inputs.kq_mask);
|
||||||
cb(lctx.inp_dflash_kq_mask, "dflash_kq_mask", -1);
|
cb(lctx.dflash.inputs.kq_mask, "dflash_kq_mask", -1);
|
||||||
|
|
||||||
ggml_tensor * dflash_kq_mask_full = flash_attn ? ggml_cast(ctx0, lctx.inp_dflash_kq_mask, GGML_TYPE_F16) : lctx.inp_dflash_kq_mask;
|
ggml_tensor * dflash_kq_mask_full = flash_attn ? ggml_cast(ctx0, lctx.dflash.inputs.kq_mask, GGML_TYPE_F16) : lctx.dflash.inputs.kq_mask;
|
||||||
ggml_tensor * dflash_kq_mask_swa = nullptr;
|
ggml_tensor * dflash_kq_mask_swa = nullptr;
|
||||||
lctx.inp_dflash_kq_mask_swa = nullptr;
|
lctx.dflash.inputs.kq_mask_swa = nullptr;
|
||||||
lctx.dflash_kq_mask_swa_tensor = nullptr;
|
lctx.dflash.kv.kq_mask_swa_tensor = nullptr;
|
||||||
if (have_swa_layers && hparams.n_swa > 0) {
|
if (have_swa_layers && hparams.n_swa > 0) {
|
||||||
lctx.inp_dflash_kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv_total, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
|
lctx.dflash.inputs.kq_mask_swa = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv_total, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD));
|
||||||
lctx.dflash_kq_mask_swa_tensor = lctx.inp_dflash_kq_mask_swa;
|
lctx.dflash.kv.kq_mask_swa_tensor = lctx.dflash.inputs.kq_mask_swa;
|
||||||
ggml_set_input(lctx.inp_dflash_kq_mask_swa);
|
ggml_set_input(lctx.dflash.inputs.kq_mask_swa);
|
||||||
cb(lctx.inp_dflash_kq_mask_swa, "dflash_kq_mask_swa", -1);
|
cb(lctx.dflash.inputs.kq_mask_swa, "dflash_kq_mask_swa", -1);
|
||||||
dflash_kq_mask_swa = flash_attn ? ggml_cast(ctx0, lctx.inp_dflash_kq_mask_swa, GGML_TYPE_F16) : lctx.inp_dflash_kq_mask_swa;
|
dflash_kq_mask_swa = flash_attn ? ggml_cast(ctx0, lctx.dflash.inputs.kq_mask_swa, GGML_TYPE_F16) : lctx.dflash.inputs.kq_mask_swa;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor * tok_embd = model.tok_embd;
|
ggml_tensor * tok_embd = model.tok_embd;
|
||||||
@ -328,25 +327,24 @@ ggml_cgraph * llm_build_context::build_dflash() {
|
|||||||
Vcur_noise = ggml_reshape_3d(ctx0, Vcur_noise, n_embd_head_v, n_head_kv, n_tokens);
|
Vcur_noise = ggml_reshape_3d(ctx0, Vcur_noise, n_embd_head_v, n_head_kv, n_tokens);
|
||||||
cb(Vcur_noise, "Vcur_noise", il);
|
cb(Vcur_noise, "Vcur_noise", il);
|
||||||
|
|
||||||
const int64_t t_cache_read_us = ggml_time_us();
|
GGML_ASSERT((size_t) il < lctx.dflash.kv.k_ctx_workspace.size());
|
||||||
GGML_ASSERT((size_t) il < lctx.dflash_k_ctx_workspace.size());
|
GGML_ASSERT((size_t) il < lctx.dflash.kv.v_ctx_workspace.size());
|
||||||
GGML_ASSERT((size_t) il < lctx.dflash_v_ctx_workspace.size());
|
GGML_ASSERT(lctx.dflash.kv.k_ctx_workspace[(size_t) il] != nullptr);
|
||||||
GGML_ASSERT(lctx.dflash_k_ctx_workspace[(size_t) il] != nullptr);
|
GGML_ASSERT(lctx.dflash.kv.v_ctx_workspace[(size_t) il] != nullptr);
|
||||||
GGML_ASSERT(lctx.dflash_v_ctx_workspace[(size_t) il] != nullptr);
|
|
||||||
|
|
||||||
ggml_tensor * Kcur_ctx = ggml_view_3d(ctx0, lctx.dflash_k_ctx_workspace[(size_t) il],
|
ggml_tensor * Kcur_ctx = ggml_view_3d(ctx0, lctx.dflash.kv.k_ctx_workspace[(size_t) il],
|
||||||
lctx.dflash_k_ctx_workspace[(size_t) il]->ne[0],
|
lctx.dflash.kv.k_ctx_workspace[(size_t) il]->ne[0],
|
||||||
ctx_len,
|
ctx_len,
|
||||||
lctx.dflash_k_ctx_workspace[(size_t) il]->ne[2],
|
lctx.dflash.kv.k_ctx_workspace[(size_t) il]->ne[2],
|
||||||
lctx.dflash_k_ctx_workspace[(size_t) il]->nb[1],
|
lctx.dflash.kv.k_ctx_workspace[(size_t) il]->nb[1],
|
||||||
lctx.dflash_k_ctx_workspace[(size_t) il]->nb[2],
|
lctx.dflash.kv.k_ctx_workspace[(size_t) il]->nb[2],
|
||||||
0);
|
0);
|
||||||
ggml_tensor * Vcur_ctx = ggml_view_3d(ctx0, lctx.dflash_v_ctx_workspace[(size_t) il],
|
ggml_tensor * Vcur_ctx = ggml_view_3d(ctx0, lctx.dflash.kv.v_ctx_workspace[(size_t) il],
|
||||||
lctx.dflash_v_ctx_workspace[(size_t) il]->ne[0],
|
lctx.dflash.kv.v_ctx_workspace[(size_t) il]->ne[0],
|
||||||
ctx_len,
|
ctx_len,
|
||||||
lctx.dflash_v_ctx_workspace[(size_t) il]->ne[2],
|
lctx.dflash.kv.v_ctx_workspace[(size_t) il]->ne[2],
|
||||||
lctx.dflash_v_ctx_workspace[(size_t) il]->nb[1],
|
lctx.dflash.kv.v_ctx_workspace[(size_t) il]->nb[1],
|
||||||
lctx.dflash_v_ctx_workspace[(size_t) il]->nb[2],
|
lctx.dflash.kv.v_ctx_workspace[(size_t) il]->nb[2],
|
||||||
0);
|
0);
|
||||||
cb(Kcur_ctx, "Kcur_ctx_workspace", il);
|
cb(Kcur_ctx, "Kcur_ctx_workspace", il);
|
||||||
cb(Vcur_ctx, "Vcur_ctx_workspace", il);
|
cb(Vcur_ctx, "Vcur_ctx_workspace", il);
|
||||||
@ -368,9 +366,6 @@ ggml_cgraph * llm_build_context::build_dflash() {
|
|||||||
cb(Vcur, "dflash_main_v_pad", il);
|
cb(Vcur, "dflash_main_v_pad", il);
|
||||||
}
|
}
|
||||||
|
|
||||||
profile.graph_kv_cache_read_concat_pad_us += (uint64_t) (ggml_time_us() - t_cache_read_us);
|
|
||||||
profile.graph_kv_cache_read_concat_pad_calls++;
|
|
||||||
profile.graph_kv_cache_cached_bytes += ggml_nbytes(lctx.dflash_k_ctx_cache[(size_t) il]) + ggml_nbytes(lctx.dflash_v_ctx_cache[(size_t) il]);
|
|
||||||
cb(Qcur, "Qcur", il);
|
cb(Qcur, "Qcur", il);
|
||||||
|
|
||||||
ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
|
||||||
@ -434,11 +429,11 @@ ggml_cgraph * llm_build_context::build_dflash() {
|
|||||||
cb(result, "result_output", -1);
|
cb(result, "result_output", -1);
|
||||||
ggml_build_forward_expand(gf, result);
|
ggml_build_forward_expand(gf, result);
|
||||||
|
|
||||||
lctx.dflash_draft_tokens_tensor = nullptr;
|
lctx.dflash.draft_tokens_tensor = nullptr;
|
||||||
ggml_tensor * draft_tokens = ggml_argmax(ctx0, result);
|
ggml_tensor * draft_tokens = ggml_argmax(ctx0, result);
|
||||||
ggml_set_name(draft_tokens, "draft_argmax");
|
ggml_set_name(draft_tokens, "draft_argmax");
|
||||||
ggml_build_forward_expand(gf, draft_tokens);
|
ggml_build_forward_expand(gf, draft_tokens);
|
||||||
lctx.dflash_draft_tokens_tensor = draft_tokens;
|
lctx.dflash.draft_tokens_tensor = draft_tokens;
|
||||||
|
|
||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -365,82 +365,15 @@ struct llama_context {
|
|||||||
std::vector<float> feature_view_buffer;
|
std::vector<float> feature_view_buffer;
|
||||||
input_state inputs;
|
input_state inputs;
|
||||||
int32_t visible_cross_ctx = 0;
|
int32_t visible_cross_ctx = 0;
|
||||||
llama_dflash_profile_stats profile;
|
|
||||||
|
// Argmax token IDs from the DFlash draft graph, computed via GPU argmax.
|
||||||
|
// Populated in llama_decode_internal after graph compute.
|
||||||
|
std::vector<llama_token> draft_tokens;
|
||||||
|
struct ggml_tensor * draft_tokens_tensor = nullptr;
|
||||||
};
|
};
|
||||||
dflash_runtime dflash;
|
dflash_runtime dflash;
|
||||||
using dflash_capture_state = dflash_runtime::capture_state;
|
using dflash_capture_state = dflash_runtime::capture_state;
|
||||||
|
|
||||||
const float * & dflash_target_features = dflash.target.features;
|
|
||||||
size_t & dflash_target_features_n_floats = dflash.target.features_n_floats;
|
|
||||||
int32_t & dflash_target_features_n_rows = dflash.target.features_n_rows;
|
|
||||||
const float * & dflash_target_append_features = dflash.target.append_features;
|
|
||||||
size_t & dflash_target_append_features_n_floats = dflash.target.append_features_n_floats;
|
|
||||||
int32_t & dflash_target_append_features_n_rows = dflash.target.append_features_n_rows;
|
|
||||||
const llama_pos * & dflash_target_positions = dflash.target.positions;
|
|
||||||
size_t & dflash_target_positions_n = dflash.target.positions_n;
|
|
||||||
uint64_t & dflash_target_window_version = dflash.target.version;
|
|
||||||
int32_t & dflash_target_window_keep_rows = dflash.target.keep_rows;
|
|
||||||
int32_t & dflash_target_window_append_rows = dflash.target.append_rows;
|
|
||||||
bool & dflash_target_window_replace = dflash.target.replace;
|
|
||||||
std::vector<float> & dflash_target_features_owned = dflash.target.features_owned;
|
|
||||||
std::vector<float> & dflash_target_append_features_owned = dflash.target.append_features_owned;
|
|
||||||
std::vector<llama_pos> & dflash_target_positions_owned = dflash.target.positions_owned;
|
|
||||||
std::vector<float> & dflash_target_features_padded = dflash.target.features_padded;
|
|
||||||
std::vector<float> & dflash_feature_view_buffer = dflash.feature_view_buffer;
|
|
||||||
std::vector<llama_pos> & dflash_pos_ctx_data = dflash.target.pos_ctx_data;
|
|
||||||
std::vector<float> & dflash_kq_mask_data = dflash.target.kq_mask_data;
|
|
||||||
std::vector<float> & dflash_kq_mask_swa_data = dflash.target.kq_mask_swa_data;
|
|
||||||
int32_t & dflash_visible_cross_ctx = dflash.visible_cross_ctx;
|
|
||||||
std::vector<struct ggml_tensor *> & dflash_k_ctx_cache = dflash.kv.k_ctx_cache;
|
|
||||||
std::vector<struct ggml_tensor *> & dflash_v_ctx_cache = dflash.kv.v_ctx_cache;
|
|
||||||
|
|
||||||
// Argmax token IDs from the DFlash draft graph, computed via GPU argmax.
|
|
||||||
// Populated in llama_decode_internal after graph compute.
|
|
||||||
std::vector<llama_token> dflash_draft_tokens;
|
|
||||||
struct ggml_tensor * dflash_draft_tokens_tensor = nullptr;
|
|
||||||
|
|
||||||
std::vector<struct ggml_tensor *> & dflash_k_ctx_workspace = dflash.kv.k_ctx_workspace;
|
|
||||||
std::vector<struct ggml_tensor *> & dflash_v_ctx_workspace = dflash.kv.v_ctx_workspace;
|
|
||||||
struct ggml_context * & dflash_cache_ctx = dflash.kv.cache_ctx;
|
|
||||||
std::vector<ggml_backend_buffer_t> & dflash_cache_bufs = dflash.kv.cache_bufs;
|
|
||||||
int32_t & dflash_kv_cache_write_pos = dflash.kv.cache_write_pos;
|
|
||||||
int32_t & dflash_kv_cache_n_filled = dflash.kv.cache_n_filled;
|
|
||||||
int32_t & dflash_kv_cache_update_rows = dflash.kv.cache_update_rows;
|
|
||||||
int32_t & dflash_kv_cache_reserved_rows = dflash.kv.cache_reserved_rows;
|
|
||||||
int32_t & dflash_kv_cache_view_write_pos = dflash.kv.cache_view_write_pos;
|
|
||||||
int32_t & dflash_kv_cache_view_n_filled = dflash.kv.cache_view_n_filled;
|
|
||||||
uint64_t & dflash_kv_cache_applied_window_version = dflash.kv.cache_applied_window_version;
|
|
||||||
bool & dflash_kv_cache_valid = dflash.kv.cache_valid;
|
|
||||||
bool & dflash_kv_cache_view_valid = dflash.kv.cache_view_valid;
|
|
||||||
int32_t & dflash_kv_workspace_write_pos = dflash.kv.workspace_write_pos;
|
|
||||||
int32_t & dflash_kv_workspace_n_filled = dflash.kv.workspace_n_filled;
|
|
||||||
int32_t & dflash_kv_workspace_reserved_rows = dflash.kv.workspace_reserved_rows;
|
|
||||||
int32_t & dflash_kv_workspace_token_capacity = dflash.kv.workspace_token_capacity;
|
|
||||||
int32_t & dflash_kv_workspace_n_kv_total = dflash.kv.workspace_n_kv_total;
|
|
||||||
uint64_t & dflash_kv_workspace_applied_window_version = dflash.kv.workspace_applied_window_version;
|
|
||||||
bool & dflash_kv_workspace_valid = dflash.kv.workspace_valid;
|
|
||||||
bool & dflash_kv_workspace_sync_pending = dflash.kv.workspace_sync_pending;
|
|
||||||
std::vector<uint8_t> & dflash_buf_compute_meta = dflash.kv.cache_compute_meta;
|
|
||||||
std::vector<uint8_t> & dflash_workspace_buf_compute_meta = dflash.kv.workspace_compute_meta;
|
|
||||||
ggml_backend_sched_t & dflash_sched = dflash.kv.cache_sched;
|
|
||||||
ggml_backend_sched_t & dflash_workspace_sched = dflash.kv.workspace_sched;
|
|
||||||
ggml_cgraph * & dflash_kv_graph = dflash.kv.cache_graph;
|
|
||||||
ggml_cgraph * & dflash_kv_workspace_graph = dflash.kv.workspace_graph;
|
|
||||||
int32_t & dflash_kv_graph_rows = dflash.kv.cache_graph_rows;
|
|
||||||
int32_t & dflash_kv_graph_write_pos = dflash.kv.cache_graph_write_pos;
|
|
||||||
int32_t & dflash_kv_workspace_graph_rows = dflash.kv.workspace_graph_rows;
|
|
||||||
int32_t & dflash_kv_workspace_graph_write_pos = dflash.kv.workspace_graph_write_pos;
|
|
||||||
struct ggml_tensor * & dflash_kv_input_target_features = dflash.kv.cache_input_target_features;
|
|
||||||
struct ggml_tensor * & dflash_kv_input_pos_ctx = dflash.kv.cache_input_pos_ctx;
|
|
||||||
struct ggml_tensor * & dflash_kq_mask_tensor = dflash.kv.kq_mask_tensor;
|
|
||||||
struct ggml_tensor * & dflash_kq_mask_swa_tensor = dflash.kv.kq_mask_swa_tensor;
|
|
||||||
std::unique_ptr<dflash_capture_state> & dflash_capture = dflash.capture;
|
|
||||||
llama_dflash_profile_stats & dflash_profile = dflash.profile;
|
|
||||||
struct ggml_tensor * & inp_dflash_target_features = dflash.inputs.target_features;
|
|
||||||
struct ggml_tensor * & inp_dflash_pos_ctx = dflash.inputs.pos_ctx;
|
|
||||||
struct ggml_tensor * & inp_dflash_kq_mask = dflash.inputs.kq_mask;
|
|
||||||
struct ggml_tensor * & inp_dflash_kq_mask_swa = dflash.inputs.kq_mask_swa;
|
|
||||||
|
|
||||||
// input tensors
|
// input tensors
|
||||||
struct ggml_tensor * inp_tokens; // I32 [n_batch]
|
struct ggml_tensor * inp_tokens; // I32 [n_batch]
|
||||||
struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
|
struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
|
||||||
|
|||||||
@ -1,340 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <cstdint>
|
|
||||||
#include <cstring>
|
|
||||||
|
|
||||||
enum llama_dflash_kv_node_kind {
|
|
||||||
LLAMA_DFLASH_KV_NODE_NONE = 0,
|
|
||||||
LLAMA_DFLASH_KV_NODE_FUSED_TARGET,
|
|
||||||
LLAMA_DFLASH_KV_NODE_K_PROJ,
|
|
||||||
LLAMA_DFLASH_KV_NODE_K_NORM,
|
|
||||||
LLAMA_DFLASH_KV_NODE_K_ROPE,
|
|
||||||
LLAMA_DFLASH_KV_NODE_V_PROJ,
|
|
||||||
LLAMA_DFLASH_KV_NODE_K_STORE,
|
|
||||||
LLAMA_DFLASH_KV_NODE_V_STORE,
|
|
||||||
};
|
|
||||||
|
|
||||||
enum llama_dflash_main_node_kind {
|
|
||||||
LLAMA_DFLASH_MAIN_NODE_NONE = 0,
|
|
||||||
LLAMA_DFLASH_MAIN_NODE_QCUR,
|
|
||||||
LLAMA_DFLASH_MAIN_NODE_K_DRAFT,
|
|
||||||
LLAMA_DFLASH_MAIN_NODE_V_DRAFT,
|
|
||||||
LLAMA_DFLASH_MAIN_NODE_K_CTX_VIEW,
|
|
||||||
LLAMA_DFLASH_MAIN_NODE_V_CTX_VIEW,
|
|
||||||
LLAMA_DFLASH_MAIN_NODE_K_CONCAT,
|
|
||||||
LLAMA_DFLASH_MAIN_NODE_V_CONCAT,
|
|
||||||
LLAMA_DFLASH_MAIN_NODE_K_PAD,
|
|
||||||
LLAMA_DFLASH_MAIN_NODE_V_PAD,
|
|
||||||
LLAMA_DFLASH_MAIN_NODE_K_PERM_CONT,
|
|
||||||
LLAMA_DFLASH_MAIN_NODE_V_PERM_CONT,
|
|
||||||
LLAMA_DFLASH_MAIN_NODE_FLASH_ATTN,
|
|
||||||
LLAMA_DFLASH_MAIN_NODE_ATTN_OUT,
|
|
||||||
LLAMA_DFLASH_MAIN_NODE_FFN,
|
|
||||||
LLAMA_DFLASH_MAIN_NODE_RESULT_ROWS,
|
|
||||||
LLAMA_DFLASH_MAIN_NODE_RESULT_NORM,
|
|
||||||
LLAMA_DFLASH_MAIN_NODE_RESULT,
|
|
||||||
};
|
|
||||||
|
|
||||||
struct llama_dflash_kv_node_profiler {
|
|
||||||
llama_dflash_profile_stats * profile = nullptr;
|
|
||||||
int64_t t_start_us = 0;
|
|
||||||
llama_dflash_kv_node_kind active_kind = LLAMA_DFLASH_KV_NODE_NONE;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct llama_dflash_main_node_profiler {
|
|
||||||
llama_dflash_profile_stats * profile = nullptr;
|
|
||||||
ggml_backend_sched_eval_callback prev_callback = nullptr;
|
|
||||||
void * prev_user_data = nullptr;
|
|
||||||
bool prev_active = false;
|
|
||||||
int64_t t_start_us = 0;
|
|
||||||
llama_dflash_main_node_kind active_kind = LLAMA_DFLASH_MAIN_NODE_NONE;
|
|
||||||
};
|
|
||||||
|
|
||||||
static inline bool llama_dflash_tensor_name_has_prefix(const struct ggml_tensor * tensor, const char * prefix) {
|
|
||||||
if (tensor == nullptr || prefix == nullptr || prefix[0] == '\0') {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return std::strncmp(tensor->name, prefix, std::strlen(prefix)) == 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline bool llama_dflash_tensor_name_matches_label(const struct ggml_tensor * tensor, const char * label) {
|
|
||||||
if (!llama_dflash_tensor_name_has_prefix(tensor, label)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
const size_t label_len = std::strlen(label);
|
|
||||||
const char next = tensor->name[label_len];
|
|
||||||
return next == '\0' || next == '-';
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline llama_dflash_kv_node_kind llama_dflash_kv_node_kind_from_tensor(const struct ggml_tensor * tensor) {
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "dflash_kv_fused_target")) {
|
|
||||||
return LLAMA_DFLASH_KV_NODE_FUSED_TARGET;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "dflash_kv_k_proj")) {
|
|
||||||
return LLAMA_DFLASH_KV_NODE_K_PROJ;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "dflash_kv_k_norm")) {
|
|
||||||
return LLAMA_DFLASH_KV_NODE_K_NORM;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "dflash_kv_k_rope")) {
|
|
||||||
return LLAMA_DFLASH_KV_NODE_K_ROPE;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "dflash_kv_v_proj")) {
|
|
||||||
return LLAMA_DFLASH_KV_NODE_V_PROJ;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "dflash_kv_k_store")) {
|
|
||||||
return LLAMA_DFLASH_KV_NODE_K_STORE;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "dflash_kv_v_store")) {
|
|
||||||
return LLAMA_DFLASH_KV_NODE_V_STORE;
|
|
||||||
}
|
|
||||||
|
|
||||||
return LLAMA_DFLASH_KV_NODE_NONE;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline void llama_dflash_kv_node_profile_add(
|
|
||||||
llama_dflash_profile_stats & profile,
|
|
||||||
llama_dflash_kv_node_kind kind,
|
|
||||||
uint64_t elapsed_us) {
|
|
||||||
switch (kind) {
|
|
||||||
case LLAMA_DFLASH_KV_NODE_FUSED_TARGET:
|
|
||||||
profile.graph_kv_node_fused_target_calls++;
|
|
||||||
profile.graph_kv_node_fused_target_us += elapsed_us;
|
|
||||||
break;
|
|
||||||
case LLAMA_DFLASH_KV_NODE_K_PROJ:
|
|
||||||
profile.graph_kv_node_k_proj_calls++;
|
|
||||||
profile.graph_kv_node_k_proj_us += elapsed_us;
|
|
||||||
break;
|
|
||||||
case LLAMA_DFLASH_KV_NODE_K_NORM:
|
|
||||||
profile.graph_kv_node_k_norm_calls++;
|
|
||||||
profile.graph_kv_node_k_norm_us += elapsed_us;
|
|
||||||
break;
|
|
||||||
case LLAMA_DFLASH_KV_NODE_K_ROPE:
|
|
||||||
profile.graph_kv_node_k_rope_calls++;
|
|
||||||
profile.graph_kv_node_k_rope_us += elapsed_us;
|
|
||||||
break;
|
|
||||||
case LLAMA_DFLASH_KV_NODE_V_PROJ:
|
|
||||||
profile.graph_kv_node_v_proj_calls++;
|
|
||||||
profile.graph_kv_node_v_proj_us += elapsed_us;
|
|
||||||
break;
|
|
||||||
case LLAMA_DFLASH_KV_NODE_K_STORE:
|
|
||||||
profile.graph_kv_node_k_store_calls++;
|
|
||||||
profile.graph_kv_node_k_store_us += elapsed_us;
|
|
||||||
break;
|
|
||||||
case LLAMA_DFLASH_KV_NODE_V_STORE:
|
|
||||||
profile.graph_kv_node_v_store_calls++;
|
|
||||||
profile.graph_kv_node_v_store_us += elapsed_us;
|
|
||||||
break;
|
|
||||||
case LLAMA_DFLASH_KV_NODE_NONE:
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline llama_dflash_main_node_kind llama_dflash_main_node_kind_from_tensor(const struct ggml_tensor * tensor) {
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "Qcur")) {
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_QCUR;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "Kcur_noise")) {
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_K_DRAFT;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "Vcur_noise")) {
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_V_DRAFT;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "Kcur_ctx_cache")) {
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_K_CTX_VIEW;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "Vcur_ctx_cache")) {
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_V_CTX_VIEW;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "dflash_main_k_concat")) {
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_K_CONCAT;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "dflash_main_v_concat")) {
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_V_CONCAT;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "dflash_main_k_pad")) {
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_K_PAD;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "dflash_main_v_pad")) {
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_V_PAD;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "dflash_main_k_perm_cont")) {
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_K_PERM_CONT;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "dflash_main_v_perm_cont")) {
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_V_PERM_CONT;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "flash_attn_reshaped")) {
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_NONE;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_matches_label(tensor, "flash_attn")) {
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_FLASH_ATTN;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "kqv_out")) {
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_ATTN_OUT;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_has_prefix(tensor, "ffn_out")) {
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_FFN;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_matches_label(tensor, "result_output_rows")) {
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_RESULT_ROWS;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_matches_label(tensor, "result_norm")) {
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_RESULT_NORM;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_matches_label(tensor, "output")) {
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_RESULT;
|
|
||||||
}
|
|
||||||
if (llama_dflash_tensor_name_matches_label(tensor, "result_output")) {
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_RESULT;
|
|
||||||
}
|
|
||||||
|
|
||||||
return LLAMA_DFLASH_MAIN_NODE_NONE;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Records one timed execution of a main-graph node in `profile`.
//
// For every node kind except LLAMA_DFLASH_MAIN_NODE_NONE the matching
// `graph_main_node_<kind>_calls` counter is bumped by one and `elapsed_us`
// is added to the matching `graph_main_node_<kind>_us` total.
//
//   profile    - statistics object that receives the update
//   kind       - classification of the node that was timed
//   elapsed_us - duration to add to the per-kind total
static inline void llama_dflash_main_node_profile_add(
        llama_dflash_profile_stats & profile,
        llama_dflash_main_node_kind kind,
        uint64_t elapsed_us) {
    switch (kind) {
        case LLAMA_DFLASH_MAIN_NODE_NONE:
            // Untracked node: nothing to account for.
            return;

        // Per-layer K/V/Q projection nodes.
        case LLAMA_DFLASH_MAIN_NODE_QCUR:
            profile.graph_main_node_qcur_calls += 1;
            profile.graph_main_node_qcur_us    += elapsed_us;
            return;
        case LLAMA_DFLASH_MAIN_NODE_K_DRAFT:
            profile.graph_main_node_k_draft_calls += 1;
            profile.graph_main_node_k_draft_us    += elapsed_us;
            return;
        case LLAMA_DFLASH_MAIN_NODE_V_DRAFT:
            profile.graph_main_node_v_draft_calls += 1;
            profile.graph_main_node_v_draft_us    += elapsed_us;
            return;

        // Context views, concatenation, padding and permutation of K/V.
        case LLAMA_DFLASH_MAIN_NODE_K_CTX_VIEW:
            profile.graph_main_node_k_ctx_view_calls += 1;
            profile.graph_main_node_k_ctx_view_us    += elapsed_us;
            return;
        case LLAMA_DFLASH_MAIN_NODE_V_CTX_VIEW:
            profile.graph_main_node_v_ctx_view_calls += 1;
            profile.graph_main_node_v_ctx_view_us    += elapsed_us;
            return;
        case LLAMA_DFLASH_MAIN_NODE_K_CONCAT:
            profile.graph_main_node_k_concat_calls += 1;
            profile.graph_main_node_k_concat_us    += elapsed_us;
            return;
        case LLAMA_DFLASH_MAIN_NODE_V_CONCAT:
            profile.graph_main_node_v_concat_calls += 1;
            profile.graph_main_node_v_concat_us    += elapsed_us;
            return;
        case LLAMA_DFLASH_MAIN_NODE_K_PAD:
            profile.graph_main_node_k_pad_calls += 1;
            profile.graph_main_node_k_pad_us    += elapsed_us;
            return;
        case LLAMA_DFLASH_MAIN_NODE_V_PAD:
            profile.graph_main_node_v_pad_calls += 1;
            profile.graph_main_node_v_pad_us    += elapsed_us;
            return;
        case LLAMA_DFLASH_MAIN_NODE_K_PERM_CONT:
            profile.graph_main_node_k_perm_cont_calls += 1;
            profile.graph_main_node_k_perm_cont_us    += elapsed_us;
            return;
        case LLAMA_DFLASH_MAIN_NODE_V_PERM_CONT:
            profile.graph_main_node_v_perm_cont_calls += 1;
            profile.graph_main_node_v_perm_cont_us    += elapsed_us;
            return;

        // Attention and feed-forward nodes.
        case LLAMA_DFLASH_MAIN_NODE_FLASH_ATTN:
            profile.graph_main_node_flash_attn_calls += 1;
            profile.graph_main_node_flash_attn_us    += elapsed_us;
            return;
        case LLAMA_DFLASH_MAIN_NODE_ATTN_OUT:
            profile.graph_main_node_attn_out_calls += 1;
            profile.graph_main_node_attn_out_us    += elapsed_us;
            return;
        case LLAMA_DFLASH_MAIN_NODE_FFN:
            profile.graph_main_node_ffn_calls += 1;
            profile.graph_main_node_ffn_us    += elapsed_us;
            return;

        // Final result nodes.
        case LLAMA_DFLASH_MAIN_NODE_RESULT_ROWS:
            profile.graph_main_node_result_rows_calls += 1;
            profile.graph_main_node_result_rows_us    += elapsed_us;
            return;
        case LLAMA_DFLASH_MAIN_NODE_RESULT_NORM:
            profile.graph_main_node_result_norm_calls += 1;
            profile.graph_main_node_result_norm_us    += elapsed_us;
            return;
        case LLAMA_DFLASH_MAIN_NODE_RESULT:
            profile.graph_main_node_result_calls += 1;
            profile.graph_main_node_result_us    += elapsed_us;
            return;
    }
}
|
|
||||||
|
|
||||||
static inline bool llama_dflash_kv_node_eval_callback(struct ggml_tensor * tensor, bool ask, void * user_data) {
|
|
||||||
auto * profiler = static_cast<llama_dflash_kv_node_profiler *>(user_data);
|
|
||||||
if (profiler == nullptr || profiler->profile == nullptr) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
const llama_dflash_kv_node_kind kind = llama_dflash_kv_node_kind_from_tensor(tensor);
|
|
||||||
if (ask) {
|
|
||||||
if (kind == LLAMA_DFLASH_KV_NODE_NONE) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
profiler->active_kind = kind;
|
|
||||||
profiler->t_start_us = ggml_time_us();
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (kind != LLAMA_DFLASH_KV_NODE_NONE && profiler->active_kind == kind && profiler->t_start_us > 0) {
|
|
||||||
llama_dflash_kv_node_profile_add(*profiler->profile, kind, (uint64_t) (ggml_time_us() - profiler->t_start_us));
|
|
||||||
}
|
|
||||||
|
|
||||||
profiler->active_kind = LLAMA_DFLASH_KV_NODE_NONE;
|
|
||||||
profiler->t_start_us = 0;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline bool llama_dflash_main_node_eval_callback(struct ggml_tensor * tensor, bool ask, void * user_data) {
|
|
||||||
auto * profiler = static_cast<llama_dflash_main_node_profiler *>(user_data);
|
|
||||||
if (profiler == nullptr || profiler->profile == nullptr) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
const llama_dflash_main_node_kind kind = llama_dflash_main_node_kind_from_tensor(tensor);
|
|
||||||
if (ask) {
|
|
||||||
profiler->prev_active = profiler->prev_callback != nullptr
|
|
||||||
? profiler->prev_callback(tensor, ask, profiler->prev_user_data)
|
|
||||||
: false;
|
|
||||||
|
|
||||||
if (kind == LLAMA_DFLASH_MAIN_NODE_NONE) {
|
|
||||||
profiler->active_kind = LLAMA_DFLASH_MAIN_NODE_NONE;
|
|
||||||
profiler->t_start_us = 0;
|
|
||||||
return profiler->prev_active;
|
|
||||||
}
|
|
||||||
|
|
||||||
profiler->active_kind = kind;
|
|
||||||
profiler->t_start_us = ggml_time_us();
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool prev_result = false;
|
|
||||||
if (profiler->prev_active && profiler->prev_callback != nullptr) {
|
|
||||||
prev_result = profiler->prev_callback(tensor, ask, profiler->prev_user_data);
|
|
||||||
}
|
|
||||||
|
|
||||||
const bool tracked = kind != LLAMA_DFLASH_MAIN_NODE_NONE &&
|
|
||||||
profiler->active_kind == kind &&
|
|
||||||
profiler->t_start_us > 0;
|
|
||||||
if (tracked) {
|
|
||||||
llama_dflash_main_node_profile_add(*profiler->profile, kind, (uint64_t) (ggml_time_us() - profiler->t_start_us));
|
|
||||||
}
|
|
||||||
|
|
||||||
profiler->prev_active = false;
|
|
||||||
profiler->active_kind = LLAMA_DFLASH_MAIN_NODE_NONE;
|
|
||||||
profiler->t_start_us = 0;
|
|
||||||
return prev_result || tracked;
|
|
||||||
}
|
|
||||||
@ -5,38 +5,22 @@
|
|||||||
#include "llama-context.h"
|
#include "llama-context.h"
|
||||||
#include "llama-model.h"
|
#include "llama-model.h"
|
||||||
#include "llama-spec-features.h"
|
#include "llama-spec-features.h"
|
||||||
#include "llama-dflash-profile.h"
|
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cstdlib>
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
// Reports whether the environment variable `name` is set to an "enabled"
// value.
//
// The flag is considered disabled when the variable is unset, empty, or
// exactly equal (case-sensitive) to "0", "false" or "off". Any other value
// enables it.
static bool llama_env_flag_enabled_local(const char * name) {
    const char * value = std::getenv(name);
    if (value == nullptr || value[0] == '\0') {
        return false;
    }

    // Spellings that explicitly switch the flag off.
    static const char * const disabled_values[] = { "0", "false", "off" };
    for (const char * disabled : disabled_values) {
        if (std::strcmp(value, disabled) == 0) {
            return false;
        }
    }
    return true;
}
|
|
||||||
|
|
||||||
static bool llama_dflash_stats_log_enabled() {
|
|
||||||
return llama_env_flag_enabled_local("IK_DFLASH_STATS_LOG");
|
|
||||||
}
|
|
||||||
|
|
||||||
void llama_sync_dflash_workspace_if_pending(struct llama_context & lctx) {
|
void llama_sync_dflash_workspace_if_pending(struct llama_context & lctx) {
|
||||||
if (!lctx.dflash_kv_workspace_sync_pending || lctx.dflash_workspace_sched == nullptr) {
|
if (!lctx.dflash.kv.workspace_sync_pending || lctx.dflash.kv.workspace_sched == nullptr) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t t_workspace_sync_us = ggml_time_us();
|
ggml_backend_sched_synchronize(lctx.dflash.kv.workspace_sched);
|
||||||
ggml_backend_sched_synchronize(lctx.dflash_workspace_sched);
|
lctx.dflash.kv.workspace_sync_pending = false;
|
||||||
lctx.dflash_profile.graph_kv_workspace_sync_us += (uint64_t) (ggml_time_us() - t_workspace_sync_us);
|
|
||||||
lctx.dflash_kv_workspace_sync_pending = false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static ggml_backend_buffer_type_t llama_dflash_kv_cache_layer_buft(const llama_context & lctx, int32_t il) {
|
static ggml_backend_buffer_type_t llama_dflash_kv_cache_layer_buft(const llama_context & lctx, int32_t il) {
|
||||||
@ -86,36 +70,36 @@ bool llama_context::ensure_dflash_kv_cache_tensors(int32_t cross_ctx) {
|
|||||||
const int64_t n_embd_head_v = model.hparams.n_embd_head_v(0);
|
const int64_t n_embd_head_v = model.hparams.n_embd_head_v(0);
|
||||||
const int64_t n_head_kv = model.hparams.n_head_kv();
|
const int64_t n_head_kv = model.hparams.n_head_kv();
|
||||||
|
|
||||||
if (dflash_cache_ctx != nullptr && !dflash_k_ctx_cache.empty()) {
|
if (dflash.kv.cache_ctx != nullptr && !dflash.kv.k_ctx_cache.empty()) {
|
||||||
const bool cache_matches = (int32_t) dflash_k_ctx_cache.size() == n_layer &&
|
const bool cache_matches = (int32_t) dflash.kv.k_ctx_cache.size() == n_layer &&
|
||||||
dflash_k_ctx_cache.front() != nullptr &&
|
dflash.kv.k_ctx_cache.front() != nullptr &&
|
||||||
(int32_t) dflash_k_ctx_cache.front()->ne[2] == target_cross_ctx;
|
(int32_t) dflash.kv.k_ctx_cache.front()->ne[2] == target_cross_ctx;
|
||||||
const bool workspace_matches = (int32_t) dflash_k_ctx_workspace.size() == n_layer &&
|
const bool workspace_matches = (int32_t) dflash.kv.k_ctx_workspace.size() == n_layer &&
|
||||||
dflash_k_ctx_workspace.front() != nullptr &&
|
dflash.kv.k_ctx_workspace.front() != nullptr &&
|
||||||
(int32_t) dflash_k_ctx_workspace.front()->ne[1] == target_workspace_n_kv_total;
|
(int32_t) dflash.kv.k_ctx_workspace.front()->ne[1] == target_workspace_n_kv_total;
|
||||||
|
|
||||||
if (cache_matches && workspace_matches) {
|
if (cache_matches && workspace_matches) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
free_dflash_kv_cache_tensors();
|
free_dflash_kv_cache_tensors();
|
||||||
if (dflash_sched != nullptr) {
|
if (dflash.kv.cache_sched != nullptr) {
|
||||||
ggml_backend_sched_free(dflash_sched);
|
ggml_backend_sched_free(dflash.kv.cache_sched);
|
||||||
dflash_sched = nullptr;
|
dflash.kv.cache_sched = nullptr;
|
||||||
}
|
}
|
||||||
if (dflash_workspace_sched != nullptr) {
|
if (dflash.kv.workspace_sched != nullptr) {
|
||||||
ggml_backend_sched_free(dflash_workspace_sched);
|
ggml_backend_sched_free(dflash.kv.workspace_sched);
|
||||||
dflash_workspace_sched = nullptr;
|
dflash.kv.workspace_sched = nullptr;
|
||||||
}
|
}
|
||||||
dflash_kv_graph = nullptr;
|
dflash.kv.cache_graph = nullptr;
|
||||||
dflash_kv_workspace_graph = nullptr;
|
dflash.kv.workspace_graph = nullptr;
|
||||||
dflash_kv_graph_rows = 0;
|
dflash.kv.cache_graph_rows = 0;
|
||||||
dflash_kv_graph_write_pos = 0;
|
dflash.kv.cache_graph_write_pos = 0;
|
||||||
dflash_kv_workspace_graph_rows = 0;
|
dflash.kv.workspace_graph_rows = 0;
|
||||||
dflash_kv_workspace_graph_write_pos = 0;
|
dflash.kv.workspace_graph_write_pos = 0;
|
||||||
dflash_kv_workspace_reserved_rows = 0;
|
dflash.kv.workspace_reserved_rows = 0;
|
||||||
dflash_buf_compute_meta.clear();
|
dflash.kv.cache_compute_meta.clear();
|
||||||
dflash_workspace_buf_compute_meta.clear();
|
dflash.kv.workspace_compute_meta.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_init_params params = {
|
ggml_init_params params = {
|
||||||
@ -124,166 +108,146 @@ bool llama_context::ensure_dflash_kv_cache_tensors(int32_t cross_ctx) {
|
|||||||
/*.no_alloc =*/ true,
|
/*.no_alloc =*/ true,
|
||||||
};
|
};
|
||||||
|
|
||||||
dflash_cache_ctx = ggml_init(params);
|
dflash.kv.cache_ctx = ggml_init(params);
|
||||||
if (dflash_cache_ctx == nullptr) {
|
if (dflash.kv.cache_ctx == nullptr) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
dflash_k_ctx_cache.resize((size_t) n_layer);
|
dflash.kv.k_ctx_cache.resize((size_t) n_layer);
|
||||||
dflash_v_ctx_cache.resize((size_t) n_layer);
|
dflash.kv.v_ctx_cache.resize((size_t) n_layer);
|
||||||
dflash_k_ctx_workspace.clear();
|
dflash.kv.k_ctx_workspace.clear();
|
||||||
dflash_v_ctx_workspace.clear();
|
dflash.kv.v_ctx_workspace.clear();
|
||||||
dflash_k_ctx_workspace.resize((size_t) n_layer);
|
dflash.kv.k_ctx_workspace.resize((size_t) n_layer);
|
||||||
dflash_v_ctx_workspace.resize((size_t) n_layer);
|
dflash.kv.v_ctx_workspace.resize((size_t) n_layer);
|
||||||
dflash_cache_bufs.clear();
|
dflash.kv.cache_bufs.clear();
|
||||||
dflash_cache_bufs.reserve((size_t) std::max(1, n_layer) * 4);
|
dflash.kv.cache_bufs.reserve((size_t) std::max(1, n_layer) * 4);
|
||||||
int32_t host_layers = 0;
|
|
||||||
const char * first_buft_name = nullptr;
|
|
||||||
const char * last_buft_name = nullptr;
|
|
||||||
for (int32_t il = 0; il < n_layer; ++il) {
|
for (int32_t il = 0; il < n_layer; ++il) {
|
||||||
ggml_backend_buffer_type_t layer_buft = llama_dflash_kv_cache_layer_buft(*this, il);
|
ggml_backend_buffer_type_t layer_buft = llama_dflash_kv_cache_layer_buft(*this, il);
|
||||||
if (ggml_backend_buft_is_host(layer_buft)) {
|
|
||||||
host_layers++;
|
|
||||||
}
|
|
||||||
if (first_buft_name == nullptr) {
|
|
||||||
first_buft_name = ggml_backend_buft_name(layer_buft);
|
|
||||||
}
|
|
||||||
last_buft_name = ggml_backend_buft_name(layer_buft);
|
|
||||||
|
|
||||||
dflash_k_ctx_cache[(size_t) il] = ggml_new_tensor_3d(dflash_cache_ctx, GGML_TYPE_F32, n_embd_head_k, n_head_kv, target_cross_ctx);
|
dflash.kv.k_ctx_cache[(size_t) il] = ggml_new_tensor_3d(dflash.kv.cache_ctx, GGML_TYPE_F32, n_embd_head_k, n_head_kv, target_cross_ctx);
|
||||||
dflash_v_ctx_cache[(size_t) il] = ggml_new_tensor_3d(dflash_cache_ctx, GGML_TYPE_F32, n_embd_head_v, n_head_kv, target_cross_ctx);
|
dflash.kv.v_ctx_cache[(size_t) il] = ggml_new_tensor_3d(dflash.kv.cache_ctx, GGML_TYPE_F32, n_embd_head_v, n_head_kv, target_cross_ctx);
|
||||||
if (dflash_k_ctx_cache[(size_t) il] == nullptr || dflash_v_ctx_cache[(size_t) il] == nullptr) {
|
if (dflash.kv.k_ctx_cache[(size_t) il] == nullptr || dflash.kv.v_ctx_cache[(size_t) il] == nullptr) {
|
||||||
free_dflash_kv_cache_tensors();
|
free_dflash_kv_cache_tensors();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_set_input(dflash_k_ctx_cache[(size_t) il]);
|
ggml_set_input(dflash.kv.k_ctx_cache[(size_t) il]);
|
||||||
ggml_set_input(dflash_v_ctx_cache[(size_t) il]);
|
ggml_set_input(dflash.kv.v_ctx_cache[(size_t) il]);
|
||||||
ggml_format_name(dflash_k_ctx_cache[(size_t) il], "dflash_k_ctx_cache_%d", il);
|
ggml_format_name(dflash.kv.k_ctx_cache[(size_t) il], "dflash_k_ctx_cache_%d", il);
|
||||||
ggml_format_name(dflash_v_ctx_cache[(size_t) il], "dflash_v_ctx_cache_%d", il);
|
ggml_format_name(dflash.kv.v_ctx_cache[(size_t) il], "dflash_v_ctx_cache_%d", il);
|
||||||
|
|
||||||
const size_t k_bytes = ggml_backend_buft_get_alloc_size(layer_buft, dflash_k_ctx_cache[(size_t) il]);
|
const size_t k_bytes = ggml_backend_buft_get_alloc_size(layer_buft, dflash.kv.k_ctx_cache[(size_t) il]);
|
||||||
ggml_backend_buffer_t k_buf = ggml_backend_buft_alloc_buffer(layer_buft, k_bytes);
|
ggml_backend_buffer_t k_buf = ggml_backend_buft_alloc_buffer(layer_buft, k_bytes);
|
||||||
if (k_buf == nullptr) {
|
if (k_buf == nullptr) {
|
||||||
free_dflash_kv_cache_tensors();
|
free_dflash_kv_cache_tensors();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
ggml_backend_buffer_set_usage(k_buf, GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
ggml_backend_buffer_set_usage(k_buf, GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||||
ggml_backend_tensor_alloc(k_buf, dflash_k_ctx_cache[(size_t) il], ggml_backend_buffer_get_base(k_buf));
|
ggml_backend_tensor_alloc(k_buf, dflash.kv.k_ctx_cache[(size_t) il], ggml_backend_buffer_get_base(k_buf));
|
||||||
ggml_backend_buffer_clear(k_buf, 0);
|
ggml_backend_buffer_clear(k_buf, 0);
|
||||||
dflash_cache_bufs.push_back(k_buf);
|
dflash.kv.cache_bufs.push_back(k_buf);
|
||||||
|
|
||||||
const size_t v_bytes = ggml_backend_buft_get_alloc_size(layer_buft, dflash_v_ctx_cache[(size_t) il]);
|
const size_t v_bytes = ggml_backend_buft_get_alloc_size(layer_buft, dflash.kv.v_ctx_cache[(size_t) il]);
|
||||||
ggml_backend_buffer_t v_buf = ggml_backend_buft_alloc_buffer(layer_buft, v_bytes);
|
ggml_backend_buffer_t v_buf = ggml_backend_buft_alloc_buffer(layer_buft, v_bytes);
|
||||||
if (v_buf == nullptr) {
|
if (v_buf == nullptr) {
|
||||||
free_dflash_kv_cache_tensors();
|
free_dflash_kv_cache_tensors();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
ggml_backend_buffer_set_usage(v_buf, GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
ggml_backend_buffer_set_usage(v_buf, GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||||
ggml_backend_tensor_alloc(v_buf, dflash_v_ctx_cache[(size_t) il], ggml_backend_buffer_get_base(v_buf));
|
ggml_backend_tensor_alloc(v_buf, dflash.kv.v_ctx_cache[(size_t) il], ggml_backend_buffer_get_base(v_buf));
|
||||||
ggml_backend_buffer_clear(v_buf, 0);
|
ggml_backend_buffer_clear(v_buf, 0);
|
||||||
dflash_cache_bufs.push_back(v_buf);
|
dflash.kv.cache_bufs.push_back(v_buf);
|
||||||
|
|
||||||
dflash_k_ctx_workspace[(size_t) il] = ggml_new_tensor_3d(dflash_cache_ctx, GGML_TYPE_F32, n_embd_head_k, target_workspace_n_kv_total, n_head_kv);
|
dflash.kv.k_ctx_workspace[(size_t) il] = ggml_new_tensor_3d(dflash.kv.cache_ctx, GGML_TYPE_F32, n_embd_head_k, target_workspace_n_kv_total, n_head_kv);
|
||||||
dflash_v_ctx_workspace[(size_t) il] = ggml_new_tensor_3d(dflash_cache_ctx, GGML_TYPE_F32, n_embd_head_v, target_workspace_n_kv_total, n_head_kv);
|
dflash.kv.v_ctx_workspace[(size_t) il] = ggml_new_tensor_3d(dflash.kv.cache_ctx, GGML_TYPE_F32, n_embd_head_v, target_workspace_n_kv_total, n_head_kv);
|
||||||
if (dflash_k_ctx_workspace[(size_t) il] == nullptr || dflash_v_ctx_workspace[(size_t) il] == nullptr) {
|
if (dflash.kv.k_ctx_workspace[(size_t) il] == nullptr || dflash.kv.v_ctx_workspace[(size_t) il] == nullptr) {
|
||||||
free_dflash_kv_cache_tensors();
|
free_dflash_kv_cache_tensors();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_set_input(dflash_k_ctx_workspace[(size_t) il]);
|
ggml_set_input(dflash.kv.k_ctx_workspace[(size_t) il]);
|
||||||
ggml_set_input(dflash_v_ctx_workspace[(size_t) il]);
|
ggml_set_input(dflash.kv.v_ctx_workspace[(size_t) il]);
|
||||||
ggml_format_name(dflash_k_ctx_workspace[(size_t) il], "dflash_k_ctx_workspace_%d", il);
|
ggml_format_name(dflash.kv.k_ctx_workspace[(size_t) il], "dflash_k_ctx_workspace_%d", il);
|
||||||
ggml_format_name(dflash_v_ctx_workspace[(size_t) il], "dflash_v_ctx_workspace_%d", il);
|
ggml_format_name(dflash.kv.v_ctx_workspace[(size_t) il], "dflash_v_ctx_workspace_%d", il);
|
||||||
|
|
||||||
const size_t k_workspace_bytes = ggml_backend_buft_get_alloc_size(layer_buft, dflash_k_ctx_workspace[(size_t) il]);
|
const size_t k_workspace_bytes = ggml_backend_buft_get_alloc_size(layer_buft, dflash.kv.k_ctx_workspace[(size_t) il]);
|
||||||
ggml_backend_buffer_t k_workspace_buf = ggml_backend_buft_alloc_buffer(layer_buft, k_workspace_bytes);
|
ggml_backend_buffer_t k_workspace_buf = ggml_backend_buft_alloc_buffer(layer_buft, k_workspace_bytes);
|
||||||
if (k_workspace_buf == nullptr) {
|
if (k_workspace_buf == nullptr) {
|
||||||
free_dflash_kv_cache_tensors();
|
free_dflash_kv_cache_tensors();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
ggml_backend_buffer_set_usage(k_workspace_buf, GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
ggml_backend_buffer_set_usage(k_workspace_buf, GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||||
ggml_backend_tensor_alloc(k_workspace_buf, dflash_k_ctx_workspace[(size_t) il], ggml_backend_buffer_get_base(k_workspace_buf));
|
ggml_backend_tensor_alloc(k_workspace_buf, dflash.kv.k_ctx_workspace[(size_t) il], ggml_backend_buffer_get_base(k_workspace_buf));
|
||||||
ggml_backend_buffer_clear(k_workspace_buf, 0);
|
ggml_backend_buffer_clear(k_workspace_buf, 0);
|
||||||
dflash_cache_bufs.push_back(k_workspace_buf);
|
dflash.kv.cache_bufs.push_back(k_workspace_buf);
|
||||||
|
|
||||||
const size_t v_workspace_bytes = ggml_backend_buft_get_alloc_size(layer_buft, dflash_v_ctx_workspace[(size_t) il]);
|
const size_t v_workspace_bytes = ggml_backend_buft_get_alloc_size(layer_buft, dflash.kv.v_ctx_workspace[(size_t) il]);
|
||||||
ggml_backend_buffer_t v_workspace_buf = ggml_backend_buft_alloc_buffer(layer_buft, v_workspace_bytes);
|
ggml_backend_buffer_t v_workspace_buf = ggml_backend_buft_alloc_buffer(layer_buft, v_workspace_bytes);
|
||||||
if (v_workspace_buf == nullptr) {
|
if (v_workspace_buf == nullptr) {
|
||||||
free_dflash_kv_cache_tensors();
|
free_dflash_kv_cache_tensors();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
ggml_backend_buffer_set_usage(v_workspace_buf, GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
ggml_backend_buffer_set_usage(v_workspace_buf, GGML_BACKEND_BUFFER_USAGE_COMPUTE);
|
||||||
ggml_backend_tensor_alloc(v_workspace_buf, dflash_v_ctx_workspace[(size_t) il], ggml_backend_buffer_get_base(v_workspace_buf));
|
ggml_backend_tensor_alloc(v_workspace_buf, dflash.kv.v_ctx_workspace[(size_t) il], ggml_backend_buffer_get_base(v_workspace_buf));
|
||||||
ggml_backend_buffer_clear(v_workspace_buf, 0);
|
ggml_backend_buffer_clear(v_workspace_buf, 0);
|
||||||
dflash_cache_bufs.push_back(v_workspace_buf);
|
dflash.kv.cache_bufs.push_back(v_workspace_buf);
|
||||||
}
|
}
|
||||||
|
|
||||||
dflash_profile.last_kv_cache_host_layers = host_layers;
|
dflash.kv.workspace_token_capacity = target_token_capacity;
|
||||||
dflash_kv_workspace_token_capacity = target_token_capacity;
|
dflash.kv.workspace_n_kv_total = target_workspace_n_kv_total;
|
||||||
dflash_kv_workspace_n_kv_total = target_workspace_n_kv_total;
|
|
||||||
llama_reset_dflash_kv_cache_state(this);
|
llama_reset_dflash_kv_cache_state(this);
|
||||||
if (llama_dflash_stats_log_enabled()) {
|
|
||||||
LLAMA_LOG_INFO("%s: DFlash K/V cache placement cross_ctx=%d host_layers=%d/%d first=%s last=%s\n",
|
|
||||||
__func__,
|
|
||||||
target_cross_ctx,
|
|
||||||
host_layers,
|
|
||||||
n_layer,
|
|
||||||
first_buft_name != nullptr ? first_buft_name : "(none)",
|
|
||||||
last_buft_name != nullptr ? last_buft_name : "(none)");
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_context::free_dflash_kv_cache_tensors() {
|
void llama_context::free_dflash_kv_cache_tensors() {
|
||||||
dflash_k_ctx_cache.clear();
|
dflash.kv.k_ctx_cache.clear();
|
||||||
dflash_v_ctx_cache.clear();
|
dflash.kv.v_ctx_cache.clear();
|
||||||
dflash_k_ctx_workspace.clear();
|
dflash.kv.k_ctx_workspace.clear();
|
||||||
dflash_v_ctx_workspace.clear();
|
dflash.kv.v_ctx_workspace.clear();
|
||||||
dflash_kv_cache_write_pos = 0;
|
dflash.kv.cache_write_pos = 0;
|
||||||
dflash_kv_cache_n_filled = 0;
|
dflash.kv.cache_n_filled = 0;
|
||||||
dflash_kv_cache_update_rows = 0;
|
dflash.kv.cache_update_rows = 0;
|
||||||
dflash_kv_cache_reserved_rows = 0;
|
dflash.kv.cache_reserved_rows = 0;
|
||||||
dflash_kv_cache_view_write_pos = 0;
|
dflash.kv.cache_view_write_pos = 0;
|
||||||
dflash_kv_cache_view_n_filled = 0;
|
dflash.kv.cache_view_n_filled = 0;
|
||||||
dflash_kv_cache_applied_window_version = 0;
|
dflash.kv.cache_applied_window_version = 0;
|
||||||
dflash_kv_cache_valid = false;
|
dflash.kv.cache_valid = false;
|
||||||
dflash_kv_cache_view_valid = false;
|
dflash.kv.cache_view_valid = false;
|
||||||
dflash_kv_workspace_write_pos = 0;
|
dflash.kv.workspace_write_pos = 0;
|
||||||
dflash_kv_workspace_n_filled = 0;
|
dflash.kv.workspace_n_filled = 0;
|
||||||
dflash_kv_workspace_reserved_rows = 0;
|
dflash.kv.workspace_reserved_rows = 0;
|
||||||
dflash_kv_workspace_token_capacity = 0;
|
dflash.kv.workspace_token_capacity = 0;
|
||||||
dflash_kv_workspace_n_kv_total = 0;
|
dflash.kv.workspace_n_kv_total = 0;
|
||||||
dflash_kv_workspace_applied_window_version = 0;
|
dflash.kv.workspace_applied_window_version = 0;
|
||||||
dflash_kv_workspace_valid = false;
|
dflash.kv.workspace_valid = false;
|
||||||
dflash_kv_workspace_sync_pending = false;
|
dflash.kv.workspace_sync_pending = false;
|
||||||
dflash_kv_graph = nullptr;
|
dflash.kv.cache_graph = nullptr;
|
||||||
dflash_kv_workspace_graph = nullptr;
|
dflash.kv.workspace_graph = nullptr;
|
||||||
dflash_kv_graph_rows = 0;
|
dflash.kv.cache_graph_rows = 0;
|
||||||
dflash_kv_graph_write_pos = 0;
|
dflash.kv.cache_graph_write_pos = 0;
|
||||||
dflash_kv_workspace_graph_rows = 0;
|
dflash.kv.workspace_graph_rows = 0;
|
||||||
dflash_kv_workspace_graph_write_pos = 0;
|
dflash.kv.workspace_graph_write_pos = 0;
|
||||||
dflash_kv_input_target_features = nullptr;
|
dflash.kv.cache_input_target_features = nullptr;
|
||||||
dflash_kv_input_pos_ctx = nullptr;
|
dflash.kv.cache_input_pos_ctx = nullptr;
|
||||||
dflash_kq_mask_tensor = nullptr;
|
dflash.kv.kq_mask_tensor = nullptr;
|
||||||
dflash_kq_mask_swa_tensor = nullptr;
|
dflash.kv.kq_mask_swa_tensor = nullptr;
|
||||||
|
|
||||||
if (dflash_workspace_sched != nullptr) {
|
if (dflash.kv.workspace_sched != nullptr) {
|
||||||
ggml_backend_sched_synchronize(dflash_workspace_sched);
|
ggml_backend_sched_synchronize(dflash.kv.workspace_sched);
|
||||||
ggml_backend_sched_free(dflash_workspace_sched);
|
ggml_backend_sched_free(dflash.kv.workspace_sched);
|
||||||
dflash_workspace_sched = nullptr;
|
dflash.kv.workspace_sched = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (ggml_backend_buffer_t buf : dflash_cache_bufs) {
|
for (ggml_backend_buffer_t buf : dflash.kv.cache_bufs) {
|
||||||
if (buf != nullptr) {
|
if (buf != nullptr) {
|
||||||
ggml_backend_buffer_free(buf);
|
ggml_backend_buffer_free(buf);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
dflash_cache_bufs.clear();
|
dflash.kv.cache_bufs.clear();
|
||||||
if (dflash_cache_ctx != nullptr) {
|
if (dflash.kv.cache_ctx != nullptr) {
|
||||||
ggml_free(dflash_cache_ctx);
|
ggml_free(dflash.kv.cache_ctx);
|
||||||
dflash_cache_ctx = nullptr;
|
dflash.kv.cache_ctx = nullptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -418,13 +382,11 @@ static bool validate_dflash_graph_contract(const llama_context & lctx) {
|
|||||||
bool llama_prepare_dflash_graph_inputs(
|
bool llama_prepare_dflash_graph_inputs(
|
||||||
struct llama_context & lctx,
|
struct llama_context & lctx,
|
||||||
uint32_t n_tokens) {
|
uint32_t n_tokens) {
|
||||||
const bool kv_node_timing = llama_env_flag_enabled_local("IK_DFLASH_KV_NODE_TIMING");
|
const int32_t cross_ctx = lctx.dflash.visible_cross_ctx > 0
|
||||||
auto & profile = lctx.dflash_profile;
|
? lctx.dflash.visible_cross_ctx
|
||||||
const int32_t cross_ctx = lctx.dflash_visible_cross_ctx > 0
|
|
||||||
? lctx.dflash_visible_cross_ctx
|
|
||||||
: std::max<int32_t>(1, (int32_t) lctx.cparams.n_ctx - (int32_t) lctx.model.hparams.dflash_block_size);
|
: std::max<int32_t>(1, (int32_t) lctx.cparams.n_ctx - (int32_t) lctx.model.hparams.dflash_block_size);
|
||||||
ggml_tensor * kq_mask = lctx.dflash_kq_mask_tensor;
|
ggml_tensor * kq_mask = lctx.dflash.kv.kq_mask_tensor;
|
||||||
ggml_tensor * kq_mask_swa = lctx.dflash_kq_mask_swa_tensor;
|
ggml_tensor * kq_mask_swa = lctx.dflash.kv.kq_mask_swa_tensor;
|
||||||
|
|
||||||
if (kq_mask == nullptr) {
|
if (kq_mask == nullptr) {
|
||||||
LLAMA_LOG_ERROR("%s: DFlash graph inputs are not initialized\n", __func__);
|
LLAMA_LOG_ERROR("%s: DFlash graph inputs are not initialized\n", __func__);
|
||||||
@ -432,113 +394,84 @@ bool llama_prepare_dflash_graph_inputs(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!validate_dflash_graph_contract(lctx)) {
|
if (!validate_dflash_graph_contract(lctx)) {
|
||||||
profile.graph_shape_failures++;
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!lctx.ensure_dflash_kv_cache_tensors(cross_ctx) || lctx.dflash_k_ctx_cache.empty() || lctx.dflash_v_ctx_cache.empty()) {
|
if (!lctx.ensure_dflash_kv_cache_tensors(cross_ctx) || lctx.dflash.kv.k_ctx_cache.empty() || lctx.dflash.kv.v_ctx_cache.empty()) {
|
||||||
LLAMA_LOG_ERROR("%s: DFlash K/V cache inputs are not initialized\n", __func__);
|
LLAMA_LOG_ERROR("%s: DFlash K/V cache inputs are not initialized\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
const float * src = lctx.dflash_target_features;
|
const float * src = lctx.dflash.target.features;
|
||||||
const float * append_src = lctx.dflash_target_append_features;
|
const float * append_src = lctx.dflash.target.append_features;
|
||||||
const llama_pos * src_pos = lctx.dflash_target_positions;
|
const llama_pos * src_pos = lctx.dflash.target.positions;
|
||||||
const size_t total_floats = lctx.dflash_target_features_n_floats;
|
const size_t total_floats = lctx.dflash.target.features_n_floats;
|
||||||
const size_t append_floats = lctx.dflash_target_append_features_n_floats;
|
const size_t append_floats = lctx.dflash.target.append_features_n_floats;
|
||||||
const size_t total_positions = lctx.dflash_target_positions_n;
|
const size_t total_positions = lctx.dflash.target.positions_n;
|
||||||
const int32_t n_rows = lctx.dflash_target_features_n_rows;
|
const int32_t n_rows = lctx.dflash.target.features_n_rows;
|
||||||
const int32_t append_rows_available = lctx.dflash_target_append_features_n_rows;
|
const int32_t append_rows_available = lctx.dflash.target.append_features_n_rows;
|
||||||
const int32_t width = (int32_t) lctx.model.hparams.dflash_n_target_features;
|
const int32_t width = (int32_t) lctx.model.hparams.dflash_n_target_features;
|
||||||
const int32_t graph_cross_ctx = lctx.dflash_k_ctx_cache.front() != nullptr
|
const int32_t graph_cross_ctx = lctx.dflash.kv.k_ctx_cache.front() != nullptr
|
||||||
? (int32_t) lctx.dflash_k_ctx_cache.front()->ne[2]
|
? (int32_t) lctx.dflash.kv.k_ctx_cache.front()->ne[2]
|
||||||
: 0;
|
: 0;
|
||||||
const int32_t n_mask_tokens = (int32_t) kq_mask->ne[1];
|
const int32_t n_mask_tokens = (int32_t) kq_mask->ne[1];
|
||||||
const int32_t n_kv_total = (int32_t) kq_mask->ne[0];
|
const int32_t n_kv_total = (int32_t) kq_mask->ne[0];
|
||||||
const int64_t t_total_us = ggml_time_us();
|
|
||||||
|
|
||||||
profile.graph_prepare_calls++;
|
|
||||||
profile.last_n_rows = n_rows;
|
|
||||||
profile.last_width = width;
|
|
||||||
profile.last_cross_ctx = cross_ctx;
|
|
||||||
profile.last_n_tokens = (int32_t) n_tokens;
|
|
||||||
profile.last_n_kv_total = n_kv_total;
|
|
||||||
|
|
||||||
llama_sync_dflash_workspace_if_pending(lctx);
|
llama_sync_dflash_workspace_if_pending(lctx);
|
||||||
|
|
||||||
if (graph_cross_ctx != cross_ctx) {
|
if (graph_cross_ctx != cross_ctx) {
|
||||||
profile.graph_shape_failures++;
|
|
||||||
|
|
||||||
LLAMA_LOG_ERROR("%s: DFlash graph cross_ctx drift (graph=%d configured=%d)\n",
|
LLAMA_LOG_ERROR("%s: DFlash graph cross_ctx drift (graph=%d configured=%d)\n",
|
||||||
__func__, graph_cross_ctx, cross_ctx);
|
__func__, graph_cross_ctx, cross_ctx);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (n_rows <= 0) {
|
if (n_rows <= 0) {
|
||||||
profile.graph_shape_failures++;
|
|
||||||
LLAMA_LOG_ERROR("%s: missing DFlash target feature rows\n", __func__);
|
LLAMA_LOG_ERROR("%s: missing DFlash target feature rows\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
const bool have_full_src = src != nullptr && total_floats == (size_t) n_rows * (size_t) width;
|
const bool have_full_src = src != nullptr && total_floats == (size_t) n_rows * (size_t) width;
|
||||||
if (n_rows > cross_ctx || (src != nullptr && !have_full_src)) {
|
if (n_rows > cross_ctx || (src != nullptr && !have_full_src)) {
|
||||||
profile.graph_shape_failures++;
|
|
||||||
LLAMA_LOG_ERROR("%s: invalid DFlash target feature shape (rows=%d width=%d floats=%zu cross_ctx=%d)\n",
|
LLAMA_LOG_ERROR("%s: invalid DFlash target feature shape (rows=%d width=%d floats=%zu cross_ctx=%d)\n",
|
||||||
__func__, n_rows, width, total_floats, cross_ctx);
|
__func__, n_rows, width, total_floats, cross_ctx);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (n_kv_total < cross_ctx + (int32_t) n_tokens) {
|
if (n_kv_total < cross_ctx + (int32_t) n_tokens) {
|
||||||
profile.graph_mask_overflow++;
|
|
||||||
LLAMA_LOG_ERROR("%s: invalid DFlash mask shape (n_kv_total=%d < cross_ctx+n_tokens=%d)\n",
|
LLAMA_LOG_ERROR("%s: invalid DFlash mask shape (n_kv_total=%d < cross_ctx+n_tokens=%d)\n",
|
||||||
__func__, n_kv_total, cross_ctx + (int32_t) n_tokens);
|
__func__, n_kv_total, cross_ctx + (int32_t) n_tokens);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int32_t left_pad = cross_ctx - n_rows;
|
const int32_t left_pad = cross_ctx - n_rows;
|
||||||
profile.last_left_pad = left_pad;
|
|
||||||
|
|
||||||
const int64_t t_pos_us = ggml_time_us();
|
lctx.dflash.target.pos_ctx_data.resize((size_t) cross_ctx);
|
||||||
lctx.dflash_pos_ctx_data.resize((size_t) cross_ctx);
|
std::fill(lctx.dflash.target.pos_ctx_data.begin(), lctx.dflash.target.pos_ctx_data.end(), 0);
|
||||||
std::fill(lctx.dflash_pos_ctx_data.begin(), lctx.dflash_pos_ctx_data.end(), 0);
|
|
||||||
if (src_pos == nullptr || total_positions != (size_t) n_rows) {
|
if (src_pos == nullptr || total_positions != (size_t) n_rows) {
|
||||||
profile.graph_pos_fallbacks++;
|
LLAMA_LOG_ERROR("%s: missing DFlash target positions (rows=%d positions=%zu cross_ctx=%d)\n",
|
||||||
profile.graph_shape_failures++;
|
__func__, n_rows, total_positions, cross_ctx);
|
||||||
profile.last_pos_first = -1;
|
|
||||||
profile.last_pos_last = -1;
|
|
||||||
if (profile.graph_pos_fallbacks <= 3) {
|
|
||||||
LLAMA_LOG_ERROR("%s: missing DFlash target positions (rows=%d positions=%zu cross_ctx=%d)\n",
|
|
||||||
__func__, n_rows, total_positions, cross_ctx);
|
|
||||||
}
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
profile.last_pos_first = src_pos[0];
|
const llama_pos last_target_pos = src_pos[n_rows - 1];
|
||||||
profile.last_pos_last = src_pos[n_rows - 1];
|
|
||||||
for (int32_t i = 1; i < n_rows; ++i) {
|
for (int32_t i = 1; i < n_rows; ++i) {
|
||||||
if (src_pos[i] <= src_pos[i - 1]) {
|
if (src_pos[i] <= src_pos[i - 1]) {
|
||||||
profile.graph_pos_non_monotonic++;
|
LLAMA_LOG_ERROR("%s: DFlash target positions are not strictly increasing (rows=%d first=%d last=%d)\n",
|
||||||
profile.graph_shape_failures++;
|
__func__, n_rows, (int) src_pos[0], (int) src_pos[n_rows - 1]);
|
||||||
if (profile.graph_pos_non_monotonic <= 3) {
|
|
||||||
LLAMA_LOG_ERROR("%s: DFlash target positions are not strictly increasing (rows=%d first=%d last=%d)\n",
|
|
||||||
__func__, n_rows, (int) src_pos[0], (int) src_pos[n_rows - 1]);
|
|
||||||
}
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
std::copy(src_pos, src_pos + n_rows, lctx.dflash_pos_ctx_data.begin() + (ptrdiff_t) left_pad);
|
std::copy(src_pos, src_pos + n_rows, lctx.dflash.target.pos_ctx_data.begin() + (ptrdiff_t) left_pad);
|
||||||
profile.graph_pos_copy_us += (uint64_t) (ggml_time_us() - t_pos_us);
|
|
||||||
profile.graph_pos_bytes += lctx.dflash_pos_ctx_data.size() * sizeof(llama_pos);
|
|
||||||
|
|
||||||
const llama_dflash_kv_cache_transition cache_plan = llama_plan_dflash_kv_cache_transition(
|
const llama_dflash_kv_cache_transition cache_plan = llama_plan_dflash_kv_cache_transition(
|
||||||
cross_ctx,
|
cross_ctx,
|
||||||
lctx.dflash_kv_cache_n_filled,
|
lctx.dflash.kv.cache_n_filled,
|
||||||
lctx.dflash_kv_cache_write_pos,
|
lctx.dflash.kv.cache_write_pos,
|
||||||
lctx.dflash_kv_cache_valid,
|
lctx.dflash.kv.cache_valid,
|
||||||
lctx.dflash_kv_cache_applied_window_version,
|
lctx.dflash.kv.cache_applied_window_version,
|
||||||
lctx.dflash_target_window_version,
|
lctx.dflash.target.version,
|
||||||
lctx.dflash_target_window_keep_rows,
|
lctx.dflash.target.keep_rows,
|
||||||
lctx.dflash_target_window_append_rows,
|
lctx.dflash.target.append_rows,
|
||||||
lctx.dflash_target_window_replace,
|
lctx.dflash.target.replace,
|
||||||
n_rows);
|
n_rows);
|
||||||
|
|
||||||
const bool have_append_src = append_src != nullptr &&
|
const bool have_append_src = append_src != nullptr &&
|
||||||
@ -550,11 +483,11 @@ bool llama_prepare_dflash_graph_inputs(
|
|||||||
: (cache_plan.rebuild_cache ? n_rows : cache_plan.append_rows);
|
: (cache_plan.rebuild_cache ? n_rows : cache_plan.append_rows);
|
||||||
const size_t max_nodes = lctx.model.max_nodes((int) std::max<int32_t>(1, cross_ctx)) + 24 * lctx.model.hparams.n_layer;
|
const size_t max_nodes = lctx.model.max_nodes((int) std::max<int32_t>(1, cross_ctx)) + 24 * lctx.model.hparams.n_layer;
|
||||||
const size_t meta_size = ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false);
|
const size_t meta_size = ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false);
|
||||||
if (lctx.dflash_buf_compute_meta.size() != meta_size) {
|
if (lctx.dflash.kv.cache_compute_meta.size() != meta_size) {
|
||||||
lctx.dflash_buf_compute_meta.resize(meta_size);
|
lctx.dflash.kv.cache_compute_meta.resize(meta_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (lctx.dflash_sched == nullptr || lctx.dflash_kv_cache_reserved_rows != cross_ctx) {
|
if (lctx.dflash.kv.cache_sched == nullptr || lctx.dflash.kv.cache_reserved_rows != cross_ctx) {
|
||||||
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
||||||
backend_buft.reserve(lctx.backends.size());
|
backend_buft.reserve(lctx.backends.size());
|
||||||
for (auto * backend : lctx.backends) {
|
for (auto * backend : lctx.backends) {
|
||||||
@ -565,36 +498,30 @@ bool llama_prepare_dflash_graph_inputs(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (lctx.dflash_sched != nullptr) {
|
if (lctx.dflash.kv.cache_sched != nullptr) {
|
||||||
ggml_backend_sched_free(lctx.dflash_sched);
|
ggml_backend_sched_free(lctx.dflash.kv.cache_sched);
|
||||||
lctx.dflash_sched = nullptr;
|
lctx.dflash.kv.cache_sched = nullptr;
|
||||||
}
|
}
|
||||||
lctx.dflash_kv_graph = nullptr;
|
lctx.dflash.kv.cache_graph = nullptr;
|
||||||
lctx.dflash_kv_graph_rows = 0;
|
lctx.dflash.kv.cache_graph_rows = 0;
|
||||||
lctx.dflash_kv_graph_write_pos = 0;
|
lctx.dflash.kv.cache_graph_write_pos = 0;
|
||||||
|
|
||||||
const int32_t saved_update_rows = lctx.dflash_kv_cache_update_rows;
|
const int32_t saved_update_rows = lctx.dflash.kv.cache_update_rows;
|
||||||
lctx.dflash_kv_cache_update_rows = cross_ctx;
|
lctx.dflash.kv.cache_update_rows = cross_ctx;
|
||||||
const int64_t t_build_us = ggml_time_us();
|
|
||||||
ggml_cgraph * gf_reserve = llm_build_context::llama_build_graph_dflash_kv_cache(lctx);
|
ggml_cgraph * gf_reserve = llm_build_context::llama_build_graph_dflash_kv_cache(lctx);
|
||||||
profile.graph_kv_cache_build_us += (uint64_t) (ggml_time_us() - t_build_us);
|
lctx.dflash.kv.cache_update_rows = saved_update_rows;
|
||||||
lctx.dflash_kv_cache_update_rows = saved_update_rows;
|
|
||||||
if (gf_reserve == nullptr) {
|
if (gf_reserve == nullptr) {
|
||||||
profile.graph_shape_failures++;
|
|
||||||
LLAMA_LOG_ERROR("%s: failed to build DFlash K/V cache reserve graph\n", __func__);
|
LLAMA_LOG_ERROR("%s: failed to build DFlash K/V cache reserve graph\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t t_reserve_us = ggml_time_us();
|
lctx.dflash.kv.cache_sched = ggml_backend_sched_new(lctx.backends.data(), backend_buft.data(), lctx.backends.size(), max_nodes, false);
|
||||||
lctx.dflash_sched = ggml_backend_sched_new(lctx.backends.data(), backend_buft.data(), lctx.backends.size(), max_nodes, false);
|
const bool reserved = lctx.dflash.kv.cache_sched != nullptr && ggml_backend_sched_reserve(lctx.dflash.kv.cache_sched, gf_reserve);
|
||||||
const bool reserved = lctx.dflash_sched != nullptr && ggml_backend_sched_reserve(lctx.dflash_sched, gf_reserve);
|
|
||||||
profile.graph_kv_cache_reserve_us += (uint64_t) (ggml_time_us() - t_reserve_us);
|
|
||||||
if (!reserved) {
|
if (!reserved) {
|
||||||
profile.graph_shape_failures++;
|
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize DFlash K/V scheduler\n", __func__);
|
LLAMA_LOG_ERROR("%s: failed to initialize DFlash K/V scheduler\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
lctx.dflash_kv_cache_reserved_rows = cross_ctx;
|
lctx.dflash.kv.cache_reserved_rows = cross_ctx;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (update_rows > 0) {
|
if (update_rows > 0) {
|
||||||
@ -607,7 +534,6 @@ bool llama_prepare_dflash_graph_inputs(
|
|||||||
const llama_pos * update_pos = src_pos + (n_rows - update_rows);
|
const llama_pos * update_pos = src_pos + (n_rows - update_rows);
|
||||||
|
|
||||||
if (update_src == nullptr) {
|
if (update_src == nullptr) {
|
||||||
profile.graph_shape_failures++;
|
|
||||||
LLAMA_LOG_ERROR("%s: missing DFlash appended target features for cached update (rows=%d append_rows=%d floats=%zu)\n",
|
LLAMA_LOG_ERROR("%s: missing DFlash appended target features for cached update (rows=%d append_rows=%d floats=%zu)\n",
|
||||||
__func__, n_rows, update_rows, append_floats);
|
__func__, n_rows, update_rows, append_floats);
|
||||||
return false;
|
return false;
|
||||||
@ -617,108 +543,77 @@ bool llama_prepare_dflash_graph_inputs(
|
|||||||
llama_reset_dflash_kv_cache_state(&lctx);
|
llama_reset_dflash_kv_cache_state(&lctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
lctx.dflash_kv_cache_update_rows = update_rows;
|
lctx.dflash.kv.cache_update_rows = update_rows;
|
||||||
ggml_cgraph * gf_kv = nullptr;
|
ggml_cgraph * gf_kv = nullptr;
|
||||||
const bool can_reuse_kv_graph = lctx.dflash_kv_graph != nullptr &&
|
const bool can_reuse_kv_graph = lctx.dflash.kv.cache_graph != nullptr &&
|
||||||
lctx.dflash_kv_graph_rows == update_rows &&
|
lctx.dflash.kv.cache_graph_rows == update_rows &&
|
||||||
lctx.dflash_kv_graph_write_pos == lctx.dflash_kv_cache_write_pos;
|
lctx.dflash.kv.cache_graph_write_pos == lctx.dflash.kv.cache_write_pos;
|
||||||
if (can_reuse_kv_graph) {
|
if (can_reuse_kv_graph) {
|
||||||
gf_kv = lctx.dflash_kv_graph;
|
gf_kv = lctx.dflash.kv.cache_graph;
|
||||||
} else {
|
} else {
|
||||||
const int64_t t_build_us = ggml_time_us();
|
|
||||||
gf_kv = llm_build_context::llama_build_graph_dflash_kv_cache(lctx);
|
gf_kv = llm_build_context::llama_build_graph_dflash_kv_cache(lctx);
|
||||||
profile.graph_kv_cache_build_us += (uint64_t) (ggml_time_us() - t_build_us);
|
if (gf_kv == nullptr || lctx.dflash.kv.cache_input_target_features == nullptr || lctx.dflash.kv.cache_input_pos_ctx == nullptr) {
|
||||||
if (gf_kv == nullptr || lctx.dflash_kv_input_target_features == nullptr || lctx.dflash_kv_input_pos_ctx == nullptr) {
|
|
||||||
profile.graph_shape_failures++;
|
|
||||||
LLAMA_LOG_ERROR("%s: failed to build DFlash K/V cache graph\n", __func__);
|
LLAMA_LOG_ERROR("%s: failed to build DFlash K/V cache graph\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t t_reset_us = ggml_time_us();
|
ggml_backend_sched_reset(lctx.dflash.kv.cache_sched);
|
||||||
ggml_backend_sched_reset(lctx.dflash_sched);
|
ggml_backend_sched_alloc_graph(lctx.dflash.kv.cache_sched, gf_kv);
|
||||||
profile.graph_kv_cache_reset_us += (uint64_t) (ggml_time_us() - t_reset_us);
|
|
||||||
|
|
||||||
const int64_t t_alloc_us = ggml_time_us();
|
lctx.dflash.kv.cache_graph = gf_kv;
|
||||||
ggml_backend_sched_alloc_graph(lctx.dflash_sched, gf_kv);
|
lctx.dflash.kv.cache_graph_rows = update_rows;
|
||||||
profile.graph_kv_cache_alloc_us += (uint64_t) (ggml_time_us() - t_alloc_us);
|
lctx.dflash.kv.cache_graph_write_pos = lctx.dflash.kv.cache_write_pos;
|
||||||
|
|
||||||
lctx.dflash_kv_graph = gf_kv;
|
|
||||||
lctx.dflash_kv_graph_rows = update_rows;
|
|
||||||
lctx.dflash_kv_graph_write_pos = lctx.dflash_kv_cache_write_pos;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_t kv_feature_backend = llama_backend_for_tensor(lctx, lctx.dflash_kv_input_target_features);
|
ggml_backend_t kv_feature_backend = llama_backend_for_tensor(lctx, lctx.dflash.kv.cache_input_target_features);
|
||||||
const int64_t t_feature_upload_us = ggml_time_us();
|
|
||||||
if (kv_feature_backend != nullptr) {
|
if (kv_feature_backend != nullptr) {
|
||||||
ggml_backend_tensor_set_async(kv_feature_backend, lctx.dflash_kv_input_target_features, update_src, 0, ggml_nbytes(lctx.dflash_kv_input_target_features));
|
ggml_backend_tensor_set_async(kv_feature_backend, lctx.dflash.kv.cache_input_target_features, update_src, 0, ggml_nbytes(lctx.dflash.kv.cache_input_target_features));
|
||||||
} else {
|
} else {
|
||||||
ggml_backend_tensor_set(lctx.dflash_kv_input_target_features, update_src, 0, ggml_nbytes(lctx.dflash_kv_input_target_features));
|
ggml_backend_tensor_set(lctx.dflash.kv.cache_input_target_features, update_src, 0, ggml_nbytes(lctx.dflash.kv.cache_input_target_features));
|
||||||
}
|
}
|
||||||
profile.graph_kv_cache_feature_upload_us += (uint64_t) (ggml_time_us() - t_feature_upload_us);
|
|
||||||
profile.graph_feature_bytes += (size_t) update_rows * (size_t) width * sizeof(float);
|
|
||||||
|
|
||||||
ggml_backend_t kv_pos_backend = llama_backend_for_tensor(lctx, lctx.dflash_kv_input_pos_ctx);
|
ggml_backend_t kv_pos_backend = llama_backend_for_tensor(lctx, lctx.dflash.kv.cache_input_pos_ctx);
|
||||||
const int64_t t_pos_upload_us = ggml_time_us();
|
|
||||||
if (kv_pos_backend != nullptr) {
|
if (kv_pos_backend != nullptr) {
|
||||||
ggml_backend_tensor_set_async(kv_pos_backend, lctx.dflash_kv_input_pos_ctx, update_pos, 0, ggml_nbytes(lctx.dflash_kv_input_pos_ctx));
|
ggml_backend_tensor_set_async(kv_pos_backend, lctx.dflash.kv.cache_input_pos_ctx, update_pos, 0, ggml_nbytes(lctx.dflash.kv.cache_input_pos_ctx));
|
||||||
} else {
|
} else {
|
||||||
ggml_backend_tensor_set(lctx.dflash_kv_input_pos_ctx, update_pos, 0, ggml_nbytes(lctx.dflash_kv_input_pos_ctx));
|
ggml_backend_tensor_set(lctx.dflash.kv.cache_input_pos_ctx, update_pos, 0, ggml_nbytes(lctx.dflash.kv.cache_input_pos_ctx));
|
||||||
}
|
}
|
||||||
profile.graph_kv_cache_pos_upload_us += (uint64_t) (ggml_time_us() - t_pos_upload_us);
|
llama_graph_compute_sched(lctx, lctx.dflash.kv.cache_sched, gf_kv, lctx.cparams.n_threads);
|
||||||
|
ggml_backend_sched_synchronize(lctx.dflash.kv.cache_sched);
|
||||||
|
|
||||||
const int64_t t_kv_cache_us = ggml_time_us();
|
lctx.dflash.kv.cache_n_filled = std::min(cross_ctx, lctx.dflash.kv.cache_n_filled + update_rows);
|
||||||
llama_dflash_kv_node_profiler kv_node_profiler;
|
lctx.dflash.kv.cache_write_pos = (lctx.dflash.kv.cache_write_pos + update_rows) % cross_ctx;
|
||||||
if (kv_node_timing) {
|
lctx.dflash.kv.cache_applied_window_version = lctx.dflash.target.version;
|
||||||
kv_node_profiler.profile = &profile;
|
lctx.dflash.kv.cache_valid = true;
|
||||||
ggml_backend_sched_set_eval_callback(lctx.dflash_sched, llama_dflash_kv_node_eval_callback, &kv_node_profiler);
|
lctx.dflash.kv.cache_view_n_filled = lctx.dflash.kv.cache_n_filled;
|
||||||
}
|
lctx.dflash.kv.cache_view_write_pos = lctx.dflash.kv.cache_write_pos;
|
||||||
llama_graph_compute_sched(lctx, lctx.dflash_sched, gf_kv, lctx.cparams.n_threads);
|
lctx.dflash.kv.cache_view_valid = true;
|
||||||
if (kv_node_timing) {
|
|
||||||
ggml_backend_sched_set_eval_callback(lctx.dflash_sched, nullptr, nullptr);
|
|
||||||
}
|
|
||||||
profile.graph_kv_cache_compute_us += (uint64_t) (ggml_time_us() - t_kv_cache_us);
|
|
||||||
|
|
||||||
const int64_t t_sync_us = ggml_time_us();
|
|
||||||
ggml_backend_sched_synchronize(lctx.dflash_sched);
|
|
||||||
profile.graph_kv_cache_sync_us += (uint64_t) (ggml_time_us() - t_sync_us);
|
|
||||||
profile.graph_kv_cache_calls++;
|
|
||||||
|
|
||||||
lctx.dflash_kv_cache_n_filled = std::min(cross_ctx, lctx.dflash_kv_cache_n_filled + update_rows);
|
|
||||||
lctx.dflash_kv_cache_write_pos = (lctx.dflash_kv_cache_write_pos + update_rows) % cross_ctx;
|
|
||||||
lctx.dflash_kv_cache_applied_window_version = lctx.dflash_target_window_version;
|
|
||||||
lctx.dflash_kv_cache_valid = true;
|
|
||||||
lctx.dflash_kv_cache_view_n_filled = lctx.dflash_kv_cache_n_filled;
|
|
||||||
lctx.dflash_kv_cache_view_write_pos = lctx.dflash_kv_cache_write_pos;
|
|
||||||
lctx.dflash_kv_cache_view_valid = true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (lctx.dflash_kv_cache_view_valid &&
|
if (lctx.dflash.kv.cache_view_valid &&
|
||||||
!lctx.dflash_k_ctx_workspace.empty() && !lctx.dflash_v_ctx_workspace.empty()) {
|
!lctx.dflash.kv.k_ctx_workspace.empty() && !lctx.dflash.kv.v_ctx_workspace.empty()) {
|
||||||
const bool need_workspace_refresh = !lctx.dflash_kv_workspace_valid ||
|
const bool need_workspace_refresh = !lctx.dflash.kv.workspace_valid ||
|
||||||
lctx.dflash_kv_workspace_n_filled != lctx.dflash_kv_cache_view_n_filled ||
|
lctx.dflash.kv.workspace_n_filled != lctx.dflash.kv.cache_view_n_filled ||
|
||||||
lctx.dflash_kv_workspace_write_pos != lctx.dflash_kv_cache_view_write_pos ||
|
lctx.dflash.kv.workspace_write_pos != lctx.dflash.kv.cache_view_write_pos ||
|
||||||
lctx.dflash_kv_workspace_applied_window_version != lctx.dflash_kv_cache_applied_window_version;
|
lctx.dflash.kv.workspace_applied_window_version != lctx.dflash.kv.cache_applied_window_version;
|
||||||
|
|
||||||
if (need_workspace_refresh) {
|
if (need_workspace_refresh) {
|
||||||
const size_t max_nodes = lctx.model.max_nodes((int) std::max<int32_t>(1, cross_ctx)) + 16 * lctx.model.hparams.n_layer;
|
const size_t max_nodes = lctx.model.max_nodes((int) std::max<int32_t>(1, cross_ctx)) + 16 * lctx.model.hparams.n_layer;
|
||||||
const size_t meta_size = ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false);
|
const size_t meta_size = ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false);
|
||||||
if (lctx.dflash_workspace_buf_compute_meta.size() != meta_size) {
|
if (lctx.dflash.kv.workspace_compute_meta.size() != meta_size) {
|
||||||
lctx.dflash_workspace_buf_compute_meta.resize(meta_size);
|
lctx.dflash.kv.workspace_compute_meta.resize(meta_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_cgraph * gf_workspace = nullptr;
|
ggml_cgraph * gf_workspace = nullptr;
|
||||||
const bool can_reuse_workspace_graph = lctx.dflash_kv_workspace_graph != nullptr &&
|
const bool can_reuse_workspace_graph = lctx.dflash.kv.workspace_graph != nullptr &&
|
||||||
lctx.dflash_kv_workspace_graph_rows == lctx.dflash_kv_cache_view_n_filled &&
|
lctx.dflash.kv.workspace_graph_rows == lctx.dflash.kv.cache_view_n_filled &&
|
||||||
lctx.dflash_kv_workspace_graph_write_pos == lctx.dflash_kv_cache_view_write_pos;
|
lctx.dflash.kv.workspace_graph_write_pos == lctx.dflash.kv.cache_view_write_pos;
|
||||||
|
|
||||||
if (can_reuse_workspace_graph) {
|
if (can_reuse_workspace_graph) {
|
||||||
gf_workspace = lctx.dflash_kv_workspace_graph;
|
gf_workspace = lctx.dflash.kv.workspace_graph;
|
||||||
} else {
|
} else {
|
||||||
const int64_t t_build_us = ggml_time_us();
|
|
||||||
gf_workspace = llm_build_context::llama_build_graph_dflash_kv_workspace(lctx);
|
gf_workspace = llm_build_context::llama_build_graph_dflash_kv_workspace(lctx);
|
||||||
profile.graph_kv_workspace_build_us += (uint64_t) (ggml_time_us() - t_build_us);
|
|
||||||
if (gf_workspace == nullptr) {
|
if (gf_workspace == nullptr) {
|
||||||
profile.graph_shape_failures++;
|
|
||||||
LLAMA_LOG_ERROR("%s: failed to build DFlash K/V workspace graph\n", __func__);
|
LLAMA_LOG_ERROR("%s: failed to build DFlash K/V workspace graph\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -733,95 +628,75 @@ bool llama_prepare_dflash_graph_inputs(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (lctx.dflash_workspace_sched == nullptr) {
|
if (lctx.dflash.kv.workspace_sched == nullptr) {
|
||||||
lctx.dflash_workspace_sched = ggml_backend_sched_new(lctx.backends.data(), backend_buft.data(), lctx.backends.size(), max_nodes, false);
|
lctx.dflash.kv.workspace_sched = ggml_backend_sched_new(lctx.backends.data(), backend_buft.data(), lctx.backends.size(), max_nodes, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (lctx.dflash_kv_workspace_reserved_rows != cross_ctx) {
|
if (lctx.dflash.kv.workspace_reserved_rows != cross_ctx) {
|
||||||
const bool saved_view_valid = lctx.dflash_kv_cache_view_valid;
|
const bool saved_view_valid = lctx.dflash.kv.cache_view_valid;
|
||||||
const int32_t saved_view_rows = lctx.dflash_kv_cache_view_n_filled;
|
const int32_t saved_view_rows = lctx.dflash.kv.cache_view_n_filled;
|
||||||
const int32_t saved_view_write_pos = lctx.dflash_kv_cache_view_write_pos;
|
const int32_t saved_view_write_pos = lctx.dflash.kv.cache_view_write_pos;
|
||||||
|
|
||||||
lctx.dflash_kv_cache_view_valid = true;
|
lctx.dflash.kv.cache_view_valid = true;
|
||||||
lctx.dflash_kv_cache_view_n_filled = cross_ctx;
|
lctx.dflash.kv.cache_view_n_filled = cross_ctx;
|
||||||
lctx.dflash_kv_cache_view_write_pos = cross_ctx > 1 ? 1 : 0;
|
lctx.dflash.kv.cache_view_write_pos = cross_ctx > 1 ? 1 : 0;
|
||||||
|
|
||||||
const int64_t t_reserve_build_us = ggml_time_us();
|
|
||||||
ggml_cgraph * gf_workspace_reserve = llm_build_context::llama_build_graph_dflash_kv_workspace(lctx);
|
ggml_cgraph * gf_workspace_reserve = llm_build_context::llama_build_graph_dflash_kv_workspace(lctx);
|
||||||
profile.graph_kv_workspace_build_us += (uint64_t) (ggml_time_us() - t_reserve_build_us);
|
|
||||||
|
|
||||||
lctx.dflash_kv_cache_view_valid = saved_view_valid;
|
lctx.dflash.kv.cache_view_valid = saved_view_valid;
|
||||||
lctx.dflash_kv_cache_view_n_filled = saved_view_rows;
|
lctx.dflash.kv.cache_view_n_filled = saved_view_rows;
|
||||||
lctx.dflash_kv_cache_view_write_pos = saved_view_write_pos;
|
lctx.dflash.kv.cache_view_write_pos = saved_view_write_pos;
|
||||||
|
|
||||||
const int64_t t_reserve_us = ggml_time_us();
|
const bool reserved = lctx.dflash.kv.workspace_sched != nullptr &&
|
||||||
const bool reserved = lctx.dflash_workspace_sched != nullptr &&
|
|
||||||
gf_workspace_reserve != nullptr &&
|
gf_workspace_reserve != nullptr &&
|
||||||
ggml_backend_sched_reserve(lctx.dflash_workspace_sched, gf_workspace_reserve);
|
ggml_backend_sched_reserve(lctx.dflash.kv.workspace_sched, gf_workspace_reserve);
|
||||||
profile.graph_kv_workspace_reserve_us += (uint64_t) (ggml_time_us() - t_reserve_us);
|
|
||||||
if (!reserved) {
|
if (!reserved) {
|
||||||
profile.graph_shape_failures++;
|
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize DFlash K/V workspace scheduler\n", __func__);
|
LLAMA_LOG_ERROR("%s: failed to initialize DFlash K/V workspace scheduler\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
lctx.dflash_kv_workspace_reserved_rows = cross_ctx;
|
lctx.dflash.kv.workspace_reserved_rows = cross_ctx;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t t_reset_us = ggml_time_us();
|
ggml_backend_sched_reset(lctx.dflash.kv.workspace_sched);
|
||||||
ggml_backend_sched_reset(lctx.dflash_workspace_sched);
|
ggml_backend_sched_alloc_graph(lctx.dflash.kv.workspace_sched, gf_workspace);
|
||||||
profile.graph_kv_workspace_reset_us += (uint64_t) (ggml_time_us() - t_reset_us);
|
|
||||||
|
|
||||||
const int64_t t_alloc_us = ggml_time_us();
|
lctx.dflash.kv.workspace_graph = gf_workspace;
|
||||||
ggml_backend_sched_alloc_graph(lctx.dflash_workspace_sched, gf_workspace);
|
lctx.dflash.kv.workspace_graph_rows = lctx.dflash.kv.cache_view_n_filled;
|
||||||
profile.graph_kv_workspace_alloc_us += (uint64_t) (ggml_time_us() - t_alloc_us);
|
lctx.dflash.kv.workspace_graph_write_pos = lctx.dflash.kv.cache_view_write_pos;
|
||||||
|
|
||||||
lctx.dflash_kv_workspace_graph = gf_workspace;
|
|
||||||
lctx.dflash_kv_workspace_graph_rows = lctx.dflash_kv_cache_view_n_filled;
|
|
||||||
lctx.dflash_kv_workspace_graph_write_pos = lctx.dflash_kv_cache_view_write_pos;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t t_workspace_us = ggml_time_us();
|
llama_graph_compute_sched(lctx, lctx.dflash.kv.workspace_sched, gf_workspace, lctx.cparams.n_threads);
|
||||||
llama_graph_compute_sched(lctx, lctx.dflash_workspace_sched, gf_workspace, lctx.cparams.n_threads);
|
lctx.dflash.kv.workspace_sync_pending = true;
|
||||||
profile.graph_kv_workspace_compute_us += (uint64_t) (ggml_time_us() - t_workspace_us);
|
|
||||||
lctx.dflash_kv_workspace_sync_pending = true;
|
|
||||||
profile.graph_kv_workspace_calls++;
|
|
||||||
|
|
||||||
lctx.dflash_kv_workspace_n_filled = lctx.dflash_kv_cache_view_n_filled;
|
lctx.dflash.kv.workspace_n_filled = lctx.dflash.kv.cache_view_n_filled;
|
||||||
lctx.dflash_kv_workspace_write_pos = lctx.dflash_kv_cache_view_write_pos;
|
lctx.dflash.kv.workspace_write_pos = lctx.dflash.kv.cache_view_write_pos;
|
||||||
lctx.dflash_kv_workspace_applied_window_version = lctx.dflash_kv_cache_applied_window_version;
|
lctx.dflash.kv.workspace_applied_window_version = lctx.dflash.kv.cache_applied_window_version;
|
||||||
lctx.dflash_kv_workspace_valid = true;
|
lctx.dflash.kv.workspace_valid = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t t_mask_us = ggml_time_us();
|
|
||||||
const int32_t full_visible_first = left_pad;
|
const int32_t full_visible_first = left_pad;
|
||||||
const int32_t full_visible_last = cross_ctx + (int32_t) n_tokens - 1;
|
const int32_t full_visible_last = cross_ctx + (int32_t) n_tokens - 1;
|
||||||
lctx.dflash_kq_mask_data.assign((size_t) n_kv_total * (size_t) n_mask_tokens, -INFINITY);
|
lctx.dflash.target.kq_mask_data.assign((size_t) n_kv_total * (size_t) n_mask_tokens, -INFINITY);
|
||||||
int32_t visible_kv_max = 0;
|
|
||||||
for (uint32_t j = 0; j < n_tokens; ++j) {
|
for (uint32_t j = 0; j < n_tokens; ++j) {
|
||||||
float * row = lctx.dflash_kq_mask_data.data() + (size_t) j * (size_t) n_kv_total;
|
float * row = lctx.dflash.target.kq_mask_data.data() + (size_t) j * (size_t) n_kv_total;
|
||||||
const int32_t visible_kv = cross_ctx + (int32_t) n_tokens;
|
|
||||||
visible_kv_max = std::max(visible_kv_max, visible_kv);
|
|
||||||
profile.graph_visible_kv_sum += (uint64_t) visible_kv;
|
|
||||||
for (int32_t i = full_visible_first; i <= full_visible_last; ++i) {
|
for (int32_t i = full_visible_first; i <= full_visible_last; ++i) {
|
||||||
row[i] = 0.0f;
|
row[i] = 0.0f;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ggml_backend_tensor_set(kq_mask, lctx.dflash_kq_mask_data.data(), 0, ggml_nbytes(kq_mask));
|
ggml_backend_tensor_set(kq_mask, lctx.dflash.target.kq_mask_data.data(), 0, ggml_nbytes(kq_mask));
|
||||||
profile.graph_mask_build_us += (uint64_t) (ggml_time_us() - t_mask_us);
|
|
||||||
profile.graph_mask_bytes += ggml_nbytes(kq_mask);
|
|
||||||
|
|
||||||
if (kq_mask_swa != nullptr) {
|
if (kq_mask_swa != nullptr) {
|
||||||
lctx.dflash_kq_mask_swa_data.assign((size_t) n_kv_total * (size_t) n_mask_tokens, -INFINITY);
|
lctx.dflash.target.kq_mask_swa_data.assign((size_t) n_kv_total * (size_t) n_mask_tokens, -INFINITY);
|
||||||
const int32_t swa_window = (int32_t) lctx.model.hparams.n_swa;
|
const int32_t swa_window = (int32_t) lctx.model.hparams.n_swa;
|
||||||
const int32_t draft_pos_base = (int32_t) profile.last_pos_last;
|
const int32_t draft_pos_base = (int32_t) last_target_pos;
|
||||||
for (uint32_t j = 0; j < n_tokens; ++j) {
|
for (uint32_t j = 0; j < n_tokens; ++j) {
|
||||||
float * row = lctx.dflash_kq_mask_swa_data.data() + (size_t) j * (size_t) n_kv_total;
|
float * row = lctx.dflash.target.kq_mask_swa_data.data() + (size_t) j * (size_t) n_kv_total;
|
||||||
const int32_t q_pos = draft_pos_base + (int32_t) j;
|
const int32_t q_pos = draft_pos_base + (int32_t) j;
|
||||||
|
|
||||||
for (int32_t k = left_pad; k < cross_ctx; ++k) {
|
for (int32_t k = left_pad; k < cross_ctx; ++k) {
|
||||||
const int32_t k_pos = (int32_t) lctx.dflash_pos_ctx_data[(size_t) k];
|
const int32_t k_pos = (int32_t) lctx.dflash.target.pos_ctx_data[(size_t) k];
|
||||||
if (q_pos - k_pos < swa_window) {
|
if (q_pos - k_pos < swa_window) {
|
||||||
row[k] = 0.0f;
|
row[k] = 0.0f;
|
||||||
}
|
}
|
||||||
@ -835,26 +710,7 @@ bool llama_prepare_dflash_graph_inputs(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_tensor_set(kq_mask_swa, lctx.dflash_kq_mask_swa_data.data(), 0, ggml_nbytes(kq_mask_swa));
|
ggml_backend_tensor_set(kq_mask_swa, lctx.dflash.target.kq_mask_swa_data.data(), 0, ggml_nbytes(kq_mask_swa));
|
||||||
profile.graph_mask_bytes += ggml_nbytes(kq_mask_swa);
|
|
||||||
}
|
|
||||||
|
|
||||||
profile.graph_visible_kv_max = std::max<uint64_t>(profile.graph_visible_kv_max, (uint64_t) visible_kv_max);
|
|
||||||
profile.graph_prepare_total_us += (uint64_t) (ggml_time_us() - t_total_us);
|
|
||||||
|
|
||||||
if (profile.graph_prepare_calls == 1 && llama_dflash_stats_log_enabled()) {
|
|
||||||
int32_t n_swa_layers = 0;
|
|
||||||
for (int32_t il = 0; il < lctx.model.hparams.n_layer; ++il) {
|
|
||||||
n_swa_layers += lctx.model.hparams.swa_layers[(size_t) il] ? 1 : 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
LLAMA_LOG_INFO("%s: DFlash graph contract rows=%d width=%d cross_ctx=%d n_tokens=%u left_pad=%d n_kv_total=%d draft_n_ctx=%u pos=%s [%d..%d] full_mask=[%d..%d] swa_window=%u swa_layers=%d\n",
|
|
||||||
__func__, n_rows, width, cross_ctx, n_tokens, left_pad, n_kv_total, lctx.cparams.n_ctx,
|
|
||||||
(src_pos != nullptr && total_positions == (size_t) n_rows) ? "target" : "synthetic",
|
|
||||||
(int) profile.last_pos_first, (int) profile.last_pos_last,
|
|
||||||
full_visible_first, full_visible_last,
|
|
||||||
lctx.model.hparams.n_swa,
|
|
||||||
n_swa_layers);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
|||||||
@ -1,55 +1,13 @@
|
|||||||
#include "llama-spec-features.h"
|
#include "llama-spec-features.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <atomic>
|
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <random>
|
#include <random>
|
||||||
#include <sstream>
|
|
||||||
|
|
||||||
#include "llama-model.h"
|
#include "llama-model.h"
|
||||||
#include "llama-context.h"
|
#include "llama-context.h"
|
||||||
|
|
||||||
static bool llama_dflash_stats_log_enabled() {
|
|
||||||
const char * env = std::getenv("IK_DFLASH_STATS_LOG");
|
|
||||||
return env != nullptr && *env != '\0' &&
|
|
||||||
std::strcmp(env, "0") != 0 &&
|
|
||||||
std::strcmp(env, "false") != 0 &&
|
|
||||||
std::strcmp(env, "off") != 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool llama_dflash_positions_strictly_increasing(
|
|
||||||
const llama_pos * positions,
|
|
||||||
int32_t n_rows,
|
|
||||||
llama_pos & first_pos,
|
|
||||||
llama_pos & last_pos) {
|
|
||||||
first_pos = -1;
|
|
||||||
last_pos = -1;
|
|
||||||
|
|
||||||
if (positions == nullptr || n_rows <= 0) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
first_pos = positions[0];
|
|
||||||
last_pos = positions[n_rows - 1];
|
|
||||||
|
|
||||||
for (int32_t i = 1; i < n_rows; ++i) {
|
|
||||||
if (positions[i] <= positions[i - 1]) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
void llama_dflash_profile_reset(struct llama_context * ctx) {
|
|
||||||
if (ctx == nullptr) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
ctx->dflash.profile = {};
|
|
||||||
}
|
|
||||||
|
|
||||||
void llama_reset_dflash_kv_cache_state(struct llama_context * ctx) {
|
void llama_reset_dflash_kv_cache_state(struct llama_context * ctx) {
|
||||||
if (ctx == nullptr) {
|
if (ctx == nullptr) {
|
||||||
return;
|
return;
|
||||||
@ -120,17 +78,6 @@ int32_t llama_get_dflash_visible_cross_ctx(
|
|||||||
return ctx != nullptr ? ctx->dflash.visible_cross_ctx : 0;
|
return ctx != nullptr ? ctx->dflash.visible_cross_ctx : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool llama_dflash_profile_get_stats(
|
|
||||||
const struct llama_context * ctx,
|
|
||||||
llama_dflash_profile_stats * stats) {
|
|
||||||
if (ctx == nullptr || stats == nullptr) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
*stats = ctx->dflash.profile;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t llama_model_dflash_block_size(const struct llama_model * model) {
|
int32_t llama_model_dflash_block_size(const struct llama_model * model) {
|
||||||
return model ? (int32_t) model->hparams.dflash_block_size : 0;
|
return model ? (int32_t) model->hparams.dflash_block_size : 0;
|
||||||
}
|
}
|
||||||
@ -188,48 +135,6 @@ const struct ggml_tensor * llama_model_dflash_output_tensor(
|
|||||||
return model->tok_embd;
|
return model->tok_embd;
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char * llama_dflash_io_mode_name(int32_t io_mode) {
|
|
||||||
switch (io_mode) {
|
|
||||||
case LLAMA_DFLASH_IO_MODE_SHARED:
|
|
||||||
return "shared";
|
|
||||||
case LLAMA_DFLASH_IO_MODE_SELF_CONTAINED:
|
|
||||||
return "self-contained";
|
|
||||||
case LLAMA_DFLASH_IO_MODE_MIXED:
|
|
||||||
return "mixed";
|
|
||||||
default:
|
|
||||||
return "invalid";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static const char * llama_dflash_output_head_kind(
|
|
||||||
const struct llama_model * draft_model,
|
|
||||||
const struct llama_model * target_model) {
|
|
||||||
const struct ggml_tensor * output = llama_model_dflash_output_tensor(draft_model);
|
|
||||||
if (output == nullptr) {
|
|
||||||
return "missing";
|
|
||||||
}
|
|
||||||
|
|
||||||
if (output == draft_model->tok_embd) {
|
|
||||||
return draft_model->tok_embd == (target_model ? target_model->tok_embd : nullptr)
|
|
||||||
? "shared_token_embedding"
|
|
||||||
: "token_embedding";
|
|
||||||
}
|
|
||||||
|
|
||||||
if (draft_model->output_mtp != nullptr && output == draft_model->output_mtp) {
|
|
||||||
if (target_model != nullptr && target_model->output_mtp != nullptr && output == target_model->output_mtp) {
|
|
||||||
return "output_mtp";
|
|
||||||
}
|
|
||||||
|
|
||||||
if (std::strcmp(output->name, "output_extra.weight") == 0) {
|
|
||||||
return "output_extra";
|
|
||||||
}
|
|
||||||
|
|
||||||
return "output_mtp";
|
|
||||||
}
|
|
||||||
|
|
||||||
return "output";
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t llama_model_dflash_io_mode(
|
int32_t llama_model_dflash_io_mode(
|
||||||
const struct llama_model * draft_model,
|
const struct llama_model * draft_model,
|
||||||
const struct llama_model * target_model) {
|
const struct llama_model * target_model) {
|
||||||
@ -302,19 +207,6 @@ bool llama_model_share_dflash_io_tensors(
|
|||||||
}
|
}
|
||||||
|
|
||||||
const struct ggml_tensor * output = llama_model_dflash_output_tensor(draft_model);
|
const struct ggml_tensor * output = llama_model_dflash_output_tensor(draft_model);
|
||||||
if (draft_model->tok_embd != nullptr && output != nullptr) {
|
|
||||||
LLAMA_LOG_INFO("%s: DFlash ready io=%s output_head=%s\n",
|
|
||||||
__func__,
|
|
||||||
llama_dflash_io_mode_name(llama_model_dflash_io_mode(draft_model, target_model)),
|
|
||||||
llama_dflash_output_head_kind(draft_model, target_model));
|
|
||||||
if (llama_dflash_stats_log_enabled()) {
|
|
||||||
LLAMA_LOG_INFO("%s: DFlash IO tensor=%s type=%s\n",
|
|
||||||
__func__,
|
|
||||||
output->name[0] != '\0' ? output->name : "(unnamed)",
|
|
||||||
ggml_type_name(output->type));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return draft_model->tok_embd != nullptr && output != nullptr;
|
return draft_model->tok_embd != nullptr && output != nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -336,14 +228,6 @@ static bool llama_set_dflash_target_features_impl(
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto & profile = ctx->dflash.profile;
|
|
||||||
const int64_t t_start_us = ggml_time_us();
|
|
||||||
const int32_t row_width = have_full_features
|
|
||||||
? (n_rows > 0 ? (int32_t) (n_floats / (size_t) n_rows) : 0)
|
|
||||||
: (window_update->append_rows > 0 ? (int32_t) (window_update->append_floats / (size_t) window_update->append_rows) : 0);
|
|
||||||
llama_pos first_pos = -1;
|
|
||||||
llama_pos last_pos = -1;
|
|
||||||
|
|
||||||
if (have_full_features && copy_data) {
|
if (have_full_features && copy_data) {
|
||||||
ctx->dflash.target.features_owned.assign(target_features, target_features + n_floats);
|
ctx->dflash.target.features_owned.assign(target_features, target_features + n_floats);
|
||||||
ctx->dflash.target.features = ctx->dflash.target.features_owned.data();
|
ctx->dflash.target.features = ctx->dflash.target.features_owned.data();
|
||||||
@ -424,28 +308,6 @@ static bool llama_set_dflash_target_features_impl(
|
|||||||
ctx->dflash.target.positions_n = 0;
|
ctx->dflash.target.positions_n = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
profile.set_target_copy_calls++;
|
|
||||||
profile.set_target_copy_us += (uint64_t) (ggml_time_us() - t_start_us);
|
|
||||||
profile.set_target_rows += (uint64_t) n_rows;
|
|
||||||
profile.set_target_copy_bytes +=
|
|
||||||
(have_full_features ? n_floats : 0) * sizeof(float) +
|
|
||||||
(have_append_features ? window_update->append_floats : 0) * sizeof(float) +
|
|
||||||
(target_positions ? (size_t) n_rows * sizeof(llama_pos) : 0);
|
|
||||||
profile.last_n_rows = n_rows;
|
|
||||||
profile.last_width = row_width;
|
|
||||||
|
|
||||||
if (target_positions == nullptr) {
|
|
||||||
profile.set_target_missing_positions++;
|
|
||||||
profile.last_pos_first = -1;
|
|
||||||
profile.last_pos_last = -1;
|
|
||||||
} else {
|
|
||||||
if (!llama_dflash_positions_strictly_increasing(target_positions, n_rows, first_pos, last_pos)) {
|
|
||||||
profile.set_target_non_monotonic_positions++;
|
|
||||||
}
|
|
||||||
profile.last_pos_first = first_pos;
|
|
||||||
profile.last_pos_last = last_pos;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -469,35 +331,6 @@ bool llama_set_dflash_target_features_view(
|
|||||||
return llama_set_dflash_target_features_impl(ctx, target_features, n_floats, n_rows, target_positions, false, window_update);
|
return llama_set_dflash_target_features_impl(ctx, target_features, n_floats, n_rows, target_positions, false, window_update);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void llama_record_dflash_capture_phase(
|
|
||||||
struct llama_context * ctx,
|
|
||||||
bool is_prompt_warmup,
|
|
||||||
int32_t row_count,
|
|
||||||
int32_t row_width) {
|
|
||||||
if (ctx == nullptr || row_count <= 0 || row_width <= 0) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto & profile = ctx->dflash.profile;
|
|
||||||
if (is_prompt_warmup) {
|
|
||||||
profile.capture_prompt_batches++;
|
|
||||||
if (profile.capture_prompt_last_rows > 0 && profile.capture_prompt_last_width > 0 &&
|
|
||||||
(profile.capture_prompt_last_rows != row_count || profile.capture_prompt_last_width != row_width)) {
|
|
||||||
profile.capture_prompt_shape_changes++;
|
|
||||||
}
|
|
||||||
profile.capture_prompt_last_rows = row_count;
|
|
||||||
profile.capture_prompt_last_width = row_width;
|
|
||||||
} else {
|
|
||||||
profile.capture_verify_batches++;
|
|
||||||
if (profile.capture_verify_last_rows > 0 && profile.capture_verify_last_width > 0 &&
|
|
||||||
(profile.capture_verify_last_rows != row_count || profile.capture_verify_last_width != row_width)) {
|
|
||||||
profile.capture_verify_shape_changes++;
|
|
||||||
}
|
|
||||||
profile.capture_verify_last_rows = row_count;
|
|
||||||
profile.capture_verify_last_width = row_width;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool llama_dflash_parse_layer_id(const struct ggml_tensor * tensor, int32_t & layer_id) {
|
static bool llama_dflash_parse_layer_id(const struct ggml_tensor * tensor, int32_t & layer_id) {
|
||||||
if (tensor == nullptr) {
|
if (tensor == nullptr) {
|
||||||
return false;
|
return false;
|
||||||
@ -644,9 +477,8 @@ void llama_finish_dflash_capture_batch(
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
GGML_UNUSED(is_prompt_warmup);
|
||||||
auto & capture = *ctx->dflash.capture;
|
auto & capture = *ctx->dflash.capture;
|
||||||
llama_record_dflash_capture_phase(ctx, is_prompt_warmup, capture.row_count, capture.row_width);
|
|
||||||
|
|
||||||
// Reset the batch-local reference shape so the next decode only compares layers within
|
// Reset the batch-local reference shape so the next decode only compares layers within
|
||||||
// the same batch, not against the previous prompt/verify batch.
|
// the same batch, not against the previous prompt/verify batch.
|
||||||
capture.row_count = 0;
|
capture.row_count = 0;
|
||||||
@ -662,59 +494,42 @@ static bool llama_spec_prepare_dflash_capture(
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto & profile = ctx->dflash.profile;
|
|
||||||
profile.capture_prepare_calls++;
|
|
||||||
const int64_t t_sync_us = ggml_time_us();
|
|
||||||
llama_synchronize(ctx);
|
llama_synchronize(ctx);
|
||||||
profile.capture_prepare_sync_us += (uint64_t) (ggml_time_us() - t_sync_us);
|
|
||||||
|
|
||||||
auto & capture = *ctx->dflash.capture;
|
auto & capture = *ctx->dflash.capture;
|
||||||
row_count = capture.row_count;
|
row_count = capture.row_count;
|
||||||
row_width = capture.row_width;
|
row_width = capture.row_width;
|
||||||
n_layers = (int32_t) capture.layer_ids.size();
|
n_layers = (int32_t) capture.layer_ids.size();
|
||||||
if (row_count <= 0 || row_width <= 0 || n_layers <= 0 || capture.layer_rows.size() != (size_t) n_layers) {
|
if (row_count <= 0 || row_width <= 0 || n_layers <= 0 || capture.layer_rows.size() != (size_t) n_layers) {
|
||||||
profile.capture_prepare_failures++;
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (capture.capture_batch_id == 0 || capture.layer_seen_batch_id.size() != (size_t) n_layers) {
|
if (capture.capture_batch_id == 0 || capture.layer_seen_batch_id.size() != (size_t) n_layers) {
|
||||||
profile.capture_prepare_failures++;
|
LLAMA_LOG_WARN("%s: DFlash capture batch markers are not initialized (batch_id=%llu layers=%zu expected=%d)\n",
|
||||||
profile.capture_layer_batch_mismatch++;
|
__func__,
|
||||||
if (profile.capture_layer_batch_mismatch <= 3) {
|
(unsigned long long) capture.capture_batch_id,
|
||||||
LLAMA_LOG_WARN("%s: DFlash capture batch markers are not initialized (batch_id=%llu layers=%zu expected=%d)\n",
|
capture.layer_seen_batch_id.size(),
|
||||||
__func__,
|
n_layers);
|
||||||
(unsigned long long) capture.capture_batch_id,
|
|
||||||
capture.layer_seen_batch_id.size(),
|
|
||||||
n_layers);
|
|
||||||
}
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int32_t layer_idx = 0; layer_idx < n_layers; ++layer_idx) {
|
for (int32_t layer_idx = 0; layer_idx < n_layers; ++layer_idx) {
|
||||||
if (capture.layer_seen_batch_id[(size_t) layer_idx] != capture.capture_batch_id) {
|
if (capture.layer_seen_batch_id[(size_t) layer_idx] != capture.capture_batch_id) {
|
||||||
profile.capture_prepare_failures++;
|
LLAMA_LOG_WARN("%s: DFlash capture is stale for layer %d (seen_batch=%llu current_batch=%llu rows=%d width=%d)\n",
|
||||||
profile.capture_layer_batch_mismatch++;
|
__func__,
|
||||||
if (profile.capture_layer_batch_mismatch <= 3) {
|
capture.layer_ids[(size_t) layer_idx],
|
||||||
LLAMA_LOG_WARN("%s: DFlash capture is stale for layer %d (seen_batch=%llu current_batch=%llu rows=%d width=%d)\n",
|
(unsigned long long) capture.layer_seen_batch_id[(size_t) layer_idx],
|
||||||
__func__,
|
(unsigned long long) capture.capture_batch_id,
|
||||||
capture.layer_ids[(size_t) layer_idx],
|
row_count,
|
||||||
(unsigned long long) capture.layer_seen_batch_id[(size_t) layer_idx],
|
row_width);
|
||||||
(unsigned long long) capture.capture_batch_id,
|
|
||||||
row_count,
|
|
||||||
row_width);
|
|
||||||
}
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto & rows = capture.layer_rows[(size_t) layer_idx];
|
const auto & rows = capture.layer_rows[(size_t) layer_idx];
|
||||||
if (rows.size() != (size_t) row_count * (size_t) row_width) {
|
if (rows.size() != (size_t) row_count * (size_t) row_width) {
|
||||||
profile.capture_prepare_failures++;
|
LLAMA_LOG_WARN("%s: DFlash capture rows mismatch for layer %d: got=%zu expected=%zu (rows=%d width=%d)\n",
|
||||||
profile.capture_layer_shape_mismatch++;
|
__func__, capture.layer_ids[(size_t) layer_idx], rows.size(),
|
||||||
if (profile.capture_layer_shape_mismatch <= 3) {
|
(size_t) row_count * (size_t) row_width, row_count, row_width);
|
||||||
LLAMA_LOG_WARN("%s: DFlash capture rows mismatch for layer %d: got=%zu expected=%zu (rows=%d width=%d)\n",
|
|
||||||
__func__, capture.layer_ids[(size_t) layer_idx], rows.size(),
|
|
||||||
(size_t) row_count * (size_t) row_width, row_count, row_width);
|
|
||||||
}
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -722,194 +537,6 @@ static bool llama_spec_prepare_dflash_capture(
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reports whether DFlash "contract" logging was requested through the
// IK_DFLASH_CONTRACT_LOG environment variable: enabled for any non-empty value
// other than "0", "false" or "off" (case-sensitive). The environment is read
// on every call.
static bool llama_dflash_contract_log_enabled() {
    const char * value = std::getenv("IK_DFLASH_CONTRACT_LOG");
    const bool is_set = value != nullptr && value[0] != '\0';

    // Short-circuiting on is_set keeps strcmp away from a null pointer.
    return is_set &&
        std::strcmp(value, "0") != 0 &&
        std::strcmp(value, "false") != 0 &&
        std::strcmp(value, "off") != 0;
}
|
|
||||||
|
|
||||||
// Formats a vector as a compact bracketed list for log output, e.g. "[1,2,3]".
//
// When the vector holds more than 2 * edge_count values, only the first and
// last edge_count values are printed with an ellipsis between them:
// "[1,2,3,4,...,6,7,8,9]". Shorter vectors are printed in full and an empty
// vector yields "[]".
//
// edge_count == 0 elides every value of a non-empty vector and yields "[...]"
// (previously this produced the malformed "[,...,]").
template <typename T>
static std::string llama_dflash_contract_format_values(
        const std::vector<T> & values,
        size_t edge_count = 4) {
    const size_t n_values = values.size();

    // Same meaning as n_values > 2 * edge_count, but written without the
    // multiplication so a huge edge_count cannot wrap around and trigger a
    // bogus elision (and an unsigned underflow when locating the tail).
    const bool elide = edge_count < n_values && edge_count < n_values - edge_count;

    std::ostringstream oss;
    oss << '[';

    // Appends values[first .. last) separated by commas.
    const auto append_range = [&oss, &values](size_t first, size_t last) {
        for (size_t i = first; i < last; ++i) {
            if (i > first) {
                oss << ',';
            }
            oss << values[i];
        }
    };

    if (!elide) {
        append_range(0, n_values);
    } else if (edge_count == 0) {
        // Nothing to show on either side of the ellipsis.
        oss << "...";
    } else {
        append_range(0, edge_count);
        oss << ",...,";
        append_range(n_values - edge_count, n_values);
    }

    oss << ']';
    return oss.str();
}
|
|
||||||
|
|
||||||
static std::vector<llama_pos> llama_dflash_contract_collect_batch_positions(
|
|
||||||
const llama_batch & batch,
|
|
||||||
const std::vector<int32_t> & batch_indices) {
|
|
||||||
std::vector<llama_pos> positions;
|
|
||||||
positions.reserve(batch_indices.size());
|
|
||||||
for (int32_t batch_index : batch_indices) {
|
|
||||||
positions.push_back(batch.pos[batch_index]);
|
|
||||||
}
|
|
||||||
return positions;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void llama_dflash_contract_summarize_positions(
|
|
||||||
const std::vector<llama_pos> & positions,
|
|
||||||
llama_pos & first_pos,
|
|
||||||
llama_pos & last_pos,
|
|
||||||
int32_t & gap_count,
|
|
||||||
int32_t & nonmono_count) {
|
|
||||||
first_pos = -1;
|
|
||||||
last_pos = -1;
|
|
||||||
gap_count = 0;
|
|
||||||
nonmono_count = 0;
|
|
||||||
if (positions.empty()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
first_pos = positions.front();
|
|
||||||
last_pos = positions.back();
|
|
||||||
for (size_t i = 1; i < positions.size(); ++i) {
|
|
||||||
if (positions[i] <= positions[i - 1]) {
|
|
||||||
nonmono_count++;
|
|
||||||
} else if (positions[i] != positions[i - 1] + 1) {
|
|
||||||
gap_count++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void llama_dflash_contract_log_feature_view(
|
|
||||||
const char * kind,
|
|
||||||
llama_seq_id seq_id,
|
|
||||||
const llama_batch & batch,
|
|
||||||
int32_t row_count,
|
|
||||||
int32_t row_width,
|
|
||||||
int32_t n_layers,
|
|
||||||
int32_t batch_row_offset,
|
|
||||||
const std::vector<int32_t> & row_indices,
|
|
||||||
const std::vector<int32_t> & batch_indices) {
|
|
||||||
if (!llama_dflash_contract_log_enabled()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::atomic<uint64_t> counter = 0;
|
|
||||||
const uint64_t ordinal = counter.fetch_add(1, std::memory_order_relaxed);
|
|
||||||
if (ordinal >= 8) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::vector<llama_pos> positions = llama_dflash_contract_collect_batch_positions(batch, batch_indices);
|
|
||||||
llama_pos first_pos = -1;
|
|
||||||
llama_pos last_pos = -1;
|
|
||||||
int32_t gap_count = 0;
|
|
||||||
int32_t nonmono_count = 0;
|
|
||||||
llama_dflash_contract_summarize_positions(positions, first_pos, last_pos, gap_count, nonmono_count);
|
|
||||||
|
|
||||||
LLAMA_LOG_INFO("%s[%llu]: kind=%s seq=%d batch_tokens=%d capture_rows=%d row_width=%d layers=%d batch_row_offset=%d row_indices=%s batch_indices=%s batch_pos=%s pos=[%d..%d] gaps=%d nonmono=%d\n",
|
|
||||||
__func__,
|
|
||||||
(unsigned long long) (ordinal + 1),
|
|
||||||
kind,
|
|
||||||
(int) seq_id,
|
|
||||||
batch.n_tokens,
|
|
||||||
row_count,
|
|
||||||
row_width,
|
|
||||||
n_layers,
|
|
||||||
batch_row_offset,
|
|
||||||
llama_dflash_contract_format_values(row_indices).c_str(),
|
|
||||||
llama_dflash_contract_format_values(batch_indices).c_str(),
|
|
||||||
llama_dflash_contract_format_values(positions).c_str(),
|
|
||||||
(int) first_pos,
|
|
||||||
(int) last_pos,
|
|
||||||
gap_count,
|
|
||||||
nonmono_count);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void llama_dflash_contract_log_output_indices(
|
|
||||||
struct llama_context * ctx,
|
|
||||||
const std::vector<int32_t> & output_indices) {
|
|
||||||
if (!llama_dflash_contract_log_enabled()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::atomic<uint64_t> counter = 0;
|
|
||||||
const uint64_t ordinal = counter.fetch_add(1, std::memory_order_relaxed);
|
|
||||||
if (ordinal >= 8) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t row_count = 0;
|
|
||||||
int32_t row_width = 0;
|
|
||||||
int32_t n_layers = 0;
|
|
||||||
const bool have_capture = llama_spec_prepare_dflash_capture(ctx, row_count, row_width, n_layers);
|
|
||||||
|
|
||||||
LLAMA_LOG_INFO("%s[%llu]: output_indices=%s capture_rows=%d row_width=%d layers=%d have_capture=%s\n",
|
|
||||||
__func__,
|
|
||||||
(unsigned long long) (ordinal + 1),
|
|
||||||
llama_dflash_contract_format_values(output_indices).c_str(),
|
|
||||||
row_count,
|
|
||||||
row_width,
|
|
||||||
n_layers,
|
|
||||||
have_capture ? "true" : "false");
|
|
||||||
}
|
|
||||||
|
|
||||||
void llama_dflash_contract_log_accept(
|
|
||||||
int slot_id,
|
|
||||||
bool is_dflash,
|
|
||||||
const char * path,
|
|
||||||
bool any_rejected,
|
|
||||||
size_t n_draft,
|
|
||||||
size_t n_accepted,
|
|
||||||
llama_pos pos_base,
|
|
||||||
const std::vector<int32_t> & output_indices) {
|
|
||||||
if (!llama_dflash_contract_log_enabled() || !is_dflash) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::atomic<uint64_t> counter = 0;
|
|
||||||
const uint64_t ordinal = counter.fetch_add(1, std::memory_order_relaxed);
|
|
||||||
if (ordinal >= 8) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
LLAMA_LOG_INFO("dflash contract accept[%llu]: slot=%d path=%s rejected=%s drafted=%zu accepted=%zu pos_base=%d output_indices=%s\n",
|
|
||||||
(unsigned long long) (ordinal + 1),
|
|
||||||
slot_id,
|
|
||||||
path,
|
|
||||||
any_rejected ? "true" : "false",
|
|
||||||
n_draft,
|
|
||||||
n_accepted,
|
|
||||||
(int) pos_base,
|
|
||||||
llama_dflash_contract_format_values(output_indices).c_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool llama_spec_materialize_dflash_rows_prepared(
|
static bool llama_spec_materialize_dflash_rows_prepared(
|
||||||
struct llama_context * ctx,
|
struct llama_context * ctx,
|
||||||
int32_t row_count,
|
int32_t row_count,
|
||||||
@ -928,9 +555,6 @@ static bool llama_spec_materialize_dflash_rows(
|
|||||||
int32_t row_width = 0;
|
int32_t row_width = 0;
|
||||||
int32_t n_layers = 0;
|
int32_t n_layers = 0;
|
||||||
if (!llama_spec_prepare_dflash_capture(ctx, row_count, row_width, n_layers)) {
|
if (!llama_spec_prepare_dflash_capture(ctx, row_count, row_width, n_layers)) {
|
||||||
if (ctx != nullptr) {
|
|
||||||
ctx->dflash.profile.capture_materialize_failures++;
|
|
||||||
}
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -951,12 +575,7 @@ static bool llama_spec_materialize_dflash_rows_prepared(
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto & profile = ctx->dflash.profile;
|
|
||||||
profile.capture_materialize_calls++;
|
|
||||||
const int64_t t_start_us = ggml_time_us();
|
|
||||||
|
|
||||||
if (row_count <= 0 || row_width <= 0 || n_layers <= 0 || ctx->dflash.capture == nullptr) {
|
if (row_count <= 0 || row_width <= 0 || n_layers <= 0 || ctx->dflash.capture == nullptr) {
|
||||||
profile.capture_materialize_failures++;
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -972,7 +591,6 @@ static bool llama_spec_materialize_dflash_rows_prepared(
|
|||||||
if (row_index < 0 || row_index >= row_count) {
|
if (row_index < 0 || row_index >= row_count) {
|
||||||
rows_out.clear();
|
rows_out.clear();
|
||||||
combined_width = 0;
|
combined_width = 0;
|
||||||
profile.capture_materialize_failures++;
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -983,10 +601,6 @@ static bool llama_spec_materialize_dflash_rows_prepared(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
profile.capture_materialize_us += (uint64_t) (ggml_time_us() - t_start_us);
|
|
||||||
profile.capture_materialize_rows += (uint64_t) row_indices.size();
|
|
||||||
profile.capture_materialize_bytes += rows_out.size() * sizeof(float);
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1040,17 +654,6 @@ bool llama_spec_get_dflash_feature_view(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_dflash_contract_log_feature_view(
|
|
||||||
"batch",
|
|
||||||
view.rows.empty() ? -1 : view.rows.front().seq_id,
|
|
||||||
batch,
|
|
||||||
row_count,
|
|
||||||
row_width,
|
|
||||||
n_layers,
|
|
||||||
batch_row_offset,
|
|
||||||
row_indices,
|
|
||||||
batch_indices);
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1109,17 +712,6 @@ bool llama_spec_get_dflash_feature_view_for_seq(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_dflash_contract_log_feature_view(
|
|
||||||
"seq",
|
|
||||||
seq_id,
|
|
||||||
batch,
|
|
||||||
row_count,
|
|
||||||
row_width,
|
|
||||||
n_layers,
|
|
||||||
batch_row_offset,
|
|
||||||
row_indices,
|
|
||||||
batch_indices);
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1133,7 +725,5 @@ bool llama_spec_copy_dflash_rows_from_output_indices(
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_dflash_contract_log_output_indices(ctx, output_indices);
|
|
||||||
|
|
||||||
return hidden_rows.size() == (size_t) output_indices.size() * (size_t) combined_width;
|
return hidden_rows.size() == (size_t) output_indices.size() * (size_t) combined_width;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -11,147 +11,6 @@ struct llama_model;
|
|||||||
struct ggml_tensor;
|
struct ggml_tensor;
|
||||||
struct llama_spec_feature_view;
|
struct llama_spec_feature_view;
|
||||||
|
|
||||||
struct llama_dflash_profile_stats {
|
|
||||||
uint64_t decode_internal_chunks = 0;
|
|
||||||
uint64_t decode_graph_rebuilds = 0;
|
|
||||||
uint64_t decode_sync_profile_points = 0;
|
|
||||||
uint64_t decode_prelude_us = 0;
|
|
||||||
uint64_t decode_sched_reset_us = 0;
|
|
||||||
uint64_t decode_build_graph_us = 0;
|
|
||||||
uint64_t decode_sched_alloc_graph_us = 0;
|
|
||||||
uint64_t decode_set_inputs_us = 0;
|
|
||||||
uint64_t decode_graph_compute_us = 0;
|
|
||||||
uint64_t decode_result_us = 0;
|
|
||||||
uint64_t decode_embedding_us = 0;
|
|
||||||
uint64_t decode_final_sched_reset_us = 0;
|
|
||||||
|
|
||||||
uint64_t decode_output_reserve_calls = 0;
|
|
||||||
uint64_t decode_output_reserve_us = 0;
|
|
||||||
uint64_t decode_output_reserve_reallocs = 0;
|
|
||||||
uint64_t decode_output_reserve_realloc_bytes = 0;
|
|
||||||
uint64_t decode_prepare_calls = 0;
|
|
||||||
uint64_t decode_prepare_us = 0;
|
|
||||||
uint64_t decode_prepare_failures = 0;
|
|
||||||
|
|
||||||
uint64_t set_target_copy_calls = 0;
|
|
||||||
uint64_t set_target_copy_us = 0;
|
|
||||||
uint64_t set_target_rows = 0;
|
|
||||||
uint64_t set_target_copy_bytes = 0;
|
|
||||||
uint64_t set_target_missing_positions = 0;
|
|
||||||
uint64_t set_target_non_monotonic_positions = 0;
|
|
||||||
|
|
||||||
uint64_t capture_prepare_calls = 0;
|
|
||||||
uint64_t capture_prepare_sync_us = 0;
|
|
||||||
uint64_t capture_prepare_failures = 0;
|
|
||||||
uint64_t capture_layer_shape_mismatch = 0;
|
|
||||||
uint64_t capture_layer_batch_mismatch = 0;
|
|
||||||
uint64_t capture_prompt_batches = 0;
|
|
||||||
uint64_t capture_prompt_shape_changes = 0;
|
|
||||||
uint64_t capture_verify_batches = 0;
|
|
||||||
uint64_t capture_verify_shape_changes = 0;
|
|
||||||
uint64_t capture_materialize_calls = 0;
|
|
||||||
uint64_t capture_materialize_rows = 0;
|
|
||||||
uint64_t capture_materialize_bytes = 0;
|
|
||||||
uint64_t capture_materialize_us = 0;
|
|
||||||
uint64_t capture_materialize_failures = 0;
|
|
||||||
|
|
||||||
uint64_t graph_prepare_calls = 0;
|
|
||||||
uint64_t graph_prepare_total_us = 0;
|
|
||||||
uint64_t graph_feature_copy_us = 0;
|
|
||||||
uint64_t graph_pos_copy_us = 0;
|
|
||||||
uint64_t graph_mask_build_us = 0;
|
|
||||||
uint64_t graph_kv_cache_build_us = 0;
|
|
||||||
uint64_t graph_kv_cache_reserve_us = 0;
|
|
||||||
uint64_t graph_kv_cache_reset_us = 0;
|
|
||||||
uint64_t graph_kv_cache_alloc_us = 0;
|
|
||||||
uint64_t graph_kv_cache_feature_upload_us = 0;
|
|
||||||
uint64_t graph_kv_cache_pos_upload_us = 0;
|
|
||||||
uint64_t graph_kv_cache_compute_us = 0;
|
|
||||||
uint64_t graph_kv_cache_sync_us = 0;
|
|
||||||
uint64_t graph_kv_cache_read_concat_pad_us = 0;
|
|
||||||
uint64_t graph_kv_cache_read_concat_pad_calls = 0;
|
|
||||||
uint64_t graph_kv_cache_cached_bytes = 0;
|
|
||||||
uint64_t graph_kv_cache_calls = 0;
|
|
||||||
uint64_t graph_kv_workspace_build_us = 0;
|
|
||||||
uint64_t graph_kv_workspace_reserve_us = 0;
|
|
||||||
uint64_t graph_kv_workspace_reset_us = 0;
|
|
||||||
uint64_t graph_kv_workspace_alloc_us = 0;
|
|
||||||
uint64_t graph_kv_workspace_compute_us = 0;
|
|
||||||
uint64_t graph_kv_workspace_sync_us = 0;
|
|
||||||
uint64_t graph_kv_workspace_calls = 0;
|
|
||||||
uint64_t graph_kv_node_fused_target_calls = 0;
|
|
||||||
uint64_t graph_kv_node_fused_target_us = 0;
|
|
||||||
uint64_t graph_kv_node_k_proj_calls = 0;
|
|
||||||
uint64_t graph_kv_node_k_proj_us = 0;
|
|
||||||
uint64_t graph_kv_node_k_norm_calls = 0;
|
|
||||||
uint64_t graph_kv_node_k_norm_us = 0;
|
|
||||||
uint64_t graph_kv_node_k_rope_calls = 0;
|
|
||||||
uint64_t graph_kv_node_k_rope_us = 0;
|
|
||||||
uint64_t graph_kv_node_v_proj_calls = 0;
|
|
||||||
uint64_t graph_kv_node_v_proj_us = 0;
|
|
||||||
uint64_t graph_kv_node_k_store_calls = 0;
|
|
||||||
uint64_t graph_kv_node_k_store_us = 0;
|
|
||||||
uint64_t graph_kv_node_v_store_calls = 0;
|
|
||||||
uint64_t graph_kv_node_v_store_us = 0;
|
|
||||||
uint64_t graph_main_node_qcur_calls = 0;
|
|
||||||
uint64_t graph_main_node_qcur_us = 0;
|
|
||||||
uint64_t graph_main_node_k_draft_calls = 0;
|
|
||||||
uint64_t graph_main_node_k_draft_us = 0;
|
|
||||||
uint64_t graph_main_node_v_draft_calls = 0;
|
|
||||||
uint64_t graph_main_node_v_draft_us = 0;
|
|
||||||
uint64_t graph_main_node_k_ctx_view_calls = 0;
|
|
||||||
uint64_t graph_main_node_k_ctx_view_us = 0;
|
|
||||||
uint64_t graph_main_node_v_ctx_view_calls = 0;
|
|
||||||
uint64_t graph_main_node_v_ctx_view_us = 0;
|
|
||||||
uint64_t graph_main_node_k_concat_calls = 0;
|
|
||||||
uint64_t graph_main_node_k_concat_us = 0;
|
|
||||||
uint64_t graph_main_node_v_concat_calls = 0;
|
|
||||||
uint64_t graph_main_node_v_concat_us = 0;
|
|
||||||
uint64_t graph_main_node_k_pad_calls = 0;
|
|
||||||
uint64_t graph_main_node_k_pad_us = 0;
|
|
||||||
uint64_t graph_main_node_v_pad_calls = 0;
|
|
||||||
uint64_t graph_main_node_v_pad_us = 0;
|
|
||||||
uint64_t graph_main_node_k_perm_cont_calls = 0;
|
|
||||||
uint64_t graph_main_node_k_perm_cont_us = 0;
|
|
||||||
uint64_t graph_main_node_v_perm_cont_calls = 0;
|
|
||||||
uint64_t graph_main_node_v_perm_cont_us = 0;
|
|
||||||
uint64_t graph_main_node_flash_attn_calls = 0;
|
|
||||||
uint64_t graph_main_node_flash_attn_us = 0;
|
|
||||||
uint64_t graph_main_node_attn_out_calls = 0;
|
|
||||||
uint64_t graph_main_node_attn_out_us = 0;
|
|
||||||
uint64_t graph_main_node_ffn_calls = 0;
|
|
||||||
uint64_t graph_main_node_ffn_us = 0;
|
|
||||||
uint64_t graph_main_node_result_rows_calls = 0;
|
|
||||||
uint64_t graph_main_node_result_rows_us = 0;
|
|
||||||
uint64_t graph_main_node_result_norm_calls = 0;
|
|
||||||
uint64_t graph_main_node_result_norm_us = 0;
|
|
||||||
uint64_t graph_main_node_result_calls = 0;
|
|
||||||
uint64_t graph_main_node_result_us = 0;
|
|
||||||
uint64_t graph_feature_bytes = 0;
|
|
||||||
uint64_t graph_pos_bytes = 0;
|
|
||||||
uint64_t graph_mask_bytes = 0;
|
|
||||||
uint64_t graph_visible_kv_sum = 0;
|
|
||||||
uint64_t graph_visible_kv_max = 0;
|
|
||||||
uint64_t graph_pos_fallbacks = 0;
|
|
||||||
uint64_t graph_pos_non_monotonic = 0;
|
|
||||||
uint64_t graph_shape_failures = 0;
|
|
||||||
uint64_t graph_mask_overflow = 0;
|
|
||||||
|
|
||||||
int32_t last_n_rows = 0;
|
|
||||||
int32_t last_width = 0;
|
|
||||||
int32_t last_cross_ctx = 0;
|
|
||||||
int32_t last_left_pad = 0;
|
|
||||||
int32_t last_n_tokens = 0;
|
|
||||||
int32_t last_n_kv_total = 0;
|
|
||||||
int32_t last_kv_cache_host_layers = 0;
|
|
||||||
int32_t capture_prompt_last_rows = 0;
|
|
||||||
int32_t capture_prompt_last_width = 0;
|
|
||||||
int32_t capture_verify_last_rows = 0;
|
|
||||||
int32_t capture_verify_last_width = 0;
|
|
||||||
llama_pos last_pos_first = -1;
|
|
||||||
llama_pos last_pos_last = -1;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct llama_dflash_window_update {
|
struct llama_dflash_window_update {
|
||||||
uint64_t version = 0;
|
uint64_t version = 0;
|
||||||
int32_t keep_rows = 0;
|
int32_t keep_rows = 0;
|
||||||
@ -216,11 +75,9 @@ llama_dflash_kv_cache_transition llama_plan_dflash_kv_cache_transition_for_ctx(
|
|||||||
const llama_dflash_window_update & window_update,
|
const llama_dflash_window_update & window_update,
|
||||||
int32_t n_rows);
|
int32_t n_rows);
|
||||||
|
|
||||||
void llama_dflash_profile_reset(struct llama_context * ctx);
|
|
||||||
void llama_reset_dflash_kv_cache_state(struct llama_context * ctx);
|
void llama_reset_dflash_kv_cache_state(struct llama_context * ctx);
|
||||||
void llama_set_dflash_visible_cross_ctx(struct llama_context * ctx, int32_t cross_ctx);
|
void llama_set_dflash_visible_cross_ctx(struct llama_context * ctx, int32_t cross_ctx);
|
||||||
int32_t llama_get_dflash_visible_cross_ctx(const struct llama_context * ctx);
|
int32_t llama_get_dflash_visible_cross_ctx(const struct llama_context * ctx);
|
||||||
bool llama_dflash_profile_get_stats(const struct llama_context * ctx, llama_dflash_profile_stats * stats);
|
|
||||||
|
|
||||||
int32_t llama_model_dflash_block_size(const struct llama_model * model);
|
int32_t llama_model_dflash_block_size(const struct llama_model * model);
|
||||||
int32_t llama_model_dflash_mask_token_id(const struct llama_model * model);
|
int32_t llama_model_dflash_mask_token_id(const struct llama_model * model);
|
||||||
@ -277,13 +134,3 @@ bool llama_spec_copy_dflash_rows_from_output_indices(
|
|||||||
struct llama_context * ctx,
|
struct llama_context * ctx,
|
||||||
const std::vector<int32_t> & output_indices,
|
const std::vector<int32_t> & output_indices,
|
||||||
std::vector<float> & hidden_rows);
|
std::vector<float> & hidden_rows);
|
||||||
|
|
||||||
void llama_dflash_contract_log_accept(
|
|
||||||
int slot_id,
|
|
||||||
bool is_dflash,
|
|
||||||
const char * path,
|
|
||||||
bool any_rejected,
|
|
||||||
size_t n_draft,
|
|
||||||
size_t n_accepted,
|
|
||||||
llama_pos pos_base,
|
|
||||||
const std::vector<int32_t> & output_indices);
|
|
||||||
|
|||||||
117
src/llama.cpp
117
src/llama.cpp
@ -19,7 +19,6 @@
|
|||||||
#include "llama-context.h"
|
#include "llama-context.h"
|
||||||
#include "llama-spec-features.h"
|
#include "llama-spec-features.h"
|
||||||
#include "llama-dflash.h"
|
#include "llama-dflash.h"
|
||||||
#include "llama-dflash-profile.h"
|
|
||||||
#include "llama-quantize.h"
|
#include "llama-quantize.h"
|
||||||
|
|
||||||
#include "unicode.h"
|
#include "unicode.h"
|
||||||
@ -697,8 +696,8 @@ void llama_context::set_mtp_op_type(llama_mtp_op_type value) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
llama_context::~llama_context() {
|
llama_context::~llama_context() {
|
||||||
if (dflash_sched != nullptr) {
|
if (dflash.kv.cache_sched != nullptr) {
|
||||||
ggml_backend_sched_free(dflash_sched);
|
ggml_backend_sched_free(dflash.kv.cache_sched);
|
||||||
}
|
}
|
||||||
free_dflash_kv_cache_tensors();
|
free_dflash_kv_cache_tensors();
|
||||||
ggml_backend_sched_free(sched);
|
ggml_backend_sched_free(sched);
|
||||||
@ -5096,10 +5095,6 @@ static int llama_decode_internal(
|
|||||||
}
|
}
|
||||||
lctx.n_queued_tokens += n_tokens_all;
|
lctx.n_queued_tokens += n_tokens_all;
|
||||||
|
|
||||||
auto * dflash_profile = lctx.model.arch == LLM_ARCH_DFLASH_DRAFT ? &lctx.dflash_profile : nullptr;
|
|
||||||
const bool dflash_decode_timing = dflash_profile != nullptr && llama_env_flag_enabled("IK_DFLASH_DECODE_TIMING");
|
|
||||||
const bool dflash_draft_node_timing = dflash_profile != nullptr && llama_env_flag_enabled("IK_DFLASH_DRAFT_NODE_TIMING");
|
|
||||||
|
|
||||||
auto & kv_self = lctx.kv_self;
|
auto & kv_self = lctx.kv_self;
|
||||||
|
|
||||||
const int64_t n_embd = hparams.n_embd;
|
const int64_t n_embd = hparams.n_embd;
|
||||||
@ -5139,20 +5134,7 @@ static int llama_decode_internal(
|
|||||||
n_outputs_embd = has_mtp && cparams.mtp_op_type == MTP_OP_NONE ? n_tokens_all : n_outputs;
|
n_outputs_embd = has_mtp && cparams.mtp_op_type == MTP_OP_NONE ? n_tokens_all : n_outputs;
|
||||||
const size_t required_outputs = std::max<size_t>(n_outputs, n_outputs_embd);
|
const size_t required_outputs = std::max<size_t>(n_outputs, n_outputs_embd);
|
||||||
const bool is_dflash_decode = lctx.model.arch == LLM_ARCH_DFLASH_DRAFT;
|
const bool is_dflash_decode = lctx.model.arch == LLM_ARCH_DFLASH_DRAFT;
|
||||||
const size_t output_buf_size_before = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0;
|
|
||||||
const int64_t t_output_reserve_us = is_dflash_decode ? ggml_time_us() : 0;
|
|
||||||
const size_t reserved_outputs = llama_output_reserve(lctx, required_outputs);
|
const size_t reserved_outputs = llama_output_reserve(lctx, required_outputs);
|
||||||
if (is_dflash_decode) {
|
|
||||||
auto & profile = lctx.dflash_profile;
|
|
||||||
profile.decode_output_reserve_calls++;
|
|
||||||
profile.decode_output_reserve_us += (uint64_t) (ggml_time_us() - t_output_reserve_us);
|
|
||||||
|
|
||||||
const size_t output_buf_size_after = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0;
|
|
||||||
if (output_buf_size_after > output_buf_size_before) {
|
|
||||||
profile.decode_output_reserve_reallocs++;
|
|
||||||
profile.decode_output_reserve_realloc_bytes += (uint64_t) output_buf_size_after;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (reserved_outputs < required_outputs) {
|
if (reserved_outputs < required_outputs) {
|
||||||
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %zu outputs\n", __func__, required_outputs);
|
LLAMA_LOG_ERROR("%s: could not reserve space for batch with %zu outputs\n", __func__, required_outputs);
|
||||||
return -2;
|
return -2;
|
||||||
@ -5184,10 +5166,6 @@ static int llama_decode_internal(
|
|||||||
#if IK_PRINT_TIMING
|
#if IK_PRINT_TIMING
|
||||||
auto tim1 = ggml_time_us();
|
auto tim1 = ggml_time_us();
|
||||||
#endif
|
#endif
|
||||||
const int64_t t_dflash_prelude_us = dflash_decode_timing ? ggml_time_us() : 0;
|
|
||||||
if (dflash_decode_timing) {
|
|
||||||
dflash_profile->decode_internal_chunks++;
|
|
||||||
}
|
|
||||||
uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
|
uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token);
|
||||||
if (llm_arch_is_hybrid(model.arch) &&
|
if (llm_arch_is_hybrid(model.arch) &&
|
||||||
n_tokens > 1 &&
|
n_tokens > 1 &&
|
||||||
@ -5353,55 +5331,36 @@ static int llama_decode_internal(
|
|||||||
auto tim2 = ggml_time_us();
|
auto tim2 = ggml_time_us();
|
||||||
printf("prelude(...): %d us\n", int(tim2-tim1));
|
printf("prelude(...): %d us\n", int(tim2-tim1));
|
||||||
#endif
|
#endif
|
||||||
if (dflash_decode_timing) {
|
|
||||||
dflash_profile->decode_prelude_us += (uint64_t) (ggml_time_us() - t_dflash_prelude_us);
|
|
||||||
}
|
|
||||||
|
|
||||||
#if IK_PRINT_TIMING
|
#if IK_PRINT_TIMING
|
||||||
tim1 = ggml_time_us();
|
tim1 = ggml_time_us();
|
||||||
#endif
|
#endif
|
||||||
auto & prev = cparams.mtp_op_type == MTP_OP_NONE ? lctx.prev : lctx.prev_mtp;
|
auto & prev = cparams.mtp_op_type == MTP_OP_NONE ? lctx.prev : lctx.prev_mtp;
|
||||||
ggml_cgraph * gf = nullptr;
|
ggml_cgraph * gf = nullptr;
|
||||||
if (!lctx.can_reuse_graph(u_batch)) {
|
if (!lctx.can_reuse_graph(u_batch)) {
|
||||||
if (dflash_decode_timing) {
|
|
||||||
dflash_profile->decode_graph_rebuilds++;
|
|
||||||
}
|
|
||||||
const int64_t t_dflash_sched_reset_us = dflash_decode_timing ? ggml_time_us() : 0;
|
|
||||||
lctx.reset_scheduler();
|
lctx.reset_scheduler();
|
||||||
ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
|
ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
|
||||||
#if IK_PRINT_TIMING
|
#if IK_PRINT_TIMING
|
||||||
tim2 = ggml_time_us();
|
tim2 = ggml_time_us();
|
||||||
printf("sched_reset(...): %d us\n", int(tim2-tim1));
|
printf("sched_reset(...): %d us\n", int(tim2-tim1));
|
||||||
#endif
|
#endif
|
||||||
if (dflash_decode_timing) {
|
|
||||||
dflash_profile->decode_sched_reset_us += (uint64_t) (ggml_time_us() - t_dflash_sched_reset_us);
|
|
||||||
}
|
|
||||||
|
|
||||||
#if IK_PRINT_TIMING
|
#if IK_PRINT_TIMING
|
||||||
tim1 = ggml_time_us();
|
tim1 = ggml_time_us();
|
||||||
#endif
|
#endif
|
||||||
const int64_t t_dflash_build_graph_us = dflash_decode_timing ? ggml_time_us() : 0;
|
|
||||||
gf = llm_build_context::llama_build_graph(lctx, u_batch, false);
|
gf = llm_build_context::llama_build_graph(lctx, u_batch, false);
|
||||||
#if IK_PRINT_TIMING
|
#if IK_PRINT_TIMING
|
||||||
tim2 = ggml_time_us();
|
tim2 = ggml_time_us();
|
||||||
printf("build_graph(...): %d us\n", int(tim2-tim1));
|
printf("build_graph(...): %d us\n", int(tim2-tim1));
|
||||||
#endif
|
#endif
|
||||||
if (dflash_decode_timing) {
|
|
||||||
dflash_profile->decode_build_graph_us += (uint64_t) (ggml_time_us() - t_dflash_build_graph_us);
|
|
||||||
}
|
|
||||||
|
|
||||||
#if IK_PRINT_TIMING
|
#if IK_PRINT_TIMING
|
||||||
tim1 = ggml_time_us();
|
tim1 = ggml_time_us();
|
||||||
#endif
|
#endif
|
||||||
const int64_t t_dflash_sched_alloc_us = dflash_decode_timing ? ggml_time_us() : 0;
|
|
||||||
ggml_backend_sched_alloc_graph(lctx.sched, gf);
|
ggml_backend_sched_alloc_graph(lctx.sched, gf);
|
||||||
#if IK_PRINT_TIMING
|
#if IK_PRINT_TIMING
|
||||||
tim2 = ggml_time_us();
|
tim2 = ggml_time_us();
|
||||||
printf("sched_alloc_graph(...): %d us\n", int(tim2-tim1));
|
printf("sched_alloc_graph(...): %d us\n", int(tim2-tim1));
|
||||||
#endif
|
#endif
|
||||||
if (dflash_decode_timing) {
|
|
||||||
dflash_profile->decode_sched_alloc_graph_us += (uint64_t) (ggml_time_us() - t_dflash_sched_alloc_us);
|
|
||||||
}
|
|
||||||
//if (u_batch.n_tokens == 1 && u_batch.embd == nullptr && lctx.cparams.graph_reuse) {
|
//if (u_batch.n_tokens == 1 && u_batch.embd == nullptr && lctx.cparams.graph_reuse) {
|
||||||
if (u_batch.embd == nullptr && lctx.cparams.graph_reuse &&
|
if (u_batch.embd == nullptr && lctx.cparams.graph_reuse &&
|
||||||
!((lctx.model.arch == LLM_ARCH_GEMMA4_MTP || lctx.model.arch == LLM_ARCH_GEMMA4_ASSISTANT) && lctx.mtp_target_ctx != nullptr)) {
|
!((lctx.model.arch == LLM_ARCH_GEMMA4_MTP || lctx.model.arch == LLM_ARCH_GEMMA4_ASSISTANT) && lctx.mtp_target_ctx != nullptr)) {
|
||||||
@ -5422,15 +5381,8 @@ static int llama_decode_internal(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (dflash_profile != nullptr) {
|
if (is_dflash_decode && !llama_prepare_dflash_graph_inputs(lctx, n_tokens)) {
|
||||||
dflash_profile->decode_prepare_calls++;
|
return GGML_STATUS_FAILED;
|
||||||
const int64_t t_prepare_dflash_us = ggml_time_us();
|
|
||||||
if (!llama_prepare_dflash_graph_inputs(lctx, n_tokens)) {
|
|
||||||
dflash_profile->decode_prepare_failures++;
|
|
||||||
dflash_profile->decode_prepare_us += (uint64_t) (ggml_time_us() - t_prepare_dflash_us);
|
|
||||||
return GGML_STATUS_FAILED;
|
|
||||||
}
|
|
||||||
dflash_profile->decode_prepare_us += (uint64_t) (ggml_time_us() - t_prepare_dflash_us);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// the output is always the last tensor in the graph
|
// the output is always the last tensor in the graph
|
||||||
@ -5438,7 +5390,7 @@ static int llama_decode_internal(
|
|||||||
struct ggml_tensor * embd = nullptr;
|
struct ggml_tensor * embd = nullptr;
|
||||||
|
|
||||||
// DFlash GPU argmax draft_argmax node
|
// DFlash GPU argmax draft_argmax node
|
||||||
if (lctx.dflash_draft_tokens_tensor != nullptr &&
|
if (lctx.dflash.draft_tokens_tensor != nullptr &&
|
||||||
strcmp(res->name, "result_output") != 0) {
|
strcmp(res->name, "result_output") != 0) {
|
||||||
for (int i = gf->n_nodes - 2; i >= 0; --i) {
|
for (int i = gf->n_nodes - 2; i >= 0; --i) {
|
||||||
if (strcmp(gf->nodes[i]->name, "result_output") == 0) {
|
if (strcmp(gf->nodes[i]->name, "result_output") == 0) {
|
||||||
@ -5489,39 +5441,18 @@ static int llama_decode_internal(
|
|||||||
#if IK_PRINT_TIMING == 1
|
#if IK_PRINT_TIMING == 1
|
||||||
tim1 = ggml_time_us();
|
tim1 = ggml_time_us();
|
||||||
#endif
|
#endif
|
||||||
const int64_t t_dflash_set_inputs_us = dflash_decode_timing ? ggml_time_us() : 0;
|
|
||||||
llama_set_inputs(lctx, u_batch);
|
llama_set_inputs(lctx, u_batch);
|
||||||
#if IK_PRINT_TIMING == 1
|
#if IK_PRINT_TIMING == 1
|
||||||
tim2 = ggml_time_us();
|
tim2 = ggml_time_us();
|
||||||
printf("set_inputs(...): %d us\n", int(tim2-tim1));
|
printf("set_inputs(...): %d us\n", int(tim2-tim1));
|
||||||
#endif
|
#endif
|
||||||
if (dflash_decode_timing) {
|
|
||||||
dflash_profile->decode_set_inputs_us += (uint64_t) (ggml_time_us() - t_dflash_set_inputs_us);
|
|
||||||
}
|
|
||||||
|
|
||||||
#if IK_PRINT_TIMING
|
#if IK_PRINT_TIMING
|
||||||
tim1 = ggml_time_us();
|
tim1 = ggml_time_us();
|
||||||
#endif
|
#endif
|
||||||
if (lctx.dflash_kv_workspace_sync_pending) {
|
if (lctx.dflash.kv.workspace_sync_pending) {
|
||||||
llama_sync_dflash_workspace_if_pending(lctx);
|
llama_sync_dflash_workspace_if_pending(lctx);
|
||||||
}
|
}
|
||||||
const int64_t t_dflash_graph_compute_us = dflash_decode_timing ? ggml_time_us() : 0;
|
|
||||||
llama_dflash_main_node_profiler draft_node_profiler;
|
|
||||||
if (dflash_draft_node_timing) {
|
|
||||||
draft_node_profiler.profile = dflash_profile;
|
|
||||||
draft_node_profiler.prev_callback = lctx.cparams.cb_eval;
|
|
||||||
draft_node_profiler.prev_user_data = lctx.cparams.cb_eval_user_data;
|
|
||||||
ggml_backend_sched_set_eval_callback(lctx.sched, llama_dflash_main_node_eval_callback, &draft_node_profiler);
|
|
||||||
}
|
|
||||||
llama_graph_compute(lctx, gf, n_threads);
|
llama_graph_compute(lctx, gf, n_threads);
|
||||||
if (dflash_draft_node_timing) {
|
|
||||||
ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
|
|
||||||
}
|
|
||||||
if (dflash_decode_timing) {
|
|
||||||
llama_synchronize(&lctx);
|
|
||||||
dflash_profile->decode_sync_profile_points++;
|
|
||||||
dflash_profile->decode_graph_compute_us += (uint64_t) (ggml_time_us() - t_dflash_graph_compute_us);
|
|
||||||
}
|
|
||||||
#if IK_PRINT_TIMING
|
#if IK_PRINT_TIMING
|
||||||
llama_synchronize(&lctx);
|
llama_synchronize(&lctx);
|
||||||
tim2 = ggml_time_us();
|
tim2 = ggml_time_us();
|
||||||
@ -5547,16 +5478,16 @@ static int llama_decode_internal(
|
|||||||
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
|
// ggml_graph_dump_dot(gf, NULL, "llama.dot");
|
||||||
//}
|
//}
|
||||||
|
|
||||||
lctx.dflash_draft_tokens.clear();
|
lctx.dflash.draft_tokens.clear();
|
||||||
if (lctx.dflash_draft_tokens_tensor != nullptr) {
|
if (lctx.dflash.draft_tokens_tensor != nullptr) {
|
||||||
ggml_backend_t backend_argmax = ggml_backend_sched_get_tensor_backend(
|
ggml_backend_t backend_argmax = ggml_backend_sched_get_tensor_backend(
|
||||||
lctx.sched, lctx.dflash_draft_tokens_tensor);
|
lctx.sched, lctx.dflash.draft_tokens_tensor);
|
||||||
if (backend_argmax != nullptr) {
|
if (backend_argmax != nullptr) {
|
||||||
const int64_t n_tokens_argmax = lctx.dflash_draft_tokens_tensor->ne[0];
|
const int64_t n_tokens_argmax = lctx.dflash.draft_tokens_tensor->ne[0];
|
||||||
lctx.dflash_draft_tokens.resize((size_t) n_tokens_argmax);
|
lctx.dflash.draft_tokens.resize((size_t) n_tokens_argmax);
|
||||||
ggml_backend_tensor_get_async(backend_argmax,
|
ggml_backend_tensor_get_async(backend_argmax,
|
||||||
lctx.dflash_draft_tokens_tensor,
|
lctx.dflash.draft_tokens_tensor,
|
||||||
lctx.dflash_draft_tokens.data(), 0,
|
lctx.dflash.draft_tokens.data(), 0,
|
||||||
(size_t) n_tokens_argmax * sizeof(int32_t));
|
(size_t) n_tokens_argmax * sizeof(int32_t));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -5564,7 +5495,7 @@ static int llama_decode_internal(
|
|||||||
// extract logits
|
// extract logits
|
||||||
{
|
{
|
||||||
const bool dflash_skip_logits = (lctx.model.arch == LLM_ARCH_DFLASH_DRAFT
|
const bool dflash_skip_logits = (lctx.model.arch == LLM_ARCH_DFLASH_DRAFT
|
||||||
&& !lctx.dflash_draft_tokens.empty());
|
&& !lctx.dflash.draft_tokens.empty());
|
||||||
if (dflash_skip_logits) {
|
if (dflash_skip_logits) {
|
||||||
res = nullptr;
|
res = nullptr;
|
||||||
}
|
}
|
||||||
@ -5573,7 +5504,6 @@ static int llama_decode_internal(
|
|||||||
#if IK_PRINT_TIMING
|
#if IK_PRINT_TIMING
|
||||||
tim1 = ggml_time_us();
|
tim1 = ggml_time_us();
|
||||||
#endif
|
#endif
|
||||||
const int64_t t_dflash_get_result_us = dflash_decode_timing ? ggml_time_us() : 0;
|
|
||||||
// Do not process logits if MTP is only updating the KV cache.
|
// Do not process logits if MTP is only updating the KV cache.
|
||||||
if (cparams.mtp_op_type != MTP_OP_WARMUP) { // && cparams.mtp_op_type != MTP_OP_UPDATE_ACCEPTED) {
|
if (cparams.mtp_op_type != MTP_OP_WARMUP) { // && cparams.mtp_op_type != MTP_OP_UPDATE_ACCEPTED) {
|
||||||
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
|
ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res);
|
||||||
@ -5604,11 +5534,6 @@ static int llama_decode_internal(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (dflash_decode_timing) {
|
|
||||||
llama_synchronize(&lctx);
|
|
||||||
dflash_profile->decode_sync_profile_points++;
|
|
||||||
dflash_profile->decode_result_us += (uint64_t) (ggml_time_us() - t_dflash_get_result_us);
|
|
||||||
}
|
|
||||||
#if IK_PRINT_TIMING
|
#if IK_PRINT_TIMING
|
||||||
tim2 = ggml_time_us();
|
tim2 = ggml_time_us();
|
||||||
printf("get_result(...): %d us\n", int(tim2-tim1));
|
printf("get_result(...): %d us\n", int(tim2-tim1));
|
||||||
@ -5621,7 +5546,6 @@ static int llama_decode_internal(
|
|||||||
#if IK_PRINT_TIMING
|
#if IK_PRINT_TIMING
|
||||||
tim1 = ggml_time_us();
|
tim1 = ggml_time_us();
|
||||||
#endif
|
#endif
|
||||||
const int64_t t_dflash_get_embedding_us = dflash_decode_timing ? ggml_time_us() : 0;
|
|
||||||
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
|
ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd);
|
||||||
GGML_ASSERT(backend_embd != nullptr);
|
GGML_ASSERT(backend_embd != nullptr);
|
||||||
|
|
||||||
@ -5661,11 +5585,6 @@ static int llama_decode_internal(
|
|||||||
GGML_ABORT("unknown pooling type");
|
GGML_ABORT("unknown pooling type");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (dflash_decode_timing) {
|
|
||||||
llama_synchronize(&lctx);
|
|
||||||
dflash_profile->decode_sync_profile_points++;
|
|
||||||
dflash_profile->decode_embedding_us += (uint64_t) (ggml_time_us() - t_dflash_get_embedding_us);
|
|
||||||
}
|
|
||||||
#if IK_PRINT_TIMING
|
#if IK_PRINT_TIMING
|
||||||
tim2 = ggml_time_us();
|
tim2 = ggml_time_us();
|
||||||
printf("get_embedding(...): %d us\n", int(tim2-tim1));
|
printf("get_embedding(...): %d us\n", int(tim2-tim1));
|
||||||
@ -5709,13 +5628,9 @@ static int llama_decode_internal(
|
|||||||
#if IK_PRINT_TIMING
|
#if IK_PRINT_TIMING
|
||||||
auto tim1 = ggml_time_us();
|
auto tim1 = ggml_time_us();
|
||||||
#endif
|
#endif
|
||||||
const int64_t t_dflash_final_sched_reset_us = dflash_decode_timing ? ggml_time_us() : 0;
|
|
||||||
if (!lctx.prev) {
|
if (!lctx.prev) {
|
||||||
lctx.reset_scheduler();
|
lctx.reset_scheduler();
|
||||||
}
|
}
|
||||||
if (dflash_decode_timing) {
|
|
||||||
dflash_profile->decode_final_sched_reset_us += (uint64_t) (ggml_time_us() - t_dflash_final_sched_reset_us);
|
|
||||||
}
|
|
||||||
#if IK_PRINT_TIMING
|
#if IK_PRINT_TIMING
|
||||||
auto tim2 = ggml_time_us();
|
auto tim2 = ggml_time_us();
|
||||||
printf("sched_reset(...): %d us\n", int(tim2-tim1));
|
printf("sched_reset(...): %d us\n", int(tim2-tim1));
|
||||||
@ -9838,10 +9753,10 @@ float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
llama_token llama_get_dflash_draft_token_ith(struct llama_context * ctx, int32_t i) {
|
llama_token llama_get_dflash_draft_token_ith(struct llama_context * ctx, int32_t i) {
|
||||||
if ((size_t) i >= ctx->dflash_draft_tokens.size()) {
|
if ((size_t) i >= ctx->dflash.draft_tokens.size()) {
|
||||||
return LLAMA_TOKEN_NULL;
|
return LLAMA_TOKEN_NULL;
|
||||||
}
|
}
|
||||||
return ctx->dflash_draft_tokens[(size_t) i];
|
return ctx->dflash.draft_tokens[(size_t) i];
|
||||||
}
|
}
|
||||||
|
|
||||||
float * llama_get_embeddings(struct llama_context * ctx) {
|
float * llama_get_embeddings(struct llama_context * ctx) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user