diff --git a/ggml/src/ggml-cuda/delta-net.cu b/ggml/src/ggml-cuda/delta-net.cu index 74576ad8..dddbf446 100644 --- a/ggml/src/ggml-cuda/delta-net.cu +++ b/ggml/src/ggml-cuda/delta-net.cu @@ -245,7 +245,7 @@ void ggml_cuda_op_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor * dst) const ggml_tensor * src3 = dst->src[3]; // g const ggml_tensor * src4 = dst->src[4]; // beta const ggml_tensor * src5 = dst->src[5]; // state - const ggml_tensor * src6 = dst->src[6]; // state + const ggml_tensor * src6 = dst->src[6]; // when not null, state for token 0...n_token-1 GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); @@ -275,13 +275,11 @@ void ggml_cuda_op_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor * dst) const int64_t state_size = head_dim * head_dim * n_heads * n_seqs; int repeat_type = dst->op_params[0]; - //int save_all_states = dst->op_params[1]; if (src6) { GGML_ASSERT(src6->type == GGML_TYPE_F32); GGML_ASSERT(src6->ne[0] >= (n_tokens - 1)*state_size); } - //const int64_t expected_size = save_all_states ? (output_size + n_tokens * state_size) : (output_size + state_size); const int64_t expected_size = output_size + state_size; GGML_ASSERT(ggml_nelements(dst) == expected_size); diff --git a/src/llama-delta-net.cpp b/src/llama-delta-net.cpp index 1ad79faf..e8996686 100644 --- a/src/llama-delta-net.cpp +++ b/src/llama-delta-net.cpp @@ -80,8 +80,7 @@ std::pair delta_net::build_fused_delta_net(ggml_co ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, ggml_tensor * g, ggml_tensor * beta, ggml_tensor * state, int il, const llm_build_cb & cb, int repeat_type, - bool save_all_steps, - ggml_cgraph * gf, ggml_tensor * per_step_ckpt) { + ggml_tensor * per_step_ckpt) { const int64_t S_k = q->ne[0]; const int64_t H_k = q->ne[2]; @@ -137,13 +136,6 @@ std::pair delta_net::build_fused_delta_net(ggml_co ggml_row_size(fused_result->type, S_v * H_v * n_tokens), 0); //output_tokens = ggml_cont_4d(ctx0, output_tokens, S_v, H_v, n_tokens, n_seqs); - // per-step states are at [output_size, output_size + n_tokens*state_size) - //const int64_t last_state_offset = save_all_steps - // ? (output_size + (n_tokens - 1) * state_size) - // : output_size; - - //ggml_tensor * new_state_flat = ggml_view_1d(ctx0, fused_result, state_size, - // last_state_offset * ggml_element_size(fused_result)); ggml_tensor * new_state_flat = ggml_view_1d(ctx0, fused_result, state_size, output_size * ggml_element_size(fused_result)); ggml_tensor * new_state = ggml_reshape_4d(ctx0, new_state_flat, S_v, S_v, H_v, n_seqs); @@ -151,22 +143,6 @@ std::pair delta_net::build_fused_delta_net(ggml_co cb(output_tokens, "output_tokens", il); cb(new_state, "new_state", il); - // Copy all per-step SSM states to persistent checkpoint tensor - //if (save_all_steps && per_step_ckpt != nullptr && gf != nullptr && n_tokens > 1) { - // const int64_t per_step_total = n_tokens * state_size; - // if (per_step_total <= ggml_nelements(per_step_ckpt)) { - // ggml_tensor * all_steps_src = ggml_view_1d(ctx0, fused_result, per_step_total, - // output_size * ggml_element_size(fused_result)); - // ggml_tensor * ckpt_dst = ggml_view_1d(ctx0, per_step_ckpt, per_step_total, 0); - // auto ckpt_cpy = ggml_cpy(ctx0, all_steps_src, ckpt_dst); - // cb(ckpt_cpy, "per_step_ckpt_cpy", il); - // ggml_build_forward_expand(gf, ckpt_cpy); - // } else { - // LLAMA_LOG_WARN("%s: per-step checkpoint tensor too small for %lld tokens (need %lld, have %lld), skipping per-step save\n", - // __func__, (long long)n_tokens, (long long)per_step_total, (long long)ggml_nelements(per_step_ckpt)); - // } - //} - return {output_tokens, new_state}; } @@ -309,7 +285,7 @@ ggml_tensor * delta_net::build_qkv(ggml_context * ctx0, ggml_tensor * state_stor int64_t head_k_dim, int64_t num_k_heads, int64_t head_v_dim, int64_t num_v_heads, int64_t ssm_d_conv, int64_t state_seq_id_local, uint32_t qnext_state_slots, bool reset_state_local, float eps_norm, int repeat_type, int il, const llm_build_cb & cb, ggml_cgraph * gf, - bool save_per_step_states, ggml_tensor * per_step_ckpt) { + ggml_tensor * per_step_ckpt) { const int64_t key_dim = head_k_dim * num_k_heads; const int64_t value_dim = head_v_dim * num_v_heads; const int64_t conv_dim = key_dim * 2 + value_dim; @@ -395,7 +371,7 @@ ggml_tensor * delta_net::build_qkv(ggml_context * ctx0, ggml_tensor * state_stor cb(k_conv, "k_conv_normed", il); auto [output, new_state] = build_fused_delta_net(ctx0, q_conv, k_conv, v_conv, gate, beta, state, il, cb, repeat_type, - save_per_step_states, gf, per_step_ckpt); + per_step_ckpt); cb(output, "attn_output", il); cb(new_state, "new_state", il); @@ -559,7 +535,7 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_ auto output = build_qkv(ctx0, split_s_l->splits[id], split_ssm_conv1d->splits[id], qkv_mixed, inp_s_seq_qnext, beta, gate, head_k_dim, num_k_heads_id, head_v_dim, num_v_heads_id, hparams.ssm_d_conv, state_seq_id_local, qnext_state_slots, reset_state_local, hparams.f_norm_rms_eps, - l.ssm_beta_alpha ? 0 : 1, il, cb, gf, save_per_step_states, per_step_ckpt); + l.ssm_beta_alpha ? 0 : 1, il, cb, gf, per_step_ckpt); split_norm = (ggml_split_tensor_t *)l.ssm_norm->extra; GGML_ASSERT(split_norm && split_norm->splits[id]); auto split_ssm_out = (ggml_split_tensor_t *)l.ssm_out->extra; @@ -631,8 +607,7 @@ ggml_tensor * delta_net::build_layer_attn_linear_core(ggml_context * ctx0, ggml_ qkv_mixed, inp_s_seq_qnext, beta, gate, head_k_dim, num_k_heads, head_v_dim, num_v_heads, hparams.ssm_d_conv, state_seq_id_local, qnext_state_slots, reset_state_local, hparams.f_norm_rms_eps, - model.layers[il].ssm_beta_alpha ? 0 : 1, il, cb, gf, - save_per_step_states, per_step_ckpt); + model.layers[il].ssm_beta_alpha ? 0 : 1, il, cb, gf, per_step_ckpt); auto gated_output = build_gated_output(lctx, ctx0, model.layers[il].ssm_norm, model.layers[il].ssm_out, output, z, head_v_dim, num_v_heads, n_tok, il, cb); if (inp_out_ids) { diff --git a/src/llama-delta-net.h b/src/llama-delta-net.h index 2fe499ad..9ac48525 100644 --- a/src/llama-delta-net.h +++ b/src/llama-delta-net.h @@ -15,8 +15,7 @@ struct delta_net { ggml_tensor * q, ggml_tensor * k, ggml_tensor * v, ggml_tensor * g, ggml_tensor * beta, ggml_tensor * state, int il, const llm_build_cb & cb, int repeat_type, - bool save_all_steps = false, - ggml_cgraph * gf = nullptr, ggml_tensor * per_step_ckpt = nullptr); + ggml_tensor * per_step_ckpt = nullptr); ggml_tensor * build_layer_attn_linear_core(ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * inp_s_seq_qnext, ggml_tensor * inp_out_ids, @@ -52,7 +51,7 @@ private: int64_t head_k_dim, int64_t num_k_heads, int64_t head_v_dim, int64_t num_v_heads, int64_t ssm_d_conv, int64_t state_seq_id_local, uint32_t qnext_state_slots, bool reset_state_local, float eps_norm, int repeat_type, int il, const llm_build_cb & cb, ggml_cgraph * gf, - bool save_per_step_states = false, ggml_tensor * per_step_ckpt = nullptr); + ggml_tensor * per_step_ckpt = nullptr); static ggml_tensor * build_gated_output(llama_context & lctx, ggml_context * ctx0, ggml_tensor * ssm_norm, ggml_tensor * ssm_out, ggml_tensor * output, ggml_tensor * z, int64_t head_v_dim, int64_t num_v_heads, int64_t n_tok, int il, const llm_build_cb & cb);