Removed the 3D views of wk_b and wv_b, and just save as 3D in GGUF

juk 2025-04-12 20:26:24 +01:00
parent 5d037ae935
commit 638b092d7a
2 changed files with 4 additions and 20 deletions

convert_hf_to_gguf.py

@@ -4523,8 +4523,6 @@ class DeepseekV2Model(Model):
    kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
    k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
    k_b = k_b.transpose(1, 2)
-   k_b = k_b.reshape(n_head_kv * data_torch.shape[-1], qk_nope_head_dim)
-   v_b = v_b.reshape(n_head_kv * v_head_dim, data_torch.shape[-1])
    return [
        (self.map_tensor_name(name_kb), k_b),

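Below is a minimal standalone sketch (not part of the commit) of the split performed in the hunk above. The hparam values are illustrative assumptions only, roughly DeepSeek-V2-Lite sized; the point is that, with the two reshape lines removed, k_b and v_b are left 3D when written to the GGUF.

    # Minimal sketch of the converter-side split; hparam values are assumptions.
    import torch

    n_head_kv        = 16    # num_key_value_heads (assumed example value)
    v_head_dim       = 128   # per-head value width (assumed)
    qk_nope_head_dim = 128   # non-RoPE part of the query/key head (assumed)
    kv_lora_rank     = 512   # width of the compressed KV latent (assumed)

    # kv_b_proj.weight as stored in the HF checkpoint is 2D
    data_torch = torch.randn(n_head_kv * (v_head_dim + qk_nope_head_dim), kv_lora_rank)

    kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1])
    k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1)
    k_b = k_b.transpose(1, 2)

    # Without the removed 2D reshapes the tensors stay 3D, one matrix per head:
    print(k_b.shape)  # torch.Size([16, 512, 128]) -> per head: (kv_lora_rank, qk_nope_head_dim)
    print(v_b.shape)  # torch.Size([16, 128, 512]) -> per head: (v_head_dim, kv_lora_rank)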
src/llama-model.cpp

@@ -3249,8 +3249,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
    // note: only old legacy GGUF files will have the unsplit wkv_b tensor in
    if (is_mla) {
-       layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, n_head * kv_lora_rank}, 0);
-       layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_head * n_embd_head_v_mla}, 0);
+       layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0);
+       layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0);
    } else {
        layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0);
    }
@@ -10143,18 +10143,11 @@ struct llm_build_deepseek2 : public llm_graph_context {
cb(kv_cmpr, "kv_cmpr", il);
if (is_mla) {
-   ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b,
-           n_embd_head_qk_nope, kv_lora_rank, n_head,
-           ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope),
-           ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope) * kv_lora_rank,
-           0);
-   cb(wk_b, "wk_b", il);
    // {n_embd_head_qk_nope, n_tokens, n_head}
    q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3);
    cb(q_nope, "q_nope_perm", il);
-   ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, wk_b, q_nope);
+   ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope);
    cb(q_nope_absorbed, "q_nope_absorbed", il);
    // {n_embd_head_qk_rope, n_tokens, n_head}
@@ -10178,17 +10171,10 @@ struct llm_build_deepseek2 : public llm_graph_context {
    ggml_tensor * v_states = kv_cmpr;
    cb(v_states, "v_states", il);
-   ggml_tensor * v_mla = ggml_view_3d(ctx0, model.layers[il].wv_b,
-           kv_lora_rank, n_embd_head_v, n_head,
-           ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank),
-           ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank) * n_embd_head_v,
-           0);
-   cb(v_mla, "v_mla", il);
    // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
    cur = build_attn_mla(inp_attn, gf,
            model.layers[il].wo, NULL,
-           q_states, k_states, v_states, nullptr, v_mla, kq_scale, il);
+           q_states, k_states, v_states, nullptr, model.layers[il].wv_b, kq_scale, il);
} else {
    ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
    cb(kv, "kv", il);
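Since wk_b and wv_b are now stored with one matrix per head, the two ggml_view_3d() calls above become unnecessary and ggml_mul_mat() can batch over the head dimension directly. A hedged torch sketch of the shape bookkeeping for the q_nope absorption step, using the same illustrative sizes as above (this mirrors the shapes only, not the real ggml kernels):

    # Sketch of what ggml_mul_mat(wk_b, q_nope) computes per head once wk_b is 3D.
    import torch

    n_head, n_tokens = 16, 4                  # n_tokens is an arbitrary example
    qk_nope_head_dim, kv_lora_rank = 128, 512  # assumed example values

    # ggml {n_embd_head_qk_nope, kv_lora_rank, n_head} == torch (n_head, kv_lora_rank, qk_nope_head_dim)
    wk_b = torch.randn(n_head, kv_lora_rank, qk_nope_head_dim)

    # q_nope after ggml_permute: ggml {n_embd_head_qk_nope, n_tokens, n_head}
    #                         == torch (n_head, n_tokens, qk_nope_head_dim)
    q_nope = torch.randn(n_head, n_tokens, qk_nope_head_dim)

    # ggml_mul_mat contracts over ne[0] of both operands and batches over the
    # remaining dims; the torch equivalent is a batched matmul against wk_b^T:
    q_nope_absorbed = torch.matmul(q_nope, wk_b.transpose(-1, -2))
    print(q_nope_absorbed.shape)  # torch.Size([16, 4, 512]) == ggml {kv_lora_rank, n_tokens, n_head}

The same reasoning applies to wv_b: build_attn_mla receives the per-head 3D tensor directly instead of a view built on the fly.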