mirror of https://github.com/ggerganov/llama.cpp.git
llama : fix K-shift with quantized K (wip)
commit 5271c75666
parent 7fe4678b02
1 changed file: llama.cpp (31 lines changed)
--- a/llama.cpp
+++ b/llama.cpp
@@ -4671,17 +4671,30 @@ static void llm_build_k_shift(
     }
 
     for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * tmp =
-            // we rotate only the first n_rot dimensions
-            ggml_rope_custom_inplace(ctx,
-                    ggml_view_3d(ctx, kv.k_l[il],
-                        n_embd_head_k, n_head_kv, n_ctx,
-                        ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
-                        ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
-                        0),
+        struct ggml_tensor * k = ggml_view_3d(ctx, kv.k_l[il],
+                n_embd_head_k, n_head_kv, n_ctx,
+                ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
+                ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
+                0);
+
+        struct ggml_tensor * tmp;
+        if (ggml_is_quantized(k->type)) {
+            // dequantize to f32 -> RoPE -> quantize back
+            tmp = ggml_cast(ctx, k, GGML_TYPE_F32);
+            cb(tmp, "K_f32", il);
+            tmp = ggml_rope_custom_inplace(ctx, tmp,
                     K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
-        cb(tmp, "K_shifted", il);
+            cb(tmp, "K_shifted_f32", il);
+            tmp = ggml_cpy(ctx, tmp, k);
+            cb(tmp, "K_shifted_q", il);
+        } else {
+            // we rotate only the first n_rot dimensions
+            tmp = ggml_rope_custom_inplace(ctx, k,
+                    K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+            cb(tmp, "K_shifted", il);
+        }
         ggml_build_forward_expand(graph, tmp);
     }
 }
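For context: the new branch exists because the RoPE op does not operate directly on quantized data, so the graph casts the K view to F32, rotates it, and copies (re-quantizes) the result back into the cache, as the comment "dequantize to f32 -> RoPE -> quantize back" says. The toy program below is a minimal sketch of that round trip, not llama.cpp code: TOY_QK, toy_block, toy_quantize, toy_dequantize and toy_rope are made-up names, the block layout only roughly imitates Q8_0, and a single rotation frequency stands in for the real RoPE kernel.

    // Toy sketch: why a K-shift on a quantized K cache goes through f32.
    // Quantized blocks cannot be rotated in place, so the data is
    // dequantized, rotated RoPE-style, and re-quantized.
    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    #define TOY_QK 32                 // elements per block (roughly like Q8_0)

    typedef struct {
        float  scale;                 // per-block scale
        int8_t q[TOY_QK];             // quantized values
    } toy_block;

    static void toy_quantize(const float * x, toy_block * b) {
        float amax = 0.0f;
        for (int i = 0; i < TOY_QK; ++i) {
            const float a = fabsf(x[i]);
            if (a > amax) amax = a;
        }
        b->scale = amax / 127.0f;
        for (int i = 0; i < TOY_QK; ++i) {
            b->q[i] = (int8_t) roundf(b->scale > 0.0f ? x[i] / b->scale : 0.0f);
        }
    }

    static void toy_dequantize(const toy_block * b, float * x) {
        for (int i = 0; i < TOY_QK; ++i) x[i] = b->q[i] * b->scale;
    }

    // rotate consecutive pairs by angle theta (one frequency for brevity;
    // the real kernel uses a different angle per pair and per position)
    static void toy_rope(float * x, int n, float theta) {
        const float c = cosf(theta), s = sinf(theta);
        for (int i = 0; i < n; i += 2) {
            const float x0 = x[i], x1 = x[i + 1];
            x[i]     = x0 * c - x1 * s;
            x[i + 1] = x0 * s + x1 * c;
        }
    }

    int main(void) {
        float k[TOY_QK];
        for (int i = 0; i < TOY_QK; ++i) k[i] = sinf(0.1f * i);

        toy_block cache;
        toy_quantize(k, &cache);      // K row as it sits in the quantized cache

        // the K-shift: dequantize -> rotate -> quantize back into the cache
        float tmp[TOY_QK];
        toy_dequantize(&cache, tmp);  // analogous to ggml_cast(ctx, k, GGML_TYPE_F32)
        toy_rope(tmp, TOY_QK, 0.25f); // analogous to ggml_rope_custom_inplace
        toy_quantize(tmp, &cache);    // analogous to ggml_cpy back into the K view

        toy_dequantize(&cache, tmp);
        printf("shifted k[0..3]: %.4f %.4f %.4f %.4f\n", tmp[0], tmp[1], tmp[2], tmp[3]);
        return 0;
    }

In the graph built by llm_build_k_shift these three steps map to ggml_cast, ggml_rope_custom_inplace and ggml_cpy back into the quantized K view. This is also why the view strides go through ggml_row_size: a quantized row is not n_elements * sizeof(float) bytes. Q8_0, for example, packs 32 values plus one f16 scale into 34 bytes, so a row of 128 elements occupies 4 * 34 = 136 bytes.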