convert-hf : allow converting the weird BitNet 1.3B

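Its FFN size is 5460, which is not a multiple of the Q1_3 block size, so tensors whose last dimension has that size cannot be quantized to Q1_3. The offending tensors are kept in F16 instead, which makes the final model 5.01 bpw.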
2024-12-29 04:44:34 +00:00 · 2024-06-26 22:10:12 -04:00 · 2024-06-26 22:10:12 -04:00 · 0996149911
commit 0996149911
parent 961e293833
2 changed files with 14 additions and 6 deletions
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -301,12 +301,16 @@ class Model:
                if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
                    # TODO: cleaner model-specific per-tensor types
                    # NOTE: Q1_3 is only relevant for BitNet 1.58b
-                    if self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3 and not any(
+                    if (
                        self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3
                        and gguf.can_quantize_to_q1_3(data)
                        and not any(
                            self.match_model_tensor_name(new_name, key, None)
                            for key in [
                                gguf.MODEL_TENSOR.TOKEN_EMBD,
                                gguf.MODEL_TENSOR.OUTPUT,
                            ]
                        )
                    ):
                        data = gguf.quantize_q1_3(data)
                        assert data.dtype == np.uint8
--- a/gguf-py/gguf/quants.py
+++ b/gguf-py/gguf/quants.py
@ -126,6 +126,10 @@ def quantize_q8_0(data: np.ndarray):
 # Q1_3 layout constants looked up from GGML_QUANT_SIZES
 # (presumably elements per block and bytes per block -- confirm against the table).
 __q1_3_block_size, __q1_3_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q1_3]
 def can_quantize_to_q1_3(n: np.ndarray) -> bool:
    """Return whether the last dimension of *n* is a whole multiple of the Q1_3 block size."""
    leftover = n.shape[-1] % __q1_3_block_size
    return leftover == 0
 def __quantize_q1_3_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:
    """Return shape *s* with its last dimension rescaled for Q1_3 storage.

    The last dimension is floor-divided by the Q1_3 block size and multiplied
    by the Q1_3 type size; all leading dimensions are kept unchanged.
    """
    block_count = s[-1] // __q1_3_block_size
    packed_last_dim = block_count * __q1_3_type_size
    return (*s[:-1], packed_last_dim)