From 961e2938333ce6e1fa723a7be09e984093950864 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Wed, 26 Jun 2024 16:24:40 -0400 Subject: [PATCH] convert-hf : simplify BitNet pre-quantization This still results in the exact same tensor weights and scales, but it reveals some weirdness in the current algorithm. --- convert-hf-to-gguf.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index ec66316ee..9f4094194 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -265,7 +265,10 @@ class Model: break for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)): - data: np.ndarray = data # type hint + data: np.ndarray # type hint + if len(data.shape) == 0: + # otherwise single-value tensors get squeezed + data = data.reshape((1,)) n_dims = len(data.shape) data_dtype = data.dtype data_qtype: gguf.GGMLQuantizationType | None = None @@ -336,7 +339,7 @@ class Model: shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape # reverse shape to make it similar to the internal ggml dimension order - shape_str = f"{{{', '.join(str(n) for n in reversed(shape)) or '1'}}}" + shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" # n_dims is implicit in the shape logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") @@ -1446,12 +1449,13 @@ class BitnetModel(Model): def weight_quant(self, weight): dtype = weight.dtype weight = weight.float() - s = 1 / weight.abs().mean().clamp(min=1e-5) - weight = (weight * s).round().clamp(-1, 1) / s - scale = weight.abs().max().unsqueeze(0) - weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype) - weight = torch.sign(weight).type(dtype) - return weight.type(dtype), scale.type(torch.float32) + scale = weight.abs().mean().clamp(min=1e-5) + iscale = 1 / scale + weight = (weight * iscale).round().clamp(-1, 1) + # TODO: use the scale directly instead of inverting it twice + # (this is also unnecessarily doubly inverted upstream) + # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10 + return weight.type(dtype), (1 / iscale).type(torch.float32) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: new_name = self.map_tensor_name(name)