mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-28 12:24:35 +00:00
convert-hf : simplify BitNet pre-quantization
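This still results in the exact same tensor weights and scales, but it reveals some weirdness in the current algorithm: the scale is inverted twice (`iscale = 1 / scale`, then `1 / iscale` is returned) instead of being used directly, which mirrors the double inversion in the upstream reference implementation.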
This commit is contained in:
parent
89dc3b254c
commit
961e293833
@@ -265,7 +265,10 @@ class Model:
|
|||||||
break
|
break
|
||||||
|
|
||||||
for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
|
for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)):
|
||||||
data: np.ndarray = data # type hint
|
data: np.ndarray # type hint
|
||||||
|
if len(data.shape) == 0:
|
||||||
|
# otherwise single-value tensors get squeezed
|
||||||
|
data = data.reshape((1,))
|
||||||
n_dims = len(data.shape)
|
n_dims = len(data.shape)
|
||||||
data_dtype = data.dtype
|
data_dtype = data.dtype
|
||||||
data_qtype: gguf.GGMLQuantizationType | None = None
|
data_qtype: gguf.GGMLQuantizationType | None = None
|
||||||
@@ -336,7 +339,7 @@ class Model:
|
|||||||
shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
|
shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
|
||||||
|
|
||||||
# reverse shape to make it similar to the internal ggml dimension order
|
# reverse shape to make it similar to the internal ggml dimension order
|
||||||
shape_str = f"{{{', '.join(str(n) for n in reversed(shape)) or '1'}}}"
|
shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
|
||||||
|
|
||||||
# n_dims is implicit in the shape
|
# n_dims is implicit in the shape
|
||||||
logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
|
logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
|
||||||
@@ -1446,12 +1449,13 @@ class BitnetModel(Model):
|
|||||||
def weight_quant(self, weight):
|
def weight_quant(self, weight):
|
||||||
dtype = weight.dtype
|
dtype = weight.dtype
|
||||||
weight = weight.float()
|
weight = weight.float()
|
||||||
s = 1 / weight.abs().mean().clamp(min=1e-5)
|
scale = weight.abs().mean().clamp(min=1e-5)
|
||||||
weight = (weight * s).round().clamp(-1, 1) / s
|
iscale = 1 / scale
|
||||||
scale = weight.abs().max().unsqueeze(0)
|
weight = (weight * iscale).round().clamp(-1, 1)
|
||||||
weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
|
# TODO: use the scale directly instead of inverting it twice
|
||||||
weight = torch.sign(weight).type(dtype)
|
# (this is also unnecessarily doubly inverted upstream)
|
||||||
return weight.type(dtype), scale.type(torch.float32)
|
# ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10
|
||||||
|
return weight.type(dtype), (1 / iscale).type(torch.float32)
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
new_name = self.map_tensor_name(name)
|
new_name = self.map_tensor_name(name)
|
||||||
|
Loading…
Reference in New Issue
Block a user