Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-26 03:14:35 +00:00)
gguf-py : fix and simplify quantized shape round-trip (#7483)
* gguf-py : fix and simplify quantized shape round-trip
* gguf-py : remove unused import
parent: d041d2ceaa
commit: b83bab15a5
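At its core, the change routes every conversion between a quantized tensor's logical element shape and the byte shape of its uint8 block data through a single helper pair, quant_shape_to_byte_shape and quant_shape_from_byte_shape, so the two representations round-trip losslessly. A minimal sketch of that round-trip, assuming the gguf-py package at this commit is importable (the choice of Q4_0 and the (4096, 4096) shape are only illustrative):

from gguf.constants import GGML_QUANT_SIZES, GGMLQuantizationType
from gguf.quants import quant_shape_from_byte_shape, quant_shape_to_byte_shape

qtype = GGMLQuantizationType.Q4_0
block_size, type_size = GGML_QUANT_SIZES[qtype]  # Q4_0: 32 elements stored in 18 bytes

shape = (4096, 4096)  # logical element shape
byte_shape = quant_shape_to_byte_shape(shape, qtype)
assert byte_shape == (4096, 4096 // block_size * type_size)  # (4096, 2304)

# ... and back, exactly:
assert quant_shape_from_byte_shape(byte_shape, qtype) == shape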
convert-hf-to-gguf.py

@@ -313,11 +313,10 @@ class Model:
                         data = data.astype(np.float32)
                     data_qtype = gguf.GGMLQuantizationType.F32
 
-            block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
+            shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
+
             # reverse shape to make it similar to the internal ggml dimension order
-            shape_str = f"""{{{', '.join(str(n) for n in reversed(
-                (*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
-            )}}}"""
+            shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
 
             # n_dims is implicit in the shape
             logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
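The old converter code inlined the byte-to-element arithmetic directly into the f-string; the new version computes shape once and only formats it. An illustrative check of the new shape_str expression (the shape value is made up):

shape = (4096, 32000)
shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
assert shape_str == "{32000, 4096}"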
gguf-py/gguf/gguf_reader.py

@@ -12,6 +12,8 @@ from typing import Any, Literal, NamedTuple, TypeVar, Union
 import numpy as np
 import numpy.typing as npt
 
+from .quants import quant_shape_to_byte_shape
+
 if __name__ == "__main__":
     import sys
     from pathlib import Path
@@ -251,6 +253,7 @@ class GGUFReader:
             tensor_names.add(tensor_name)
             ggml_type = GGMLQuantizationType(raw_dtype[0])
             n_elems = int(np.prod(dims))
+            np_dims = tuple(reversed(dims.tolist()))
             block_size, type_size = GGML_QUANT_SIZES[ggml_type]
             n_bytes = n_elems * type_size // block_size
             data_offs = int(start_offs + offset_tensor[0])
@@ -279,6 +282,7 @@ class GGUFReader:
             else:
                 item_count = n_bytes
                 item_type = np.uint8
+                np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
             tensors.append(ReaderTensor(
                 name = tensor_name,
                 tensor_type = ggml_type,
@@ -286,7 +290,7 @@ class GGUFReader:
                 n_elements = n_elems,
                 n_bytes = n_bytes,
                 data_offset = data_offs,
-                data = self._get(data_offs, item_type, item_count),
+                data = self._get(data_offs, item_type, item_count).reshape(np_dims),
                 field = field,
             ))
         self.tensors = tensors
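Taken together, the reader hunks make ReaderTensor.data come back already reshaped: the stored dims are reversed into numpy order, and for quantized types the last dimension is converted to its byte width before the reshape. A hypothetical usage sketch (the file name is a placeholder):

from gguf.gguf_reader import GGUFReader

reader = GGUFReader("model.gguf")  # placeholder path
for tensor in reader.tensors:
    # For a quantized tensor, data is uint8 and data.shape is the byte shape
    # (rows x bytes per row); n_elements still counts logical elements.
    print(tensor.name, tensor.tensor_type.name, tensor.data.shape, tensor.n_elements)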
gguf-py/gguf/gguf_writer.py

@@ -13,7 +13,6 @@ from string import ascii_letters, digits
 import numpy as np
 
 from .constants import (
-    GGML_QUANT_SIZES,
     GGUF_DEFAULT_ALIGNMENT,
     GGUF_MAGIC,
     GGUF_VERSION,
@@ -26,6 +25,8 @@ from .constants import (
     TokenType,
 )
 
+from .quants import quant_shape_from_byte_shape
+
 logger = logging.getLogger(__name__)
 
 
@@ -229,10 +230,7 @@ class GGUFWriter:
         else:
             dtype = raw_dtype
             if tensor_dtype == np.uint8:
-                block_size, type_size = GGML_QUANT_SIZES[raw_dtype]
-                if tensor_shape[-1] % type_size != 0:
-                    raise ValueError(f"Quantized tensor row size ({tensor_shape[-1]}) is not a multiple of {dtype.name} type size ({type_size})")
-                tensor_shape = tuple(tensor_shape[:-1]) + (tensor_shape[-1] // type_size * block_size,)
+                tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
         n_dims = len(tensor_shape)
         self.ti_data += self._pack("I", n_dims)
         for i in range(n_dims):
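On the write side the same helper runs in reverse: when add_tensor_info receives uint8 data tagged with a quantized raw_dtype, it now recovers the logical element shape with quant_shape_from_byte_shape instead of repeating the block arithmetic inline. A hedged sketch of writing a pre-quantized tensor (the path, arch, and tensor name are placeholders; (4096, 2304) is the Q4_0 byte shape of a (4096, 4096) tensor):

import numpy as np
from gguf import GGUFWriter, GGMLQuantizationType

writer = GGUFWriter("out.gguf", arch="llama")  # placeholder output path and arch
data = np.zeros((4096, 2304), dtype=np.uint8)  # byte shape: 4096 // 32 * 18 = 2304
# add_tensor forwards to add_tensor_info, which converts the byte shape back
# to the logical (4096, 4096) before writing the tensor metadata.
writer.add_tensor("blk.0.attn_q.weight", data, raw_dtype=GGMLQuantizationType.Q4_0)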
gguf-py/gguf/quants.py

@@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import Callable
+from typing import Callable, Sequence
 
 from numpy.typing import DTypeLike
 
@@ -9,6 +9,20 @@ from .lazy import LazyNumpyTensor
 import numpy as np
 
 
+def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
+    block_size, type_size = GGML_QUANT_SIZES[quant_type]
+    if shape[-1] % block_size != 0:
+        raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
+    return (*shape[:-1], shape[-1] // block_size * type_size)
+
+
+def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
+    block_size, type_size = GGML_QUANT_SIZES[quant_type]
+    if shape[-1] % type_size != 0:
+        raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
+    return (*shape[:-1], shape[-1] // type_size * block_size)
+
+
 # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
 def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
     n = n.astype(np.float32, copy=False).view(np.int32)
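The two helpers are exact inverses whenever the divisibility checks pass. A worked example for Q8_0, whose 32-element block is stored as 34 bytes (a float16 scale plus 32 int8 quants), with the constants taken from GGML_QUANT_SIZES:

from gguf.constants import GGML_QUANT_SIZES, GGMLQuantizationType
from gguf.quants import quant_shape_from_byte_shape, quant_shape_to_byte_shape

q8_0 = GGMLQuantizationType.Q8_0
assert GGML_QUANT_SIZES[q8_0] == (32, 34)  # (block_size, type_size)

assert quant_shape_to_byte_shape((64, 128), q8_0) == (64, 136)  # 128 // 32 * 34
assert quant_shape_from_byte_shape((64, 136), q8_0) == (64, 128)

# A row that is not a whole number of blocks is rejected:
try:
    quant_shape_to_byte_shape((64, 100), q8_0)
except ValueError as e:
    print(e)  # ... not a multiple of Q8_0 block size (32)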
gguf-py/scripts/gguf-new-metadata.py

@@ -118,9 +118,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
 
     for tensor in reader.tensors:
        total_bytes += tensor.n_bytes
-        # Dimensions are written in reverse order, so flip them first
-        shape = np.flipud(tensor.shape).tolist()
-        writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
+        writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
 
     bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
 
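The script change works because the reader now hands back tensor.data already reshaped into numpy order, which is exactly what the manual np.flipud flip used to produce for unquantized tensors (and, for quantized ones, the byte shape that add_tensor_info now understands). A small equivalence sketch (the dims values are made up; the names mirror the old code, not a public API):

import numpy as np

dims_gguf_order = [4096, 32000]                  # tensor.shape as stored in the file
old_shape = np.flipud(dims_gguf_order).tolist()  # what the removed code computed
new_shape = list(reversed(dims_gguf_order))      # numpy order, as tensor.data.shape now reports
assert old_shape == new_shape == [32000, 4096]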