# llama.cpp/gguf-py/gguf/quants.py

from __future__ import annotations
from typing import Callable, Sequence

from numpy.typing import DTypeLike

from .constants import GGML_QUANT_SIZES, GGMLQuantizationType
from .lazy import LazyNumpyTensor

import numpy as np


def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
    """Translate an element shape into the shape of its quantized byte layout.

    Only the last dimension is rewritten: it is divided by the block size of
    ``quant_type`` and multiplied by its type size (both looked up in
    ``GGML_QUANT_SIZES``). Leading dimensions are passed through unchanged.

    Raises:
        ValueError: if the last dimension is not a multiple of the block size.
    """
    block_size, type_size = GGML_QUANT_SIZES[quant_type]
    n_blocks, leftover = divmod(shape[-1], block_size)
    if leftover != 0:
        raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
    leading_dims = shape[:-1]
    return (*leading_dims, n_blocks * type_size)


def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
    """Translate a quantized byte shape back into the element shape.

    Inverse of the element-to-byte conversion: the last dimension is divided
    by the type size of ``quant_type`` and multiplied by its block size (both
    looked up in ``GGML_QUANT_SIZES``). Leading dimensions are unchanged.

    Raises:
        ValueError: if the last dimension is not a multiple of the type size.
    """
    block_size, type_size = GGML_QUANT_SIZES[quant_type]
    n_blocks, leftover = divmod(shape[-1], type_size)
    if leftover != 0:
        raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
    leading_dims = shape[:-1]
    return (*leading_dims, n_blocks * block_size)


# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
    """Return the bf16 bit patterns (as uint16) of the float32 values in ``n``.

    The input is cast to float32 if needed and reinterpreted as uint32; the
    result keeps the upper 16 bits of each value after rounding.
    """
    bits = n.astype(np.float32, copy=False).view(np.uint32)

    # NaNs (exponent all ones, non-zero mantissa) are forced to quiet NaNs
    # by keeping the upper half and setting the quiet bit.
    is_nan = (bits & 0x7fffffff) > 0x7f800000
    quiet_nan = (bits & np.uint32(0xffff0000)) | np.uint32(64 << 16)
    bits = np.where(is_nan, quiet_nan, bits)

    # Round to nearest even: the bias depends on the lowest bit that is kept.
    # The addition is done in uint64 so it cannot wrap around.
    kept_lsb = (bits >> 16) & 1
    rounded = (np.uint64(bits) + (0x7fff + kept_lsb)) >> 16
    return rounded.astype(np.uint16)


# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time
def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:
    """Apply ``func`` to groups of rows of ``arr`` and assemble the results.

    ``arr`` is flattened to 2D (rows of length ``arr.shape[-1]``), split into
    groups of rows, and each group is passed to ``func``. The flattened
    results are concatenated into one buffer of dtype ``otype`` and reshaped
    to ``oshape``.
    """
    flat_rows = arr.reshape((-1, arr.shape[-1]))

    # Total number of output elements; the output buffer is 1D until the end.
    total_elems = int(np.prod(oshape, dtype=np.int64))
    result = np.empty(shape=total_elems, dtype=otype)

    # compute over groups of 16 rows (arbitrary, but seems good for performance);
    # always use at least one group, even with fewer than 16 rows
    group_count = max(flat_rows.shape[0] // 16, 1)
    pieces = [func(chunk).ravel() for chunk in np.array_split(flat_rows, group_count)]
    np.concatenate(pieces, axis=0, out=result)
    return result.reshape(oshape)


def __quantize_bf16_array(n: np.ndarray) -> np.ndarray:
    """Convert ``n`` to bf16 bit patterns (uint16), keeping the input shape.

    The conversion runs over grouped rows via ``__apply_over_grouped_rows``.
    """
    out_shape = n.shape
    return __apply_over_grouped_rows(
        __compute_fp32_to_bf16,
        arr=n,
        otype=np.uint16,
        oshape=out_shape,
    )


# Wrapped variant of __quantize_bf16_array, used by quantize_bf16 for LazyNumpyTensor inputs.
# NOTE(review): meta_noop=np.uint16 presumably declares the result dtype (shape unchanged)
# for the lazy/metadata-only path — confirm against LazyNumpyTensor._wrap_fn in .lazy.
__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.uint16)


def quantize_bf16(n: np.ndarray):
    """Quantize ``n`` to bf16, returned as ``uint16`` bit patterns.

    ``LazyNumpyTensor`` inputs are routed through the lazily-wrapped
    implementation; any other input is converted right away with the
    array implementation.
    """
    # isinstance (rather than an exact type match) so that subclasses of
    # LazyNumpyTensor also take the lazy path instead of being handed to the
    # eager NumPy code.
    if isinstance(n, LazyNumpyTensor):
        return __quantize_bf16_lazy(n)
    return __quantize_bf16_array(n)


# Block size and type size of Q8_0, unpacked in the same (block_size, type_size)
# order used by the quant_shape_* helpers; the Q8_0 code uses them for shape math.
__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0]


def can_quantize_to_q8_0(n: np.ndarray) -> bool:
    """Return True when the last dimension of ``n`` is a whole number of Q8_0 blocks."""
    _, leftover = divmod(n.shape[-1], __q8_block_size)
    return leftover == 0


# round away from zero
# ref: https://stackoverflow.com/a/59143326/22827863
def np_roundf(n: np.ndarray) -> np.ndarray:
    """Round each element to the nearest integer, with halves going away from zero.

    Works on the magnitude and re-applies the sign at the end.
    """
    magnitude = abs(n)
    whole = np.floor(magnitude)
    # floor(2 * fraction) is 1 when the fractional part is >= 0.5, else 0
    bump = np.floor(2 * (magnitude - whole))
    return np.sign(n) * (whole + bump)


def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:
    """Map an element shape to the byte shape of its Q8_0 encoding.

    Only the last dimension changes: number of blocks times the Q8_0 type size.
    No divisibility check is performed here.
    """
    n_blocks = s[-1] // __q8_block_size
    leading_dims = s[:-1]
    return (*leading_dims, n_blocks * __q8_type_size)


# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c
def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray:
    """Quantize ``n`` to Q8_0 and return the packed bytes as a uint8 array.

    The input is viewed as blocks of ``__q8_block_size`` float32 values. Each
    block is encoded as a float16 scale followed by one int8 value per
    element, and the result is reshaped with ``__quantize_q8_0_shape_change``.
    """
    in_shape = n.shape
    assert in_shape[-1] % __q8_block_size == 0

    block_count = n.size // __q8_block_size
    blocks = n.reshape((block_count, __q8_block_size)).astype(np.float32, copy=False)

    # Per-block scale: the largest magnitude maps to 127.
    scale = abs(blocks).max(axis=1, keepdims=True) / 127
    # Multiply by the reciprocal (0 for all-zero blocks) instead of dividing;
    # the divide-by-zero warning from 1 / scale is expected and silenced.
    with np.errstate(divide="ignore"):
        inv_scale = np.where(scale == 0, 0, 1 / scale)
    quants = np_roundf(blocks * inv_scale)

    # (n_blocks, 2): float16 scale reinterpreted as raw bytes
    scale_bytes = scale.astype(np.float16).view(np.uint8)
    # (n_blocks, block_size): int8 values reinterpreted as raw bytes
    quant_bytes = quants.astype(np.int8).view(np.uint8)

    assert scale_bytes.shape[1] + quant_bytes.shape[1] == __q8_type_size

    packed = np.concatenate([scale_bytes, quant_bytes], axis=1)
    return packed.reshape(__quantize_q8_0_shape_change(in_shape))


def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray:
    """Quantize ``n`` to Q8_0 bytes (uint8), processing rows in groups.

    The output shape is the Q8_0 byte shape derived from ``n.shape``.
    """
    out_shape = __quantize_q8_0_shape_change(n.shape)
    return __apply_over_grouped_rows(
        __quantize_q8_0_rows,
        arr=n,
        otype=np.uint8,
        oshape=out_shape,
    )


# Wrapped variant of __quantize_q8_0_array, used by quantize_q8_0 for LazyNumpyTensor inputs.
# NOTE(review): meta_noop presumably describes the result without computing it
# (uint8 dtype, shape mapped by __quantize_q8_0_shape_change) — confirm against
# LazyNumpyTensor._wrap_fn in .lazy.
__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn(
    __quantize_q8_0_array,
    meta_noop=(np.uint8, __quantize_q8_0_shape_change),
)


def quantize_q8_0(data: np.ndarray):
    """Quantize ``data`` to Q8_0, returned as packed ``uint8`` bytes.

    ``LazyNumpyTensor`` inputs are routed through the lazily-wrapped
    implementation; any other input is quantized right away with the
    array implementation.
    """
    # isinstance (rather than an exact type match) so that subclasses of
    # LazyNumpyTensor also take the lazy path instead of being handed to the
    # eager NumPy code.
    if isinstance(data, LazyNumpyTensor):
        return __quantize_q8_0_lazy(data)
    return __quantize_q8_0_array(data)
convert-hf : support direct Q8_0 conversion (#7234) * convert-hf : support q8_0 conversion * convert-hf : add missing ftype This was messing with the checksums otherwise. * convert-hf : add missing ftype to Baichuan and Xverse I didn't notice these on my first pass. 2024-05-13 18:10:51 +00:00			`from __future__ import annotations`
gguf-py : fix and simplify quantized shape round-trip (#7483) * gguf-py : fix and simplify quantized shape round-trip * gguf-py : remove unused import 2024-05-25 01:11:48 +00:00			`from typing import Callable, Sequence`
convert-hf : support direct Q8_0 conversion (#7234) * convert-hf : support q8_0 conversion * convert-hf : add missing ftype This was messing with the checksums otherwise. * convert-hf : add missing ftype to Baichuan and Xverse I didn't notice these on my first pass. 2024-05-13 18:10:51 +00:00
			`from numpy.typing import DTypeLike`

			`from .constants import GGML_QUANT_SIZES, GGMLQuantizationType`
			`from .lazy import LazyNumpyTensor`

			`import numpy as np`


gguf-py : fix and simplify quantized shape round-trip (#7483) * gguf-py : fix and simplify quantized shape round-trip * gguf-py : remove unused import 2024-05-25 01:11:48 +00:00			`def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):`
			`block_size, type_size = GGML_QUANT_SIZES[quant_type]`
			`if shape[-1] % block_size != 0:`
			`raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")`
			`return (*shape[:-1], shape[-1] // block_size * type_size)`


			`def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):`
			`block_size, type_size = GGML_QUANT_SIZES[quant_type]`
			`if shape[-1] % type_size != 0:`
			`raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")`
			`return (*shape[:-1], shape[-1] // type_size * block_size)`


convert-hf : support direct Q8_0 conversion (#7234) * convert-hf : support q8_0 conversion * convert-hf : add missing ftype This was messing with the checksums otherwise. * convert-hf : add missing ftype to Baichuan and Xverse I didn't notice these on my first pass. 2024-05-13 18:10:51 +00:00			`# same as ggml_compute_fp32_to_bf16 in ggml-impl.h`
			`def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:`
Fix conversion of unnormalized BF16->BF16 weights (#7843) * add truncate_bf16 * truncate intermediate fp32 if converting bf16 to bf16 * fix masking in __compute_fp32_to_bf16 * np.int16 no longer used * missing cast and additional numpy 2.x fix * ggml-impl : do not flush bf16 subnormals to zero * ggml : add reference fp32 to bf16 conversion The fast version is no longer equivalent for all platforms because of the handling of subnormal values. * gguf-py : remove flush to zero for bf16 subnormals * gguf-py : remove float32 truncation to bf16 Rounding achieves the same thing in the cases where this was used. * missed prototype update in merge * merge cleanup --------- Co-authored-by: Francis Couture-Harpin <git@compilade.net> 2024-08-02 19:11:39 +00:00			`n = n.astype(np.float32, copy=False).view(np.uint32)`
convert-hf : support direct Q8_0 conversion (#7234) * convert-hf : support q8_0 conversion * convert-hf : add missing ftype This was messing with the checksums otherwise. * convert-hf : add missing ftype to Baichuan and Xverse I didn't notice these on my first pass. 2024-05-13 18:10:51 +00:00			`# force nan to quiet`
Fix conversion of unnormalized BF16->BF16 weights (#7843) * add truncate_bf16 * truncate intermediate fp32 if converting bf16 to bf16 * fix masking in __compute_fp32_to_bf16 * np.int16 no longer used * missing cast and additional numpy 2.x fix * ggml-impl : do not flush bf16 subnormals to zero * ggml : add reference fp32 to bf16 conversion The fast version is no longer equivalent for all platforms because of the handling of subnormal values. * gguf-py : remove flush to zero for bf16 subnormals * gguf-py : remove float32 truncation to bf16 Rounding achieves the same thing in the cases where this was used. * missed prototype update in merge * merge cleanup --------- Co-authored-by: Francis Couture-Harpin <git@compilade.net> 2024-08-02 19:11:39 +00:00			`n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n)`
convert-hf : support direct Q8_0 conversion (#7234) * convert-hf : support q8_0 conversion * convert-hf : add missing ftype This was messing with the checksums otherwise. * convert-hf : add missing ftype to Baichuan and Xverse I didn't notice these on my first pass. 2024-05-13 18:10:51 +00:00			`# round to nearest even`
Fix conversion of unnormalized BF16->BF16 weights (#7843) * add truncate_bf16 * truncate intermediate fp32 if converting bf16 to bf16 * fix masking in __compute_fp32_to_bf16 * np.int16 no longer used * missing cast and additional numpy 2.x fix * ggml-impl : do not flush bf16 subnormals to zero * ggml : add reference fp32 to bf16 conversion The fast version is no longer equivalent for all platforms because of the handling of subnormal values. * gguf-py : remove flush to zero for bf16 subnormals * gguf-py : remove float32 truncation to bf16 Rounding achieves the same thing in the cases where this was used. * missed prototype update in merge * merge cleanup --------- Co-authored-by: Francis Couture-Harpin <git@compilade.net> 2024-08-02 19:11:39 +00:00			`n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16`
			`return n.astype(np.uint16)`
convert-hf : support direct Q8_0 conversion (#7234) * convert-hf : support q8_0 conversion * convert-hf : add missing ftype This was messing with the checksums otherwise. * convert-hf : add missing ftype to Baichuan and Xverse I didn't notice these on my first pass. 2024-05-13 18:10:51 +00:00

			`# This is faster than np.vectorize and np.apply_along_axis because it works on more than one row at a time`
			`def __apply_over_grouped_rows(func: Callable[[np.ndarray], np.ndarray], arr: np.ndarray, otype: DTypeLike, oshape: tuple[int, ...]) -> np.ndarray:`
			`rows = arr.reshape((-1, arr.shape[-1]))`
			`osize = 1`
			`for dim in oshape:`
			`osize *= dim`
			`out = np.empty(shape=osize, dtype=otype)`
			`# compute over groups of 16 rows (arbitrary, but seems good for performance)`
Refactor lora adapter support (#8332) * lora: load to devide buft * add patch tensor function * correct tensor patch * llama_lora_adapter_apply * correct ggml_backend_tensor_copy * add llm_build_mm * fix auto merge * update based on review comments * add convert script * no more transpose A * add f16 convert * add metadata check * add sanity check * fix ftype * add requirements * fix requirements * fix outfile * conversion: only allow selected models * fix types * cuda : do not use dmmv if the tensor does not have enough cols * llama : lora fixes * do not disable mmap with lora Co-authored-by: slaren <slarengh@gmail.com> * llm_build_lora_mm_id * convert_lora : MoE LoRA conversion support * convert_lora : prefer safetensors, similarly to convert_hf * convert_hf : simplify modify_tensors for InternLM2 * convert_lora : lazy conversion * llama : load and use alpha from LoRA adapters * llama : use llm_build_lora_mm in most model graphs * auto scale * Revert "auto scale" This reverts commit 42415a4874e0f963e4aca6796ea5dfb97cd17464. * remove redundant params * Apply suggestions from code review Co-authored-by: slaren <slarengh@gmail.com> * change kv metadata * move add_type to __init__ * convert_hf : move add_type to main() * convert_lora : use the GGUFWriter from Model instead of overwriting it --------- Co-authored-by: slaren <slarengh@gmail.com> Co-authored-by: Francis Couture-Harpin <git@compilade.net> 2024-07-15 18:50:47 +00:00			`n_groups = (rows.shape[0] // 16) or 1`
convert-hf : support direct Q8_0 conversion (#7234) * convert-hf : support q8_0 conversion * convert-hf : add missing ftype This was messing with the checksums otherwise. * convert-hf : add missing ftype to Baichuan and Xverse I didn't notice these on my first pass. 2024-05-13 18:10:51 +00:00			`np.concatenate([func(group).ravel() for group in np.array_split(rows, n_groups)], axis=0, out=out)`
			`return out.reshape(oshape)`


			`def __quantize_bf16_array(n: np.ndarray) -> np.ndarray:`
Fix conversion of unnormalized BF16->BF16 weights (#7843) * add truncate_bf16 * truncate intermediate fp32 if converting bf16 to bf16 * fix masking in __compute_fp32_to_bf16 * np.int16 no longer used * missing cast and additional numpy 2.x fix * ggml-impl : do not flush bf16 subnormals to zero * ggml : add reference fp32 to bf16 conversion The fast version is no longer equivalent for all platforms because of the handling of subnormal values. * gguf-py : remove flush to zero for bf16 subnormals * gguf-py : remove float32 truncation to bf16 Rounding achieves the same thing in the cases where this was used. * missed prototype update in merge * merge cleanup --------- Co-authored-by: Francis Couture-Harpin <git@compilade.net> 2024-08-02 19:11:39 +00:00			`return __apply_over_grouped_rows(__compute_fp32_to_bf16, arr=n, otype=np.uint16, oshape=n.shape)`
convert-hf : support direct Q8_0 conversion (#7234) * convert-hf : support q8_0 conversion * convert-hf : add missing ftype This was messing with the checksums otherwise. * convert-hf : add missing ftype to Baichuan and Xverse I didn't notice these on my first pass. 2024-05-13 18:10:51 +00:00

Fix conversion of unnormalized BF16->BF16 weights (#7843) * add truncate_bf16 * truncate intermediate fp32 if converting bf16 to bf16 * fix masking in __compute_fp32_to_bf16 * np.int16 no longer used * missing cast and additional numpy 2.x fix * ggml-impl : do not flush bf16 subnormals to zero * ggml : add reference fp32 to bf16 conversion The fast version is no longer equivalent for all platforms because of the handling of subnormal values. * gguf-py : remove flush to zero for bf16 subnormals * gguf-py : remove float32 truncation to bf16 Rounding achieves the same thing in the cases where this was used. * missed prototype update in merge * merge cleanup --------- Co-authored-by: Francis Couture-Harpin <git@compilade.net> 2024-08-02 19:11:39 +00:00			`__quantize_bf16_lazy = LazyNumpyTensor._wrap_fn(__quantize_bf16_array, meta_noop=np.uint16)`
convert-hf : support direct Q8_0 conversion (#7234) * convert-hf : support q8_0 conversion * convert-hf : add missing ftype This was messing with the checksums otherwise. * convert-hf : add missing ftype to Baichuan and Xverse I didn't notice these on my first pass. 2024-05-13 18:10:51 +00:00

			`def quantize_bf16(n: np.ndarray):`
			`if type(n) is LazyNumpyTensor:`
			`return __quantize_bf16_lazy(n)`
			`else:`
			`return __quantize_bf16_array(n)`


			`__q8_block_size, __q8_type_size = GGML_QUANT_SIZES[GGMLQuantizationType.Q8_0]`


			`def can_quantize_to_q8_0(n: np.ndarray) -> bool:`
			`return n.shape[-1] % __q8_block_size == 0`


			`# round away from zero`
			`# ref: https://stackoverflow.com/a/59143326/22827863`
			`def np_roundf(n: np.ndarray) -> np.ndarray:`
			`a = abs(n)`
			`floored = np.floor(a)`
			`b = floored + np.floor(2 * (a - floored))`
			`return np.sign(n) * b`


			`def __quantize_q8_0_shape_change(s: tuple[int, ...]) -> tuple[int, ...]:`
			`return (*s[:-1], s[-1] // __q8_block_size * __q8_type_size)`


			`# Implementation of Q8_0 with bit-exact same results as reference implementation in ggml-quants.c`
			`def __quantize_q8_0_rows(n: np.ndarray) -> np.ndarray:`
			`shape = n.shape`
			`assert shape[-1] % __q8_block_size == 0`

			`n_blocks = n.size // __q8_block_size`

			`blocks = n.reshape((n_blocks, __q8_block_size)).astype(np.float32, copy=False)`

			`d = abs(blocks).max(axis=1, keepdims=True) / 127`
			`with np.errstate(divide="ignore"):`
			`id = np.where(d == 0, 0, 1 / d)`
			`qs = np_roundf(blocks * id)`

			`# (n_blocks, 2)`
			`d = d.astype(np.float16).view(np.uint8)`
			`# (n_blocks, block_size)`
			`qs = qs.astype(np.int8).view(np.uint8)`

			`assert d.shape[1] + qs.shape[1] == __q8_type_size`

			`return np.concatenate([d, qs], axis=1).reshape(__quantize_q8_0_shape_change(shape))`


			`def __quantize_q8_0_array(n: np.ndarray) -> np.ndarray:`
			`return __apply_over_grouped_rows(__quantize_q8_0_rows, arr=n, otype=np.uint8, oshape=__quantize_q8_0_shape_change(n.shape))`


			`__quantize_q8_0_lazy = LazyNumpyTensor._wrap_fn(`
			`__quantize_q8_0_array,`
			`meta_noop=(np.uint8, __quantize_q8_0_shape_change),`
			`)`


			`def quantize_q8_0(data: np.ndarray):`
			`if type(data) is LazyNumpyTensor:`
			`return __quantize_q8_0_lazy(data)`
			`else:`
			`return __quantize_q8_0_array(data)`