mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 19:50:17 +00:00
parent
97c1549808
commit
b43ebde3b0
@ -10,7 +10,7 @@ import re
|
|||||||
import sys
|
import sys
|
||||||
from enum import IntEnum
|
from enum import IntEnum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional
|
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
@ -487,7 +487,8 @@ class MPTModel(Model):
|
|||||||
# map tensor names
|
# map tensor names
|
||||||
if "scales" in name:
|
if "scales" in name:
|
||||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
|
||||||
new_name = new_name.replace("scales", "act.scales")
|
if new_name is not None:
|
||||||
|
new_name = new_name.replace("scales", "act.scales")
|
||||||
else:
|
else:
|
||||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||||
if new_name is None:
|
if new_name is None:
|
||||||
@ -904,7 +905,7 @@ class QwenModel(Model):
|
|||||||
return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
|
return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:
|
def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
|
||||||
parts = [bytes([b]) for b in token]
|
parts = [bytes([b]) for b in token]
|
||||||
while True:
|
while True:
|
||||||
min_idx = None
|
min_idx = None
|
||||||
@ -1285,7 +1286,7 @@ def main() -> None:
|
|||||||
|
|
||||||
if args.awq_path:
|
if args.awq_path:
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
|
sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
|
||||||
from awq.apply_awq import add_scale_weights
|
from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
|
||||||
tmp_model_path = args.model / "weighted_model"
|
tmp_model_path = args.model / "weighted_model"
|
||||||
dir_model = tmp_model_path
|
dir_model = tmp_model_path
|
||||||
if tmp_model_path.is_dir():
|
if tmp_model_path.is_dir():
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import os
|
||||||
import struct
|
import struct
|
||||||
import sys
|
import sys
|
||||||
from enum import IntEnum
|
from enum import IntEnum
|
||||||
@ -9,7 +10,6 @@ from pathlib import Path
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
import os
|
|
||||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||||
import gguf
|
import gguf
|
||||||
@ -371,15 +371,11 @@ def handle_metadata(cfg, hp):
|
|||||||
params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
|
params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
|
||||||
else:
|
else:
|
||||||
raise ValueError('Unable to load metadata')
|
raise ValueError('Unable to load metadata')
|
||||||
vocab = convert.load_vocab(
|
vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
|
||||||
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
|
vocab_factory = convert.VocabFactory(vocab_path)
|
||||||
cfg.vocabtype)
|
vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
|
||||||
# FIXME: Respect cfg.vocab_dir?
|
|
||||||
svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
|
|
||||||
load_merges = cfg.vocabtype == 'bpe',
|
|
||||||
n_vocab = vocab.vocab_size)
|
|
||||||
convert.check_vocab_size(params, vocab)
|
convert.check_vocab_size(params, vocab)
|
||||||
return (params, vocab, svocab)
|
return params, vocab, special_vocab
|
||||||
|
|
||||||
|
|
||||||
def handle_args():
|
def handle_args():
|
||||||
|
@ -5,17 +5,16 @@ import json
|
|||||||
import os
|
import os
|
||||||
import struct
|
import struct
|
||||||
import sys
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
from typing import Any, BinaryIO, Sequence
|
from typing import Any, BinaryIO, Sequence
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||||
import gguf
|
import gguf
|
||||||
|
|
||||||
|
|
||||||
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
|
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,11 +1,13 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import torch
|
|
||||||
import os
|
|
||||||
from pprint import pprint
|
|
||||||
import sys
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from pprint import pprint
|
||||||
|
|
||||||
|
import torch
|
||||||
from sentencepiece import SentencePieceProcessor
|
from sentencepiece import SentencePieceProcessor
|
||||||
|
|
||||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||||
import gguf
|
import gguf
|
||||||
@ -69,7 +71,7 @@ def main():
|
|||||||
persimmon_model = torch.load(args.ckpt_path)
|
persimmon_model = torch.load(args.ckpt_path)
|
||||||
hparams = persimmon_model['args']
|
hparams = persimmon_model['args']
|
||||||
pprint(hparams)
|
pprint(hparams)
|
||||||
tensors = {}
|
tensors: dict[str, torch.Tensor] = {}
|
||||||
_flatten_dict(persimmon_model['model'], tensors, None)
|
_flatten_dict(persimmon_model['model'], tensors, None)
|
||||||
|
|
||||||
arch = gguf.MODEL_ARCH.PERSIMMON
|
arch = gguf.MODEL_ARCH.PERSIMMON
|
||||||
|
627
convert.py
627
convert.py
@ -17,58 +17,28 @@ import signal
|
|||||||
import struct
|
import struct
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
import warnings
|
|
||||||
import zipfile
|
import zipfile
|
||||||
from abc import ABCMeta, abstractmethod
|
from abc import ABCMeta, abstractmethod
|
||||||
from argparse import ArgumentParser
|
|
||||||
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import (
|
from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
|
||||||
IO,
|
|
||||||
TYPE_CHECKING,
|
|
||||||
Any,
|
|
||||||
Callable,
|
|
||||||
Iterable,
|
|
||||||
Literal,
|
|
||||||
Optional,
|
|
||||||
Tuple,
|
|
||||||
TypeVar,
|
|
||||||
)
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sentencepiece import SentencePieceProcessor
|
from sentencepiece import SentencePieceProcessor
|
||||||
|
|
||||||
try:
|
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||||
from transformers import AutoTokenizer
|
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||||
except ModuleNotFoundError as e:
|
import gguf
|
||||||
warnings.warn(f"Could not import AutoTokenizer from transformers: {e}")
|
|
||||||
|
|
||||||
# If NO_LOCAL_GGUF is not set, try to import gguf from the local gguf-py directory
|
if TYPE_CHECKING:
|
||||||
if "NO_LOCAL_GGUF" not in os.environ:
|
from typing import TypeAlias
|
||||||
# Use absolute path to the gguf-py directory
|
|
||||||
gguf_py_dir = str(Path(__file__).resolve().parent / "gguf-py")
|
|
||||||
print(gguf_py_dir) # NOTE: Remove this once path is verified after changes are completed
|
|
||||||
if gguf_py_dir not in sys.path:
|
|
||||||
sys.path.insert(1, gguf_py_dir)
|
|
||||||
|
|
||||||
# Import gguf module
|
if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
|
||||||
try:
|
|
||||||
import gguf
|
|
||||||
except ModuleNotFoundError as e:
|
|
||||||
print(f"Could not import gguf: {e}")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
if TYPE_CHECKING: # NOTE: This isn't necessary.
|
|
||||||
from typing import TypeAlias # This can technically be omitted.
|
|
||||||
|
|
||||||
if hasattr(faulthandler, "register") and hasattr(signal, "SIGUSR1"):
|
|
||||||
faulthandler.register(signal.SIGUSR1)
|
faulthandler.register(signal.SIGUSR1)
|
||||||
|
|
||||||
# NOTE: n-dimensional arrays should be directly referenced
|
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
|
||||||
NDArray: TypeAlias = "np.ndarray[Any, Any]"
|
|
||||||
|
|
||||||
# Why is this here? LLAMA and GPT are technically the only compatible ARCHs.
|
|
||||||
ARCH = gguf.MODEL_ARCH.LLAMA
|
ARCH = gguf.MODEL_ARCH.LLAMA
|
||||||
|
|
||||||
DEFAULT_CONCURRENCY = 8
|
DEFAULT_CONCURRENCY = 8
|
||||||
@ -78,7 +48,6 @@ DEFAULT_CONCURRENCY = 8
|
|||||||
#
|
#
|
||||||
|
|
||||||
|
|
||||||
# TODO: Clean up and refactor data types
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class DataType:
|
class DataType:
|
||||||
name: str
|
name: str
|
||||||
@ -183,85 +152,65 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class Params:
|
class Params:
|
||||||
n_vocab: int
|
n_vocab: int
|
||||||
n_embd: int
|
n_embd: int
|
||||||
n_layer: int
|
n_layer: int
|
||||||
n_ctx: int
|
n_ctx: int
|
||||||
n_ff: int
|
n_ff: int
|
||||||
n_head: int
|
n_head: int
|
||||||
n_head_kv: int
|
n_head_kv: int
|
||||||
f_norm_eps: Optional[float] = None
|
n_experts: int | None = None
|
||||||
n_experts: Optional[int] = None
|
n_experts_used: int | None = None
|
||||||
n_experts_used: Optional[int] = None
|
f_norm_eps: float | None = None
|
||||||
|
|
||||||
rope_scaling_type: Optional[gguf.RopeScalingType] = None
|
rope_scaling_type: gguf.RopeScalingType | None = None
|
||||||
f_rope_freq_base: Optional[float] = None
|
f_rope_freq_base: float | None = None
|
||||||
f_rope_scale: Optional[float] = None
|
f_rope_scale: float | None = None
|
||||||
n_orig_ctx: Optional[int] = None
|
n_orig_ctx: int | None = None
|
||||||
rope_finetuned: Optional[bool] = None
|
rope_finetuned: bool | None = None
|
||||||
|
|
||||||
ftype: Optional[GGMLFileType] = None
|
ftype: GGMLFileType | None = None
|
||||||
|
|
||||||
# path to the directory containing the model files
|
# path to the directory containing the model files
|
||||||
path_model: Optional[Path] = None
|
path_model: Path | None = None
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def guessed(model: LazyModel) -> "Params":
|
def guessed(model: LazyModel) -> Params:
|
||||||
# try transformer naming first
|
# try transformer naming first
|
||||||
n_vocab, n_embd = (
|
n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
|
||||||
model["model.embed_tokens.weight"].shape
|
|
||||||
if "model.embed_tokens.weight" in model
|
|
||||||
else model["tok_embeddings.weight"].shape
|
|
||||||
)
|
|
||||||
|
|
||||||
# try transformer naming first
|
# try transformer naming first
|
||||||
if "model.layers.0.self_attn.q_proj.weight" in model:
|
if "model.layers.0.self_attn.q_proj.weight" in model:
|
||||||
n_layer = next(
|
n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
|
||||||
i
|
elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
|
||||||
for i in itertools.count()
|
n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
|
||||||
if f"model.layers.{i}.self_attn.q_proj.weight" not in model
|
|
||||||
)
|
|
||||||
elif (
|
|
||||||
"model.layers.0.self_attn.W_pack.weight" in model
|
|
||||||
): # next: try baichuan naming
|
|
||||||
n_layer = next(
|
|
||||||
i
|
|
||||||
for i in itertools.count()
|
|
||||||
if f"model.layers.{i}.self_attn.W_pack.weight" not in model
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
n_layer = next(
|
n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
|
||||||
i
|
|
||||||
for i in itertools.count()
|
|
||||||
if f"layers.{i}.attention.wq.weight" not in model
|
|
||||||
)
|
|
||||||
|
|
||||||
if n_layer < 1:
|
if n_layer < 1:
|
||||||
raise Exception(
|
raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
|
||||||
"failed to guess 'n_layer'. This model is unknown or unsupported.\n"
|
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
||||||
"Suggestion: provide 'config.json' of the model in the same directory containing model files."
|
|
||||||
)
|
|
||||||
|
|
||||||
n_head = n_embd // 128 # guessed
|
n_head = n_embd // 128 # guessed
|
||||||
n_mult = 256 # guessed
|
n_mult = 256 # guessed
|
||||||
|
|
||||||
# TODO: verify this
|
# TODO: verify this
|
||||||
n_ff = int(2 * (4 * n_embd) / 3)
|
n_ff = int(2 * (4 * n_embd) / 3)
|
||||||
n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)
|
n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)
|
||||||
|
|
||||||
return Params(
|
return Params(
|
||||||
n_vocab=n_vocab,
|
n_vocab = n_vocab,
|
||||||
n_embd=n_embd,
|
n_embd = n_embd,
|
||||||
n_layer=n_layer,
|
n_layer = n_layer,
|
||||||
n_ctx=-1,
|
n_ctx = -1,
|
||||||
n_ff=n_ff,
|
n_ff = n_ff,
|
||||||
n_head=n_head,
|
n_head = n_head,
|
||||||
n_head_kv=n_head,
|
n_head_kv = n_head,
|
||||||
f_norm_eps=1e-5,
|
f_norm_eps = 1e-5,
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def load_transformers_config(model: LazyModel, config_path: Path) -> "Params":
|
def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
|
||||||
config = json.load(open(config_path))
|
config = json.load(open(config_path))
|
||||||
|
|
||||||
rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
|
rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
|
||||||
@ -274,22 +223,20 @@ class Params:
|
|||||||
rope_scaling_type = gguf.RopeScalingType.LINEAR
|
rope_scaling_type = gguf.RopeScalingType.LINEAR
|
||||||
elif typ == "yarn":
|
elif typ == "yarn":
|
||||||
rope_scaling_type = gguf.RopeScalingType.YARN
|
rope_scaling_type = gguf.RopeScalingType.YARN
|
||||||
n_orig_ctx = rope_scaling["original_max_position_embeddings"]
|
n_orig_ctx = rope_scaling['original_max_position_embeddings']
|
||||||
rope_finetuned = rope_scaling["finetuned"]
|
rope_finetuned = rope_scaling['finetuned']
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError(f"Unknown rope scaling type: {typ}")
|
raise NotImplementedError(f'Unknown rope scaling type: {typ}')
|
||||||
|
|
||||||
if "max_sequence_length" in config:
|
if "max_sequence_length" in config:
|
||||||
n_ctx = config["max_sequence_length"]
|
n_ctx = config["max_sequence_length"]
|
||||||
elif "max_position_embeddings" in config:
|
elif "max_position_embeddings" in config:
|
||||||
n_ctx = config["max_position_embeddings"]
|
n_ctx = config["max_position_embeddings"]
|
||||||
else:
|
else:
|
||||||
raise Exception(
|
raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
|
||||||
"failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
|
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
||||||
"Suggestion: provide 'config.json' of the model in the same directory containing model files."
|
|
||||||
)
|
|
||||||
|
|
||||||
n_experts = None
|
n_experts = None
|
||||||
n_experts_used = None
|
n_experts_used = None
|
||||||
|
|
||||||
if "num_local_experts" in config:
|
if "num_local_experts" in config:
|
||||||
@ -297,30 +244,30 @@ class Params:
|
|||||||
n_experts_used = config["num_experts_per_tok"]
|
n_experts_used = config["num_experts_per_tok"]
|
||||||
|
|
||||||
return Params(
|
return Params(
|
||||||
n_vocab=config["vocab_size"],
|
n_vocab = config["vocab_size"],
|
||||||
n_embd=config["hidden_size"],
|
n_embd = config["hidden_size"],
|
||||||
n_layer=config["num_hidden_layers"],
|
n_layer = config["num_hidden_layers"],
|
||||||
n_ctx=n_ctx,
|
n_ctx = n_ctx,
|
||||||
n_ff=config["intermediate_size"],
|
n_ff = config["intermediate_size"],
|
||||||
n_head=(n_head := config["num_attention_heads"]),
|
n_head = (n_head := config["num_attention_heads"]),
|
||||||
n_head_kv=config.get("num_key_value_heads", n_head),
|
n_head_kv = config.get("num_key_value_heads", n_head),
|
||||||
n_experts=n_experts,
|
n_experts = n_experts,
|
||||||
n_experts_used=n_experts_used,
|
n_experts_used = n_experts_used,
|
||||||
f_norm_eps=config["rms_norm_eps"],
|
f_norm_eps = config["rms_norm_eps"],
|
||||||
f_rope_freq_base=config.get("rope_theta"),
|
f_rope_freq_base = config.get("rope_theta"),
|
||||||
rope_scaling_type=rope_scaling_type,
|
rope_scaling_type = rope_scaling_type,
|
||||||
f_rope_scale=f_rope_scale,
|
f_rope_scale = f_rope_scale,
|
||||||
n_orig_ctx=n_orig_ctx,
|
n_orig_ctx = n_orig_ctx,
|
||||||
rope_finetuned=rope_finetuned,
|
rope_finetuned = rope_finetuned,
|
||||||
)
|
)
|
||||||
|
|
||||||
# LLaMA v2 70B params.json
|
# LLaMA v2 70B params.json
|
||||||
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
|
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def load_torch_params(model: LazyModel, config_path: Path) -> "Params":
|
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
|
||||||
config = json.load(open(config_path))
|
config = json.load(open(config_path))
|
||||||
|
|
||||||
n_experts = None
|
n_experts = None
|
||||||
n_experts_used = None
|
n_experts_used = None
|
||||||
f_rope_freq_base = None
|
f_rope_freq_base = None
|
||||||
|
|
||||||
@ -343,50 +290,50 @@ class Params:
|
|||||||
|
|
||||||
if config.get("moe"):
|
if config.get("moe"):
|
||||||
n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
|
n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
|
||||||
n_experts = config["moe"]["num_experts"]
|
n_experts = config["moe"]["num_experts"]
|
||||||
n_experts_used = config["moe"]["num_experts_per_tok"]
|
n_experts_used = config["moe"]["num_experts_per_tok"]
|
||||||
f_rope_freq_base = 1e6
|
f_rope_freq_base = 1e6
|
||||||
|
|
||||||
return Params(
|
return Params(
|
||||||
n_vocab=model["tok_embeddings.weight"].shape[0],
|
n_vocab = model["tok_embeddings.weight"].shape[0],
|
||||||
n_embd=config["dim"],
|
n_embd = config["dim"],
|
||||||
n_layer=config["n_layers"],
|
n_layer = config["n_layers"],
|
||||||
n_ctx=n_ctx,
|
n_ctx = n_ctx,
|
||||||
n_ff=n_ff,
|
n_ff = n_ff,
|
||||||
n_head=(n_head := config["n_heads"]),
|
n_head = (n_head := config["n_heads"]),
|
||||||
n_head_kv=config.get("n_kv_heads", n_head),
|
n_head_kv = config.get("n_kv_heads", n_head),
|
||||||
n_experts=n_experts,
|
n_experts = n_experts,
|
||||||
n_experts_used=n_experts_used,
|
n_experts_used = n_experts_used,
|
||||||
f_norm_eps=config["norm_eps"],
|
f_norm_eps = config["norm_eps"],
|
||||||
f_rope_freq_base=config.get("rope_theta", f_rope_freq_base),
|
f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def load(model_plus: ModelPlus) -> "Params":
|
def load(model_plus: ModelPlus) -> Params:
|
||||||
hf_config_path = model_plus.paths[0].parent / "config.json"
|
hf_config_path = model_plus.paths[0].parent / "config.json"
|
||||||
orig_config_path = model_plus.paths[0].parent / "params.json"
|
orig_config_path = model_plus.paths[0].parent / "params.json"
|
||||||
|
|
||||||
if hf_config_path.exists():
|
if hf_config_path.exists():
|
||||||
params = Params.load_transformers_config(model_plus.model, hf_config_path)
|
params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
|
||||||
elif orig_config_path.exists():
|
elif orig_config_path.exists():
|
||||||
params = Params.load_torch_params(model_plus.model, orig_config_path)
|
params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
|
||||||
elif model_plus.format != "none":
|
elif model_plus.format != 'none':
|
||||||
params = Params.guessed(model_plus.model)
|
params = Params.guessed(model_plus.model)
|
||||||
else:
|
else:
|
||||||
raise ValueError("Cannot guess params when model format is none")
|
raise ValueError('Cannot guess params when model format is none')
|
||||||
|
|
||||||
params.path_model = model_plus.paths[0].parent
|
params.path_model = model_plus.paths[0].parent
|
||||||
|
|
||||||
return params
|
return params
|
||||||
|
|
||||||
|
|
||||||
class BpeVocab: # GPT
|
#
|
||||||
def __init__(
|
# vocab
|
||||||
self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]
|
#
|
||||||
) -> None:
|
|
||||||
self.bpe_tokenizer = json.loads(
|
class BpeVocab:
|
||||||
open(str(fname_tokenizer), encoding="utf-8").read()
|
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
||||||
)
|
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
|
||||||
self.vocab = self.bpe_tokenizer["model"]["vocab"]
|
self.vocab = self.bpe_tokenizer["model"]["vocab"]
|
||||||
added_tokens: dict[str, int]
|
added_tokens: dict[str, int]
|
||||||
if fname_added_tokens is not None:
|
if fname_added_tokens is not None:
|
||||||
@ -394,34 +341,31 @@ class BpeVocab: # GPT
|
|||||||
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
|
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
|
||||||
else:
|
else:
|
||||||
# Fall back to trying to find the added tokens in tokenizer.json
|
# Fall back to trying to find the added tokens in tokenizer.json
|
||||||
tokenizer_json_file = fname_tokenizer.parent / "tokenizer.json"
|
tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
|
||||||
if not tokenizer_json_file.is_file():
|
if not tokenizer_json_file.is_file():
|
||||||
added_tokens = {}
|
added_tokens = {}
|
||||||
else:
|
else:
|
||||||
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
|
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
|
||||||
added_tokens = dict(
|
added_tokens = dict(
|
||||||
(item["content"], item["id"])
|
(item['content'], item['id'])
|
||||||
for item in tokenizer_json.get("added_tokens", [])
|
for item in tokenizer_json.get('added_tokens', [])
|
||||||
# Added tokens here can be duplicates of the main vocabulary.
|
# Added tokens here can be duplicates of the main vocabulary.
|
||||||
if item["content"] not in self.bpe_tokenizer
|
if item['content'] not in self.bpe_tokenizer)
|
||||||
)
|
|
||||||
|
|
||||||
vocab_size: int = len(self.vocab)
|
vocab_size: int = len(self.vocab)
|
||||||
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
||||||
actual_ids = sorted(added_tokens.values())
|
actual_ids = sorted(added_tokens.values())
|
||||||
if expected_ids != actual_ids:
|
if expected_ids != actual_ids:
|
||||||
expected_end_id = vocab_size + len(actual_ids) - 1
|
expected_end_id = vocab_size + len(actual_ids) - 1
|
||||||
raise Exception(
|
raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
|
||||||
f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}"
|
|
||||||
)
|
|
||||||
|
|
||||||
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
||||||
self.added_tokens_dict = added_tokens
|
self.added_tokens_dict = added_tokens
|
||||||
self.added_tokens_list = [text for (text, idx) in items]
|
self.added_tokens_list = [text for (text, idx) in items]
|
||||||
self.vocab_size_base: int = vocab_size
|
self.vocab_size_base: int = vocab_size
|
||||||
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
|
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
|
||||||
self.fname_tokenizer = fname_tokenizer
|
self.fname_tokenizer = fname_tokenizer
|
||||||
self.fname_added_tokens = fname_added_tokens
|
self.fname_added_tokens = fname_added_tokens
|
||||||
|
|
||||||
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
|
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
|
||||||
@ -442,10 +386,8 @@ class BpeVocab: # GPT
|
|||||||
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||||
|
|
||||||
|
|
||||||
class SentencePieceVocab: # LlaMa
|
class SentencePieceVocab:
|
||||||
def __init__(
|
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
||||||
self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]
|
|
||||||
) -> None:
|
|
||||||
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
|
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
|
||||||
added_tokens: dict[str, int]
|
added_tokens: dict[str, int]
|
||||||
if fname_added_tokens is not None:
|
if fname_added_tokens is not None:
|
||||||
@ -455,23 +397,19 @@ class SentencePieceVocab: # LlaMa
|
|||||||
|
|
||||||
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
|
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
|
||||||
|
|
||||||
new_tokens = {
|
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
|
||||||
id: piece for piece, id in added_tokens.items() if id >= vocab_size
|
|
||||||
}
|
|
||||||
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
|
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
|
||||||
actual_new_ids = sorted(new_tokens.keys())
|
actual_new_ids = sorted(new_tokens.keys())
|
||||||
|
|
||||||
if expected_new_ids != actual_new_ids:
|
if expected_new_ids != actual_new_ids:
|
||||||
raise ValueError(
|
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
|
||||||
f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Token pieces that were added to the base vocabulary.
|
# Token pieces that were added to the base vocabulary.
|
||||||
self.added_tokens_dict = added_tokens
|
self.added_tokens_dict = added_tokens
|
||||||
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
|
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
|
||||||
self.vocab_size_base = vocab_size
|
self.vocab_size_base = vocab_size
|
||||||
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
||||||
self.fname_tokenizer = fname_tokenizer
|
self.fname_tokenizer = fname_tokenizer
|
||||||
self.fname_added_tokens = fname_added_tokens
|
self.fname_added_tokens = fname_added_tokens
|
||||||
|
|
||||||
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||||
@ -512,11 +450,15 @@ class SentencePieceVocab: # LlaMa
|
|||||||
|
|
||||||
|
|
||||||
class HfVocab:
|
class HfVocab:
|
||||||
def __init__(
|
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
|
||||||
self,
|
try:
|
||||||
fname_tokenizer: Path,
|
from transformers import AutoTokenizer
|
||||||
fname_added_tokens: Optional[Path] = None,
|
except ImportError as e:
|
||||||
) -> None:
|
raise ImportError(
|
||||||
|
"To use HfVocab, please install the `transformers` package. "
|
||||||
|
"You can install it with `pip install transformers`."
|
||||||
|
) from e
|
||||||
|
|
||||||
print("fname_tokenizer:", fname_tokenizer)
|
print("fname_tokenizer:", fname_tokenizer)
|
||||||
# Allow the tokenizer to default to slow or fast versions.
|
# Allow the tokenizer to default to slow or fast versions.
|
||||||
# Explicitly set tokenizer to use local paths.
|
# Explicitly set tokenizer to use local paths.
|
||||||
@ -529,7 +471,7 @@ class HfVocab:
|
|||||||
# Initialize lists and dictionaries for added tokens
|
# Initialize lists and dictionaries for added tokens
|
||||||
self.added_tokens_list = []
|
self.added_tokens_list = []
|
||||||
self.added_tokens_dict = dict()
|
self.added_tokens_dict = dict()
|
||||||
self.added_tokens_ids = set()
|
self.added_tokens_ids = set()
|
||||||
|
|
||||||
# Process added tokens
|
# Process added tokens
|
||||||
for tok, tokidx in sorted(
|
for tok, tokidx in sorted(
|
||||||
@ -550,12 +492,12 @@ class HfVocab:
|
|||||||
|
|
||||||
# Set vocabulary sizes
|
# Set vocabulary sizes
|
||||||
self.vocab_size_base = self.tokenizer.vocab_size
|
self.vocab_size_base = self.tokenizer.vocab_size
|
||||||
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
|
||||||
|
|
||||||
self.fname_tokenizer = fname_tokenizer
|
self.fname_tokenizer = fname_tokenizer
|
||||||
self.fname_added_tokens = fname_added_tokens
|
self.fname_added_tokens = fname_added_tokens
|
||||||
|
|
||||||
def hf_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
|
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||||
reverse_vocab = {
|
reverse_vocab = {
|
||||||
id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
|
id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
|
||||||
}
|
}
|
||||||
@ -573,11 +515,9 @@ class HfVocab:
|
|||||||
token_id, self.special_ids # Reuse already stored special IDs
|
token_id, self.special_ids # Reuse already stored special IDs
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_token_type(self, token_id: int, special_ids: set) -> gguf.TokenType:
|
def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
|
||||||
# Determine token type based on whether it's a special token
|
# Determine token type based on whether it's a special token
|
||||||
return (
|
return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
|
||||||
gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
|
|
||||||
)
|
|
||||||
|
|
||||||
def get_token_score(self, token_id: int) -> float:
|
def get_token_score(self, token_id: int) -> float:
|
||||||
# Placeholder for actual logic to determine the token's score
|
# Placeholder for actual logic to determine the token's score
|
||||||
@ -589,7 +529,6 @@ class HfVocab:
|
|||||||
if text in self.specials:
|
if text in self.specials:
|
||||||
toktype = self.get_token_type(self.specials[text], self.special_ids)
|
toktype = self.get_token_type(self.specials[text], self.special_ids)
|
||||||
score = self.get_token_score(self.specials[text])
|
score = self.get_token_score(self.specials[text])
|
||||||
|
|
||||||
else:
|
else:
|
||||||
toktype = gguf.TokenType.USER_DEFINED
|
toktype = gguf.TokenType.USER_DEFINED
|
||||||
score = -1000.0
|
score = -1000.0
|
||||||
@ -783,7 +722,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
|
|||||||
else:
|
else:
|
||||||
model = merge_sharded([mp.model for mp in models_plus])
|
model = merge_sharded([mp.model for mp in models_plus])
|
||||||
|
|
||||||
return ModelPlus(model, paths, format, vocab)
|
return ModelPlus(model, paths, format, vocab) # pytype: disable=wrong-arg-types
|
||||||
|
|
||||||
|
|
||||||
def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
|
def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
|
||||||
@ -871,17 +810,13 @@ class LazyUnpickler(pickle.Unpickler):
|
|||||||
CLASSES: dict[tuple[str, str], Any] = {
|
CLASSES: dict[tuple[str, str], Any] = {
|
||||||
# getattr used here as a workaround for mypy not being smart enough to determine
|
# getattr used here as a workaround for mypy not being smart enough to determine
|
||||||
# the staticmethods have a __func__ attribute.
|
# the staticmethods have a __func__ attribute.
|
||||||
("torch._tensor", "_rebuild_from_type_v2"): getattr(
|
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
|
||||||
rebuild_from_type_v2, "__func__"
|
('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
|
||||||
),
|
('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
|
||||||
("torch._utils", "_rebuild_tensor_v2"): getattr(
|
('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
|
||||||
lazy_rebuild_tensor_v2, "__func__"
|
('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
|
||||||
),
|
('torch', 'IntStorage'): LazyStorageKind(DT_I32),
|
||||||
("torch", "BFloat16Storage"): LazyStorageKind(DT_BF16),
|
('torch', 'Tensor'): LazyTensor,
|
||||||
("torch", "HalfStorage"): LazyStorageKind(DT_F16),
|
|
||||||
("torch", "FloatStorage"): LazyStorageKind(DT_F32),
|
|
||||||
("torch", "IntStorage"): LazyStorageKind(DT_I32),
|
|
||||||
("torch", "Tensor"): LazyTensor,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def find_class(self, module: str, name: str) -> Any:
|
def find_class(self, module: str, name: str) -> Any:
|
||||||
@ -968,7 +903,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
|
|||||||
executor_class = ProcessPoolExecutor
|
executor_class = ProcessPoolExecutor
|
||||||
else:
|
else:
|
||||||
executor_class = ThreadPoolExecutor
|
executor_class = ThreadPoolExecutor
|
||||||
with executor_class(max_workers = max_workers) as executor:
|
with executor_class(max_workers=max_workers) as executor:
|
||||||
futures: list[concurrent.futures.Future[Out]] = []
|
futures: list[concurrent.futures.Future[Out]] = []
|
||||||
done = False
|
done = False
|
||||||
for _ in range(concurrency):
|
for _ in range(concurrency):
|
||||||
@ -1022,12 +957,8 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
|
|||||||
|
|
||||||
|
|
||||||
class OutputFile:
|
class OutputFile:
|
||||||
def __init__(
|
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
|
||||||
self, fname_out: Path, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE
|
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
|
||||||
) -> None:
|
|
||||||
self.gguf = gguf.GGUFWriter(
|
|
||||||
fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess
|
|
||||||
)
|
|
||||||
|
|
||||||
def add_meta_arch(self, params: Params) -> None:
|
def add_meta_arch(self, params: Params) -> None:
|
||||||
name = "LLaMA"
|
name = "LLaMA"
|
||||||
@ -1036,21 +967,16 @@ class OutputFile:
|
|||||||
if params.n_ctx == 4096:
|
if params.n_ctx == 4096:
|
||||||
name = "LLaMA v2"
|
name = "LLaMA v2"
|
||||||
elif params.path_model is not None:
|
elif params.path_model is not None:
|
||||||
name = str(params.path_model.parent).split("/")[-1]
|
name = str(params.path_model.parent).split('/')[-1]
|
||||||
|
|
||||||
self.gguf.add_name(name)
|
self.gguf.add_name (name)
|
||||||
self.gguf.add_context_length(params.n_ctx)
|
self.gguf.add_context_length (params.n_ctx)
|
||||||
self.gguf.add_embedding_length(params.n_embd)
|
self.gguf.add_embedding_length (params.n_embd)
|
||||||
self.gguf.add_block_count(params.n_layer)
|
self.gguf.add_block_count (params.n_layer)
|
||||||
self.gguf.add_feed_forward_length(params.n_ff)
|
self.gguf.add_feed_forward_length (params.n_ff)
|
||||||
self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
|
self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
|
||||||
self.gguf.add_head_count(params.n_head)
|
self.gguf.add_head_count (params.n_head)
|
||||||
self.gguf.add_head_count_kv(params.n_head_kv)
|
self.gguf.add_head_count_kv (params.n_head_kv)
|
||||||
|
|
||||||
if params.f_norm_eps is None:
|
|
||||||
raise ValueError("f_norm_eps is None")
|
|
||||||
|
|
||||||
self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
|
|
||||||
|
|
||||||
if params.n_experts:
|
if params.n_experts:
|
||||||
self.gguf.add_expert_count(params.n_experts)
|
self.gguf.add_expert_count(params.n_experts)
|
||||||
@ -1058,6 +984,11 @@ class OutputFile:
|
|||||||
if params.n_experts_used:
|
if params.n_experts_used:
|
||||||
self.gguf.add_expert_used_count(params.n_experts_used)
|
self.gguf.add_expert_used_count(params.n_experts_used)
|
||||||
|
|
||||||
|
if params.f_norm_eps:
|
||||||
|
self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
|
||||||
|
else:
|
||||||
|
raise ValueError('f_norm_eps is None')
|
||||||
|
|
||||||
if params.f_rope_freq_base is not None:
|
if params.f_rope_freq_base is not None:
|
||||||
self.gguf.add_rope_freq_base(params.f_rope_freq_base)
|
self.gguf.add_rope_freq_base(params.f_rope_freq_base)
|
||||||
|
|
||||||
@ -1089,7 +1020,7 @@ class OutputFile:
|
|||||||
|
|
||||||
return tokenizer_model
|
return tokenizer_model
|
||||||
|
|
||||||
def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]:
|
def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
|
||||||
tokens = []
|
tokens = []
|
||||||
scores = []
|
scores = []
|
||||||
toktypes = []
|
toktypes = []
|
||||||
@ -1124,14 +1055,10 @@ class OutputFile:
|
|||||||
|
|
||||||
def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
|
def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
|
||||||
n_elements = int(np.prod(tensor.shape))
|
n_elements = int(np.prod(tensor.shape))
|
||||||
raw_dtype = getattr(tensor.data_type, "ggml_type", None)
|
raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
|
||||||
data_type = (
|
data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
|
||||||
getattr(tensor.data_type, "quantized_type", None) or tensor.data_type.dtype
|
|
||||||
)
|
|
||||||
data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
|
data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
|
||||||
self.gguf.add_tensor_info(
|
self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)
|
||||||
name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype
|
|
||||||
)
|
|
||||||
|
|
||||||
def write_meta(self) -> None:
|
def write_meta(self) -> None:
|
||||||
self.gguf.write_header_to_file()
|
self.gguf.write_header_to_file()
|
||||||
@ -1145,14 +1072,10 @@ class OutputFile:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def write_vocab_only(
|
def write_vocab_only(
|
||||||
fname_out: Path,
|
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
|
||||||
params: Params,
|
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
|
||||||
vocab: Vocab,
|
|
||||||
svocab: gguf.SpecialVocab,
|
|
||||||
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
|
|
||||||
pad_vocab: bool = False,
|
|
||||||
) -> None:
|
) -> None:
|
||||||
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
|
check_vocab_size(params, vocab, pad_vocab = pad_vocab)
|
||||||
|
|
||||||
of = OutputFile(fname_out, endianess=endianess)
|
of = OutputFile(fname_out, endianess=endianess)
|
||||||
|
|
||||||
@ -1180,14 +1103,8 @@ class OutputFile:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def write_all(
|
def write_all(
|
||||||
fname_out: Path,
|
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
|
||||||
ftype: GGMLFileType,
|
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
|
||||||
params: Params,
|
|
||||||
model: LazyModel,
|
|
||||||
vocab: Vocab,
|
|
||||||
svocab: gguf.SpecialVocab,
|
|
||||||
concurrency: int = DEFAULT_CONCURRENCY,
|
|
||||||
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
|
|
||||||
pad_vocab: bool = False,
|
pad_vocab: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
|
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
|
||||||
@ -1207,26 +1124,19 @@ class OutputFile:
|
|||||||
of.write_tensor_info()
|
of.write_tensor_info()
|
||||||
|
|
||||||
# tensor data
|
# tensor data
|
||||||
ndarrays_inner = bounded_parallel_map(
|
ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
|
||||||
OutputFile.do_item, model.items(), concurrency=concurrency
|
|
||||||
)
|
|
||||||
if ftype == GGMLFileType.MostlyQ8_0:
|
if ftype == GGMLFileType.MostlyQ8_0:
|
||||||
ndarrays = bounded_parallel_map(
|
ndarrays = bounded_parallel_map(
|
||||||
OutputFile.maybe_do_quantize,
|
OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
|
||||||
ndarrays_inner,
|
|
||||||
concurrency=concurrency,
|
|
||||||
max_workers=concurrency,
|
|
||||||
use_processpool_executor=True,
|
use_processpool_executor=True,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
|
ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
|
||||||
|
|
||||||
start = time.time()
|
start = time.time()
|
||||||
for i, ((name, lazy_tensor), ndarray) in enumerate(
|
for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
|
||||||
zip(model.items(), ndarrays)
|
|
||||||
):
|
|
||||||
elapsed = time.time() - start
|
elapsed = time.time() - start
|
||||||
size = " x ".join(f"{dim:6d}" for dim in lazy_tensor.shape)
|
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
|
||||||
padi = len(str(len(model)))
|
padi = len(str(len(model)))
|
||||||
print(
|
print(
|
||||||
f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
|
f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
|
||||||
@ -1363,7 +1273,7 @@ def load_some_model(path: Path) -> ModelPlus:
|
|||||||
class VocabFactory:
|
class VocabFactory:
|
||||||
def __init__(self, path: Path):
|
def __init__(self, path: Path):
|
||||||
self.path = path
|
self.path = path
|
||||||
self.files = {
|
self.files: dict[str, Path | None] = {
|
||||||
"tokenizer.model": None,
|
"tokenizer.model": None,
|
||||||
"vocab.json": None,
|
"vocab.json": None,
|
||||||
"tokenizer.json": None,
|
"tokenizer.json": None,
|
||||||
@ -1380,24 +1290,18 @@ class VocabFactory:
|
|||||||
self.files[file] = parent_file_path
|
self.files[file] = parent_file_path
|
||||||
print(f"Found vocab files: {self.files}")
|
print(f"Found vocab files: {self.files}")
|
||||||
|
|
||||||
def _select_file(self, vocabtype: Optional[str]) -> Path:
|
def _select_file(self, vocabtype: str | None) -> Path:
|
||||||
if vocabtype in ["spm", "bpe"]:
|
if vocabtype in ["spm", "bpe"]:
|
||||||
for file_key in self.files.keys():
|
for file_key in self.files.keys():
|
||||||
if self.files[file_key]:
|
if (file := self.files[file_key]) is not None:
|
||||||
return self.files[file_key]
|
return file
|
||||||
raise FileNotFoundError(f"{vocabtype} vocab not found.")
|
raise FileNotFoundError(f"{vocabtype} vocab not found.")
|
||||||
elif vocabtype == "hfft":
|
if vocabtype == "hfft":
|
||||||
# For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
|
# For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
|
||||||
return self.path
|
return self.path
|
||||||
else:
|
raise ValueError(f"Unsupported vocabulary type {vocabtype}")
|
||||||
raise ValueError(f"Unsupported vocabulary type {vocabtype}")
|
|
||||||
|
|
||||||
def _create_special_vocab(
|
def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
|
||||||
self,
|
|
||||||
vocab: Vocab,
|
|
||||||
vocabtype: str,
|
|
||||||
model_parent_path: Path,
|
|
||||||
) -> gguf.SpecialVocab:
|
|
||||||
load_merges = vocabtype == "bpe"
|
load_merges = vocabtype == "bpe"
|
||||||
n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
|
n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
|
||||||
return gguf.SpecialVocab(
|
return gguf.SpecialVocab(
|
||||||
@ -1407,13 +1311,12 @@ class VocabFactory:
|
|||||||
n_vocab=n_vocab,
|
n_vocab=n_vocab,
|
||||||
)
|
)
|
||||||
|
|
||||||
def load_vocab(
|
def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
|
||||||
self, vocabtype: str, model_parent_path: Path
|
|
||||||
) -> Tuple[Vocab, gguf.SpecialVocab]:
|
|
||||||
path = self._select_file(vocabtype)
|
path = self._select_file(vocabtype)
|
||||||
print(f"Loading vocab file '{path}', type '{vocabtype}'")
|
print(f"Loading vocab file '{path}', type '{vocabtype}'")
|
||||||
|
|
||||||
added_tokens_path = path.parent / "added_tokens.json"
|
added_tokens_path = path.parent / "added_tokens.json"
|
||||||
|
vocab: Vocab
|
||||||
if vocabtype == "bpe":
|
if vocabtype == "bpe":
|
||||||
vocab = BpeVocab(
|
vocab = BpeVocab(
|
||||||
path, added_tokens_path if added_tokens_path.exists() else None
|
path, added_tokens_path if added_tokens_path.exists() else None
|
||||||
@ -1428,6 +1331,7 @@ class VocabFactory:
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported vocabulary type {vocabtype}")
|
raise ValueError(f"Unsupported vocabulary type {vocabtype}")
|
||||||
|
# FIXME: Respect --vocab-dir?
|
||||||
special_vocab = self._create_special_vocab(
|
special_vocab = self._create_special_vocab(
|
||||||
vocab,
|
vocab,
|
||||||
vocabtype,
|
vocabtype,
|
||||||
@ -1436,18 +1340,17 @@ class VocabFactory:
|
|||||||
return vocab, special_vocab
|
return vocab, special_vocab
|
||||||
|
|
||||||
|
|
||||||
def default_output_file(model_paths: list[Path], file_type: GGMLFileType) -> Path:
|
def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
|
||||||
namestr = {
|
namestr = {
|
||||||
GGMLFileType.AllF32: "f32",
|
GGMLFileType.AllF32: "f32",
|
||||||
GGMLFileType.MostlyF16: "f16",
|
GGMLFileType.MostlyF16: "f16",
|
||||||
GGMLFileType.MostlyQ8_0: "q8_0",
|
GGMLFileType.MostlyQ8_0:"q8_0",
|
||||||
}[file_type]
|
}[file_type]
|
||||||
ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
|
ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
|
||||||
if ret in model_paths:
|
if ret in model_paths:
|
||||||
sys.stderr.write(
|
sys.stderr.write(
|
||||||
f"Error: Default output path ({ret}) would overwrite the input. "
|
f"Error: Default output path ({ret}) would overwrite the input. "
|
||||||
"Please explicitly specify a path using --outfile.\n"
|
"Please explicitly specify a path using --outfile.\n")
|
||||||
)
|
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
@ -1457,111 +1360,34 @@ def do_dump_model(model_plus: ModelPlus) -> None:
|
|||||||
print(f"model_plus.format = {model_plus.format!r}")
|
print(f"model_plus.format = {model_plus.format!r}")
|
||||||
print(f"model_plus.vocab = {model_plus.vocab!r}")
|
print(f"model_plus.vocab = {model_plus.vocab!r}")
|
||||||
for name, lazy_tensor in model_plus.model.items():
|
for name, lazy_tensor in model_plus.model.items():
|
||||||
print(
|
print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
|
||||||
f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_argument_parser() -> ArgumentParser:
|
def main(args_in: list[str] | None = None) -> None:
|
||||||
output_choices = ["f32", "f16"]
|
output_choices = ["f32", "f16"]
|
||||||
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
|
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
|
||||||
# We currently only support Q8_0 output on little endian systems.
|
# We currently only support Q8_0 output on little endian systems.
|
||||||
output_choices.append("q8_0")
|
output_choices.append("q8_0")
|
||||||
|
vocab_types = ["spm", "bpe", "hfft"]
|
||||||
|
parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
|
||||||
|
parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
|
||||||
|
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
|
||||||
|
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
|
||||||
|
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||||
|
parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
|
||||||
|
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
|
||||||
|
parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
|
||||||
|
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||||
|
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
|
||||||
|
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
|
||||||
|
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
|
||||||
|
parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
|
||||||
|
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(
|
args = parser.parse_args(args_in)
|
||||||
description="Convert a LLaMa model to a GGML compatible file"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"model",
|
|
||||||
type=Path,
|
|
||||||
help="Directory containing the model file or the model file itself (*.pth, *.pt, *.bin)",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--awq-path",
|
|
||||||
type=Path,
|
|
||||||
help="Path to the Activation-aware Weight Quantization cache file",
|
|
||||||
default=None,
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--dump",
|
|
||||||
action="store_true",
|
|
||||||
help="Display the model content without converting it",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--dump-single",
|
|
||||||
action="store_true",
|
|
||||||
help="Display the content of a single model file without conversion",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--vocab-only",
|
|
||||||
action="store_true",
|
|
||||||
help="Extract and output only the vocabulary",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--outtype",
|
|
||||||
choices=output_choices,
|
|
||||||
help="Output format - note: q8_0 may be very slow (default: f16 or f32 based on input)",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--vocab-dir",
|
|
||||||
type=Path,
|
|
||||||
help="Directory containing the tokenizer.model, if separate from the model file",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--vocab-type",
|
|
||||||
choices=["spm", "bpe", "hfft"], # hfft: Hugging Face Fast Tokenizer
|
|
||||||
default="spm",
|
|
||||||
help="The vocabulary format used to define the tokenizer model (default: spm)",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--pad-vocab",
|
|
||||||
action="store_true",
|
|
||||||
help="Add padding tokens when the model's vocabulary size exceeds the tokenizer metadata",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--outfile",
|
|
||||||
type=Path,
|
|
||||||
help="Specify the path for the output file (default is based on input)",
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--ctx", type=int, help="Model training context (default is based on input)"
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--concurrency",
|
|
||||||
type=int,
|
|
||||||
help=f"Concurrency used for conversion (default: {DEFAULT_CONCURRENCY})",
|
|
||||||
default=DEFAULT_CONCURRENCY,
|
|
||||||
)
|
|
||||||
|
|
||||||
parser.add_argument(
|
|
||||||
"--big-endian",
|
|
||||||
action="store_true",
|
|
||||||
help="Indicate that the model is executed on a big-endian machine",
|
|
||||||
)
|
|
||||||
|
|
||||||
return parser
|
|
||||||
|
|
||||||
|
|
||||||
def main(argv: Optional[list[str]] = None) -> None:
|
|
||||||
parser = get_argument_parser()
|
|
||||||
args = parser.parse_args(argv)
|
|
||||||
|
|
||||||
if args.awq_path:
|
if args.awq_path:
|
||||||
sys.path.insert(1, str(Path(__file__).resolve().parent / "awq-py"))
|
sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
|
||||||
from awq.apply_awq import add_scale_weights
|
from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
|
||||||
|
|
||||||
tmp_model_path = args.model / "weighted_model"
|
tmp_model_path = args.model / "weighted_model"
|
||||||
if tmp_model_path.is_dir():
|
if tmp_model_path.is_dir():
|
||||||
print(f"{tmp_model_path} exists as a weighted model.")
|
print(f"{tmp_model_path} exists as a weighted model.")
|
||||||
@ -1580,14 +1406,11 @@ def main(argv: Optional[list[str]] = None) -> None:
|
|||||||
if not args.vocab_only:
|
if not args.vocab_only:
|
||||||
model_plus = load_some_model(args.model)
|
model_plus = load_some_model(args.model)
|
||||||
else:
|
else:
|
||||||
model_plus = ModelPlus(
|
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
|
||||||
model={}, paths=[args.model / "dummy"], format="none", vocab=None
|
|
||||||
)
|
|
||||||
|
|
||||||
if args.dump:
|
if args.dump:
|
||||||
do_dump_model(model_plus)
|
do_dump_model(model_plus)
|
||||||
return
|
return
|
||||||
|
|
||||||
endianess = gguf.GGUFEndian.LITTLE
|
endianess = gguf.GGUFEndian.LITTLE
|
||||||
if args.big_endian:
|
if args.big_endian:
|
||||||
endianess = gguf.GGUFEndian.BIG
|
endianess = gguf.GGUFEndian.BIG
|
||||||
@ -1595,12 +1418,10 @@ def main(argv: Optional[list[str]] = None) -> None:
|
|||||||
params = Params.load(model_plus)
|
params = Params.load(model_plus)
|
||||||
if params.n_ctx == -1:
|
if params.n_ctx == -1:
|
||||||
if args.ctx is None:
|
if args.ctx is None:
|
||||||
raise Exception(
|
raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
|
||||||
"The model doesn't have a context size, and you didn't specify one with --ctx\n"
|
"Please specify one with --ctx:\n"
|
||||||
"Please specify one with --ctx:\n"
|
" - LLaMA v1: --ctx 2048\n"
|
||||||
" - LLaMA v1: --ctx 2048\n"
|
" - LLaMA v2: --ctx 4096\n")
|
||||||
" - LLaMA v2: --ctx 4096\n"
|
|
||||||
)
|
|
||||||
params.n_ctx = args.ctx
|
params.n_ctx = args.ctx
|
||||||
|
|
||||||
if args.outtype:
|
if args.outtype:
|
||||||
@ -1621,42 +1442,30 @@ def main(argv: Optional[list[str]] = None) -> None:
|
|||||||
if not args.outfile:
|
if not args.outfile:
|
||||||
raise ValueError("need --outfile if using --vocab-only")
|
raise ValueError("need --outfile if using --vocab-only")
|
||||||
outfile = args.outfile
|
outfile = args.outfile
|
||||||
OutputFile.write_vocab_only(
|
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
|
||||||
outfile,
|
endianess=endianess, pad_vocab=args.pad_vocab)
|
||||||
params,
|
|
||||||
vocab,
|
|
||||||
special_vocab,
|
|
||||||
endianess=endianess,
|
|
||||||
pad_vocab=args.pad_vocab,
|
|
||||||
)
|
|
||||||
print(f"Wrote {outfile}")
|
print(f"Wrote {outfile}")
|
||||||
return
|
return
|
||||||
|
|
||||||
if model_plus.vocab is not None and args.vocab_dir is None:
|
if model_plus.vocab is not None and args.vocab_dir is None:
|
||||||
vocab = model_plus.vocab
|
vocab = model_plus.vocab
|
||||||
|
|
||||||
model = model_plus.model
|
print(f"Vocab info: {vocab}")
|
||||||
model = convert_model_names(model, params)
|
print(f"Special vocab info: {special_vocab}")
|
||||||
ftype = pick_output_type(model, args.outtype)
|
|
||||||
model = convert_to_output_type(model, ftype)
|
model = model_plus.model
|
||||||
outfile = args.outfile or default_output_file(model_plus.paths, ftype)
|
model = convert_model_names(model, params)
|
||||||
|
ftype = pick_output_type(model, args.outtype)
|
||||||
|
model = convert_to_output_type(model, ftype)
|
||||||
|
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
|
||||||
|
|
||||||
params.ftype = ftype
|
params.ftype = ftype
|
||||||
print(f"Writing {outfile}, format {ftype}")
|
print(f"Writing {outfile}, format {ftype}")
|
||||||
|
|
||||||
OutputFile.write_all(
|
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
|
||||||
outfile,
|
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
|
||||||
ftype,
|
|
||||||
params,
|
|
||||||
model,
|
|
||||||
vocab,
|
|
||||||
special_vocab,
|
|
||||||
concurrency=args.concurrency,
|
|
||||||
endianess=endianess,
|
|
||||||
pad_vocab=args.pad_vocab,
|
|
||||||
)
|
|
||||||
print(f"Wrote {outfile}")
|
print(f"Wrote {outfile}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == '__main__':
|
||||||
main(sys.argv[1:]) # Exclude the first element (script name) from sys.argv
|
main()
|
||||||
|
Loading…
Reference in New Issue
Block a user