Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-25 19:04:35 +00:00)
ci : add flake8 to github actions (python linting) (#4129)
Disabled rules:

* E203 Whitespace before ':' - disabled because we often use 'C' style where values are aligned
* E211 Whitespace before '(' - disabled because we often use 'C' style where values are aligned
* E221 Multiple spaces before operator - disabled because we often use 'C' style where values are aligned
* E225 Missing whitespace around operator - disabled because it's broken so often it seems like a standard
* E231 Missing whitespace after ',', ';', or ':' - disabled because we often use 'C' style where values are aligned
* E241 Multiple spaces after ',' - disabled because we often use 'C' style where values are aligned
* E251 Unexpected spaces around keyword / parameter equals - disabled because it's broken so often it seems like a standard
* E261 At least two spaces before inline comment - disabled because it's broken so often it seems like a standard
* E266 Too many leading '#' for block comment - sometimes used as "section" separator
* E501 Line too long - disabled because it's broken so often it seems like a standard
* E701 Multiple statements on one line (colon) - broken only in convert.py when defining abstract methods (we can use # noqa instead)
* E704 Multiple statements on one line (def) - broken only in convert.py when defining abstract methods (we can use # noqa instead)
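As a rough illustration (a hypothetical snippet, not taken from the repository), this is the kind of aligned, 'C'-style layout the ignore list tolerates, plus the per-line # noqa escape hatch mentioned for E701/E704:

# Hypothetical example only: aligned assignments trip E221/E241/E251 by design.
ROPE_PCT    = 1.0       # E221: extra spaces before '=' keep the values aligned
HIDDEN_SIZE = 4096
N_HEAD      = 32


def rope_dims(pct = ROPE_PCT, hidden = HIDDEN_SIZE, heads = N_HEAD):  # E251: spaces around keyword defaults
    return int(pct * (hidden // heads))


# E701/E704 can also be silenced per line instead of repo-wide:
def embd_per_head(): return HIDDEN_SIZE // N_HEAD  # noqa: E704


print(rope_dims(), embd_per_head())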
This commit is contained in:
parent 40a34fe8d0
commit f23c0359a3

.github/workflows/python-lint.yml (vendored, new file, 20 lines added)
@@ -0,0 +1,20 @@
+name: flake8 Lint
+
+on: [push, pull_request]
+
+jobs:
+  flake8-lint:
+    runs-on: ubuntu-latest
+    name: Lint
+    steps:
+    - name: Check out source repository
+      uses: actions/checkout@v3
+    - name: Set up Python environment
+      uses: actions/setup-python@v4
+      with:
+        python-version: "3.11"
+    - name: flake8 Lint
+      uses: py-actions/flake8@v2
+      with:
+        ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704"
+        exclude: "examples/*,examples/*/**,*/**/__init__.py"
@@ -827,13 +827,14 @@ class StableLMModel(Model):
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"]*(hparams["hidden_size"] // hparams["num_attention_heads"])))
+        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
         self.gguf_writer.add_head_count(hparams["num_attention_heads"])
         self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
         self.gguf_writer.add_layer_norm_eps(1e-5)
 
 ###### CONVERSION LOGIC ######
 
+
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file")
     parser.add_argument(
@@ -14,11 +14,13 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 
+
 class GGMLFormat(IntEnum):
     GGML = 0
     GGMF = 1
     GGJT = 2
 
+
 class GGMLFType(IntEnum):
     ALL_F32 = 0
     MOSTLY_F16 = 1
@@ -38,6 +40,7 @@ class GGMLFType(IntEnum):
     MOSTLY_Q5_K_M = 17
     MOSTLY_Q6_K = 18
 
+
 class Hyperparameters:
     def __init__(self):
         self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
@@ -69,6 +72,7 @@ class Hyperparameters:
     def __str__(self):
         return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
 
+
 class Vocab:
     def __init__(self, load_scores = True):
         self.items = []
@@ -90,6 +94,7 @@ class Vocab:
             self.items.append((item_text, item_score))
         return offset - orig_offset
 
+
 class Tensor:
     def __init__(self, use_padding = True):
         self.name = None
@@ -123,6 +128,7 @@ class Tensor:
         # print(n_dims, name_len, dtype, self.dims, self.name, pad)
         return offset - orig_offset
 
+
 class GGMLModel:
     def __init__(self):
         self.hyperparameters = None
@@ -159,7 +165,7 @@ class GGMLModel:
             if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
                 err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
         elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
-            if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
+            if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
                          GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
                 err = 'Q4 and Q8 quantizations changed in GGJTv3.'
         if len(err) > 0:
@@ -187,6 +193,7 @@ class GGMLModel:
         hp.set_n_ff(self)
         return offset
 
+
 class GGMLToGGUF:
     def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
         hp = ggml_model.hyperparameters
@@ -217,7 +224,7 @@ class GGMLToGGUF:
         gguf_writer = gguf.GGUFWriter(
             self.cfg.output,
             gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
-            use_temp_file = False )
+            use_temp_file = False)
         self.add_params(gguf_writer)
         self.add_vocab(gguf_writer)
         if self.special_vocab is not None:
@@ -341,7 +348,8 @@ class GGMLToGGUF:
                 mapped_name,
                 data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
                 raw_shape = tempdims,
-                raw_dtype = tensor.dtype )
+                raw_dtype = tensor.dtype)
 
+
 def handle_metadata(cfg, hp):
     import convert
@@ -365,7 +373,7 @@ def handle_metadata(cfg, hp):
         raise ValueError('Unable to load metadata')
     vocab = convert.load_vocab(
         cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
-        cfg.vocabtype )
+        cfg.vocabtype)
     # FIXME: Respect cfg.vocab_dir?
     svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
         load_merges = cfg.vocabtype == 'bpe',
@@ -373,6 +381,7 @@ def handle_metadata(cfg, hp):
     convert.check_vocab_size(params, vocab)
     return (params, vocab, svocab)
 
+
 def handle_args():
     parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
     parser.add_argument('--input', '-i', type = Path, required = True,
@@ -397,6 +406,7 @@ def handle_args():
         help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
     return parser.parse_args()
 
+
 def main():
     cfg = handle_args()
     print(f'* Using config: {cfg}')
@@ -406,7 +416,7 @@ def main():
     data = np.memmap(cfg.input, mode = 'r')
     model = GGMLModel()
     print('* Scanning GGML input file')
-    offset = model.load(data, 0)
+    offset = model.load(data, 0)  # noqa
     print(f'* GGML model hyperparameters: {model.hyperparameters}')
     vocab_override = None
     params_override = None
@@ -421,12 +431,15 @@ def main():
         print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
     if model.file_format == GGMLFormat.GGML:
         print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
-    converter = GGMLToGGUF(model, data, cfg,
+    converter = GGMLToGGUF(
+        model, data, cfg,
         params_override = params_override,
         vocab_override = vocab_override,
-        special_vocab = special_vocab )
+        special_vocab = special_vocab
+    )
     converter.save()
     print(f'* Successful completion. Output saved to: {cfg.output}')
 
+
 if __name__ == '__main__':
     main()
@@ -9,6 +9,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 
+
 def _flatten_dict(dct, tensors, prefix=None):
     assert isinstance(dct, dict)
     for key in dct.keys():
@@ -21,6 +22,7 @@ def _flatten_dict(dct, tensors, prefix=None):
             raise ValueError(type(dct[key]))
     return None
 
+
 def _get_sentencepiece_tokenizer_info(dir_model: Path):
     tokenizer_path = dir_model / 'adept_vocab.model'
     print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
@@ -54,6 +56,7 @@ def _get_sentencepiece_tokenizer_info(dir_model: Path):
             pass
     return tokens, scores, toktypes
 
+
 def main():
     parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
     parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
@@ -125,6 +128,5 @@ def main():
     print("")
 
 
-
 if __name__ == '__main__':
     main()
convert.py (Executable file → Normal file, 40 lines changed)
@@ -46,6 +46,7 @@ DEFAULT_CONCURRENCY = 8
 # data types
 #
 
+
 @dataclass(frozen=True)
 class DataType:
     name: str
@@ -55,15 +56,18 @@ class DataType:
     def elements_to_bytes(self, n_elements: int) -> int:
         return n_elements * self.dtype.itemsize
 
+
 @dataclass(frozen=True)
 class UnquantizedDataType(DataType):
     pass
 
+
 DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
 DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
 DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
 DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])
 
+
 @dataclass(frozen=True)
 class QuantizedDataType(DataType):
     block_size: int
@@ -77,6 +81,7 @@ class QuantizedDataType(DataType):
         assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
         return self.quantized_dtype.itemsize * (n_elements // self.block_size)
 
+
 @dataclass(frozen=True)
 class Q8_0QuantizedDataType(QuantizedDataType):
     # Mini Q8_0 quantization in Python!
@@ -86,6 +91,7 @@ class Q8_0QuantizedDataType(QuantizedDataType):
         n_blocks = arr.size // self.block_size
         blocks = arr.reshape((n_blocks, self.block_size))
         # Much faster implementation of block quantization contributed by @Cebtenzzre
+
         def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
             d = abs(blocks).max(axis = 1) / np.float32(127)
             with np.errstate(divide = 'ignore'):
@@ -94,6 +100,7 @@ class Q8_0QuantizedDataType(QuantizedDataType):
             yield from zip(d, qs)
         return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)
 
+
 DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
     dtype = np.dtype(np.float32), valid_conversions = [],
     ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
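For context on the Q8_0 code these hunks touch, here is a minimal, self-contained sketch of the block-quantization idea (one float scale per 32-value block, values rounded to int8); it is a simplification written for illustration, not the repository's exact implementation:

import numpy as np

BLOCK_SIZE = 32  # Q8_0 groups values into blocks of 32


def quantize_q8_0_sketch(arr):
    # Per block: scale d = max(|x|) / 127, then store round(x / d) as int8.
    blocks = np.asarray(arr, dtype=np.float32).reshape((-1, BLOCK_SIZE))
    d = np.abs(blocks).max(axis=1) / np.float32(127)
    with np.errstate(divide='ignore', invalid='ignore'):
        qs = np.where(d[:, None] > 0, np.round(blocks / d[:, None]), 0).astype(np.int8)
    return d, qs  # one scale per block plus the quantized values


d, qs = quantize_q8_0_sketch(np.linspace(-1.0, 1.0, 64))
print(d.shape, qs.shape)  # (2,) (2, 32)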
@@ -116,6 +123,8 @@ SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
 # TODO: match this with `llama_ftype`
 # TODO: rename to LLAMAFileType
 # TODO: move to `gguf.py`
+
+
 class GGMLFileType(enum.IntEnum):
     AllF32 = 0
     MostlyF16 = 1  # except 1d tensors
@@ -128,6 +137,7 @@ class GGMLFileType(enum.IntEnum):
         # 1D tensors are always F32.
         return dt if len(tensor.shape) > 1 else DT_F32
 
+
 GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
     GGMLFileType.AllF32 : DT_F32,
     GGMLFileType.MostlyF16 : DT_F16,
@@ -138,6 +148,7 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
 # hparams loading
 #
 
+
 @dataclass
 class Params:
     n_vocab: int
@@ -167,11 +178,11 @@ class Params:
 
         # try transformer naming first
         if "model.layers.0.self_attn.q_proj.weight" in model:
-            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
         elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
-            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
+            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
         else:
-            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+            n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
 
         if n_layer < 1:
             raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
@@ -308,7 +319,7 @@ class BpeVocab:
             (item['content'], item['id'])
             for item in tokenizer_json.get('added_tokens', [])
             # Added tokens here can be duplicates of the main vocabulary.
-            if item['content'] not in self.bpe_tokenizer )
+            if item['content'] not in self.bpe_tokenizer)
 
         vocab_size: int = len(self.bpe_tokenizer)
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
@@ -326,7 +337,6 @@ class BpeVocab:
 
     def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         tokenizer = self.bpe_tokenizer
-        from transformers.models.gpt2 import tokenization_gpt2
         reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
 
         for i, _ in enumerate(tokenizer):
@@ -406,6 +416,7 @@ class SentencePieceVocab:
     def __repr__(self) -> str:
         return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
+
 Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
 
 #
@@ -413,8 +424,9 @@ Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
 # TODO: reuse (probably move to gguf.py?)
 #
 
+
 def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
-    #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
+    # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
     if n_head_kv is not None and n_head != n_head_kv:
         n_head = n_head_kv
     return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
@@ -588,6 +600,7 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
         return lazy_tensor.load().permute(n_head, n_head_kv)
     return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
 
+
 def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
     def load() -> Tensor:
         return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
@@ -595,6 +608,7 @@ def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
     s[0] = s[0] // 3
     return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
 
+
 def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
     def load() -> Tensor:
         return lazy_tensor.load().part(n_part)
@@ -744,6 +758,7 @@ def lazy_load_file(path: Path) -> ModelPlus:
 In = TypeVar('In')
 Out = TypeVar('Out')
 
+
 def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
     '''Parallel map, but with backpressure. If the caller doesn't call `next`
     fast enough, this will stop calling `func` at some point rather than
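The docstring above describes a parallel map with backpressure. A minimal sketch of that idea, using a thread pool and a fixed window of in-flight futures, is shown below; it is an illustration under those assumptions, not the function's actual implementation:

import concurrent.futures
import itertools
from typing import Callable, Iterable, Iterator, TypeVar

In = TypeVar('In')
Out = TypeVar('Out')


def bounded_parallel_map_sketch(func: Callable[[In], Out], items: Iterable[In], concurrency: int) -> Iterator[Out]:
    # Keep at most `concurrency` tasks in flight; new work is only submitted
    # when the caller consumes a result, which provides the backpressure.
    it = iter(items)
    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as pool:
        futures = [pool.submit(func, x) for x in itertools.islice(it, concurrency)]
        while futures:
            yield futures.pop(0).result()  # hand back the oldest result first
            futures.extend(pool.submit(func, x) for x in itertools.islice(it, 1))


print(list(bounded_parallel_map_sketch(lambda x: x * x, range(8), concurrency=2)))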
@@ -778,6 +793,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
             break
         yield result
 
+
 def check_vocab_size(params: Params, vocab: Vocab) -> None:
     if params.n_vocab != vocab.vocab_size:
         assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
@@ -796,7 +812,7 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:
 
 
 class OutputFile:
-    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
+    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
         self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
 
     def add_meta_arch(self, params: Params) -> None:
@@ -876,7 +892,7 @@ class OutputFile:
         self.gguf.close()
 
     @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
+    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
         check_vocab_size(params, vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
@@ -938,8 +954,9 @@ class OutputFile:
 
         of.close()
 
+
 def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
-    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
+    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
 
     if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
         return GGMLFileType.AllF32
@@ -952,10 +969,12 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
 
     raise Exception(f"Unexpected combination of types: {name_to_type}")
 
+
 def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
     return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
             for (name, tensor) in model.items()}
 
+
 def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
     tmap = gguf.TensorNameMap(ARCH, params.n_layer)
     should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
@@ -968,7 +987,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
             print(f"Permuting layer {i}")
             tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
             tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
-            #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
+            # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
         elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
             print(f"Unpacking and permuting layer {i}")
             tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
@@ -993,6 +1012,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
 
     return out
 
+
 def nth_multifile_path(path: Path, n: int) -> Path | None:
     '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
     the nth path in the model.
@@ -221,7 +221,7 @@ class GGUFWriter:
         if self.endianess == GGUFEndian.BIG:
             tensor.byteswap(inplace=True)
         if self.use_temp_file and self.temp_file is None:
-            fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
+            fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256 * 1024 * 1024)
             fp.seek(0)
             self.temp_file = fp
 
@@ -41,7 +41,7 @@ tests = [
     " Hello\n Hello",
     "\n =",
     "' era",
 ]
 
 for text in tests:
     print('text: ', text)
@@ -39,7 +39,7 @@ tests = [
     " Hello",
     " Hello",
     " Hello\n Hello",
 ]
 
 
 for text in tests: