convert-gptneox-h5-to-gguf.py : load model in parts to save memory

This commit is contained in:
klosax 2023-08-13 12:18:34 +02:00 committed by GitHub
parent 9bf5a7efcb
commit e3d1f07eb1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,4 +1,4 @@
# Quick and dirty HF gptneox--> gguf conversion # HF gptneox--> gguf conversion
import gguf import gguf
import gguf_tensor_map as tmap import gguf_tensor_map as tmap
@ -9,7 +9,8 @@ import json
import numpy as np import numpy as np
from typing import Any, List from typing import Any, List
from pathlib import Path from pathlib import Path
from transformers import AutoTokenizer, AutoModelForCausalLM import torch
from transformers import AutoTokenizer
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode(): def bytes_to_unicode():
@ -33,6 +34,15 @@ def bytes_to_unicode():
cs = [chr(n) for n in cs] cs = [chr(n) for n in cs]
return dict(zip(bs, cs)) return dict(zip(bs, cs))
def count_model_parts(dir_model: str) -> int:
    """Count the files in *dir_model* whose name begins with "pytorch_model-".

    Prints a one-line summary when at least one such file is found and
    returns the count (0 when there are none).
    """
    part_count = sum(
        1 for entry in os.listdir(dir_model) if entry.startswith("pytorch_model-")
    )

    if part_count:
        print(f"gguf: found {part_count} model parts")
    return part_count
if len(sys.argv) < 3: if len(sys.argv) < 3:
print("Usage: convert-h5-to-ggml.py dir-model ftype\n") print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
@ -70,9 +80,8 @@ if hparams["architectures"][0] != "GPTNeoXForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0] ) print("Model architecture not supported: " + hparams["architectures"][0] )
sys.exit() sys.exit()
# get number of model parts
model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True) num_parts = count_model_parts(dir_model)
list_vars = model.state_dict()
gguf_writer = gguf.GGUFWriter.open(fname_out) gguf_writer = gguf.GGUFWriter.open(fname_out)
@ -183,37 +192,58 @@ tensor_map = tmap.get_tensor_map(block_count)
# tensor info # tensor info
print("gguf: get tensor metadata") print("gguf: get tensor metadata")
for name in list_vars.keys(): if num_parts == 0:
data = list_vars[name].squeeze().numpy() part_names = ("pytorch_model.bin",)
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
# we don't need these for part_name in part_names:
if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"): print("gguf: loading model part '"+ part_name + "'")
continue model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
# map tensor names for name in model_part.keys():
if name.endswith(".weight") and name[:-7] in tensor_map: data = model_part[name]
name = tensor_map[name[:-7]] + ".weight"
elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias"
else:
print( "Can not map tensor '" + name + "'" )
sys.exit()
n_dims = len(data.shape) # we don't need these
data_dtype = data.dtype if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
continue
# print( name + " dims " + str(n_dims) + " dtype " + str(data.dtype) )
if data.dtype != np.float16 and data.dtype != np.float32:
# convert any unsupported data types to float32 # convert any unsupported data types to float32
data_dtype = np.float32 if data.dtype != torch.float16 and data.dtype != torch.float32:
elif ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
if name.endswith(".weight") and name[:-7] in tensor_map:
name = tensor_map[name[:-7]] + ".weight"
elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias"
else:
print( "Can not map tensor '" + name + "'" )
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data.dtype == np.float16:
data_dtype = np.float32
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
if ftype == 1 and data.dtype == np.float16 and n_dims == 1:
data_dtype = np.float32
# if f16 desired, convert any float32 2-dim weight tensors to float16 # if f16 desired, convert any float32 2-dim weight tensors to float16
data_dtype = np.float16 if ftype == 1 and data.dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data_dtype = np.float16
data_nbytes = data.size * 2 if data_dtype == np.float16 else data.size * 4 data_nbytes = data.size * 2 if data_dtype == np.float16 else data.size * 4
gguf_writer.add_tensor_info(name, data.shape, data_dtype, data_nbytes)
gguf_writer.add_tensor_info(name, data.shape, data_dtype, data_nbytes)
print("gguf: write header") print("gguf: write header")
gguf_writer.write_header_to_file() gguf_writer.write_header_to_file()
@ -225,24 +255,59 @@ gguf_writer.write_ti_data_to_file()
# tensor data # tensor data
print("gguf: convert and write tensor data") print("gguf: convert and write tensor data")
for name in list_vars.keys(): if num_parts == 0:
data = list_vars[name].squeeze().numpy() part_names = ("pytorch_model.bin",)
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
# we don't need these for part_name in part_names:
if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"): print("gguf: loading model part '"+ part_name + "'")
continue model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
n_dims = len(data.shape) for name in model_part.keys():
data_dtype = data.dtype data = model_part[name]
old_dtype = data.dtype
# we don't need these
if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
continue
if data_dtype != np.float16 and data_dtype != np.float32:
# convert any unsupported data types to float32 # convert any unsupported data types to float32
data = data.astype(np.float32) if data.dtype != torch.float16 and data.dtype != torch.float32:
elif ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.to(torch.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
data = data.astype(np.float16)
gguf_writer.write_tensor_to_file(data) data = data.squeeze().numpy()
# map tensor names
if name.endswith(".weight") and name[:-7] in tensor_map:
name = tensor_map[name[:-7]] + ".weight"
elif name.endswith(".bias") and name[:-5] in tensor_map:
name = tensor_map[name[:-5]] + ".bias"
else:
print( "Can not map tensor '" + name + "'" )
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data.dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print( name + ", shape " + str(len(data.shape)) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.write_tensor_to_file(data)
gguf_writer.close() gguf_writer.close()