convert : automatically fall back to HfVocab if needed

This commit is contained in:
Jared Van Bortel 2024-03-01 12:08:54 -05:00
parent e743386728
commit 17d22efa40
2 changed files with 36 additions and 40 deletions

View File

@@ -373,7 +373,7 @@ def handle_metadata(cfg, hp):
raise ValueError('Unable to load metadata') raise ValueError('Unable to load metadata')
vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir) vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
vocab_factory = convert.VocabFactory(vocab_path) vocab_factory = convert.VocabFactory(vocab_path)
vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir) vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype.split(","), cfg.model_metadata_dir)
convert.check_vocab_size(params, vocab) convert.check_vocab_size(params, vocab)
return params, vocab, special_vocab return params, vocab, special_vocab
@@ -398,8 +398,8 @@ def handle_args():
help ='Load HuggingFace/.pth vocab and metadata from the specified directory') help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
parser.add_argument("--vocab-dir", type=Path, parser.add_argument("--vocab-dir", type=Path,
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm", parser.add_argument("--vocabtype", default="spm,hfft",
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)") help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
return parser.parse_args() return parser.parse_args()

View File

@@ -1282,35 +1282,32 @@ def load_some_model(path: Path) -> ModelPlus:
class VocabFactory: class VocabFactory:
_FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"}
def __init__(self, path: Path): def __init__(self, path: Path):
self.path = path self.path = path
self.files: dict[str, Path | None] = { self.file_paths = self._detect_files()
"tokenizer.model": None, print(f"Found vocab files: {self.file_paths}")
"vocab.json": None,
"tokenizer.json": None,
}
self._detect_files()
def _detect_files(self): def _detect_files(self) -> dict[str, Path | None]:
for file in self.files.keys(): def locate(file: str) -> Path | None:
file_path = self.path / file if (path := self.path / file).exists():
parent_file_path = self.path.parent / file return path
if file_path.exists(): if (path := self.path.parent / file).exists():
self.files[file] = file_path return path
elif parent_file_path.exists(): return None
self.files[file] = parent_file_path
print(f"Found vocab files: {self.files}")
def _select_file(self, vocabtype: str | None) -> Path: return {vt: locate(f) for vt, f in self._FILES.items()}
if vocabtype in ["spm", "bpe"]:
for file_key in self.files.keys(): def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]:
if (file := self.files[file_key]) is not None: for vtype in vocab_types:
return file try:
raise FileNotFoundError(f"{vocabtype} vocab not found.") path = self.file_paths[vtype]
if vocabtype == "hfft": except KeyError:
# For Hugging Face Fast Tokenizer, return the directory path instead of a specific file raise ValueError(f"Unsupported vocabulary type {vtype}") from None
return self.path if path is not None:
raise ValueError(f"Unsupported vocabulary type {vocabtype}") return vtype, path
raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab: def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
load_merges = vocabtype == "bpe" load_merges = vocabtype == "bpe"
@@ -1322,30 +1319,30 @@ class VocabFactory:
n_vocab=n_vocab, n_vocab=n_vocab,
) )
def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]: def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
path = self._select_file(vocabtype) vocab_type, path = self._select_file(vocab_types)
print(f"Loading vocab file '{path}', type '{vocabtype}'") print(f"Loading vocab file {path!r}, type {vocab_type!r}")
added_tokens_path = path.parent / "added_tokens.json" added_tokens_path = path.parent / "added_tokens.json"
vocab: Vocab vocab: Vocab
if vocabtype == "bpe": if vocab_type == "bpe":
vocab = BpeVocab( vocab = BpeVocab(
path, added_tokens_path if added_tokens_path.exists() else None path, added_tokens_path if added_tokens_path.exists() else None
) )
elif vocabtype == "spm": elif vocab_type == "spm":
vocab = SentencePieceVocab( vocab = SentencePieceVocab(
path, added_tokens_path if added_tokens_path.exists() else None path, added_tokens_path if added_tokens_path.exists() else None
) )
elif vocabtype == "hfft": elif vocab_type == "hfft":
vocab = HfVocab( vocab = HfVocab(
path, added_tokens_path if added_tokens_path.exists() else None path.parent, added_tokens_path if added_tokens_path.exists() else None
) )
else: else:
raise ValueError(f"Unsupported vocabulary type {vocabtype}") raise ValueError(vocab_type)
# FIXME: Respect --vocab-dir? # FIXME: Respect --vocab-dir?
special_vocab = self._create_special_vocab( special_vocab = self._create_special_vocab(
vocab, vocab,
vocabtype, vocab_type,
model_parent_path, model_parent_path,
) )
return vocab, special_vocab return vocab, special_vocab
@@ -1379,7 +1376,6 @@ def main(args_in: list[str] | None = None) -> None:
if np.uint32(1) == np.uint32(1).newbyteorder("<"): if np.uint32(1) == np.uint32(1).newbyteorder("<"):
# We currently only support Q8_0 output on little endian systems. # We currently only support Q8_0 output on little endian systems.
output_choices.append("q8_0") output_choices.append("q8_0")
vocab_types = ["spm", "bpe", "hfft"]
parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file") parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None) parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
@@ -1387,7 +1383,7 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm") parser.add_argument("--vocab-type", help="The vocabulary format used to define the tokenizer model (default: spm,hfft)", default="spm,hfft")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
@@ -1448,7 +1444,7 @@ def main(args_in: list[str] | None = None) -> None:
model_parent_path = model_plus.paths[0].parent model_parent_path = model_plus.paths[0].parent
vocab_path = Path(args.vocab_dir or args.model or model_parent_path) vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
vocab_factory = VocabFactory(vocab_path) vocab_factory = VocabFactory(vocab_path)
vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type, model_parent_path) vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type.split(","), model_parent_path)
if args.vocab_only: if args.vocab_only:
if not args.outfile: if not args.outfile: