Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-11-11 13:30:35 +00:00)
py : logging and flake8 suppression refactoring (#7081)
Set one as executable and add basicConfig() to another. Also added noqa tags to the test scripts.
This commit is contained in:
parent 842500144e
commit 6fbd432211
.flake8 (15 changes)

@@ -1,4 +1,17 @@
 [flake8]
 max-line-length = 125
 ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
-exclude = examples/*,examples/*/**,*/**/__init__.py,scripts/gen-unicode-data.py,tests/test-tokenizer-0.py
+exclude =
+    # Do not traverse examples
+    examples,
+    # Do not include package initializers
+    __init__.py,
+    # No need to traverse our git directory
+    .git,
+    # There's no value in checking cache directories
+    __pycache__,
+    # No need to include the build path
+    build,
+    # This contains builds that we don't want to check
+    dist # This is generated with `python build .` for package releases
+# max-complexity = 10
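With the file-level excludes for scripts/gen-unicode-data.py and tests/test-tokenizer-0.py gone, those scripts are now linted like everything else, and only their deliberate print() calls are silenced line by line. A minimal sketch of that per-line suppression (a hypothetical file; NP100 is assumed to be the project's "no print(), use logging" check, which is what every tagged line below suggests):

    import logging

    logger = logging.getLogger("example")


    def report(msg: str) -> None:
        # Deliberate console output: `# noqa: NP100` silences only that check on this
        # line; every other flake8 check still applies to it.
        print(msg)  # noqa: NP100
        # Anything that is not meant for stdout goes through the logger instead.
        logger.info("reported: %s", msg)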
convert-hf-to-gguf-update.py (2 changes, Normal file → Executable file)

@@ -1,3 +1,5 @@
+#!/usr/bin/env python3
+
 # This script downloads the tokenizer models of the specified models from Huggingface and
 # generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
 #
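For context, the new shebang plus the executable bit is what allows running the script directly as ./convert-hf-to-gguf-update.py instead of spelling out python3 convert-hf-to-gguf-update.py. A small illustrative check of what the mode change amounts to (not part of the commit, just a sketch run from the repository root):

    import os
    import stat

    path = "convert-hf-to-gguf-update.py"

    # The kernel uses the "#!/usr/bin/env python3" line to pick the interpreter...
    with open(path, encoding="utf-8") as f:
        assert f.readline().startswith("#!")

    # ...and the owner-execute bit is what makes "./convert-hf-to-gguf-update.py" runnable.
    mode = os.stat(path).st_mode
    assert mode & stat.S_IXUSR, oct(mode & 0o777)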
convert-lora-to-ggml.py

@@ -16,6 +16,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf
 
+logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger("lora-to-gguf")
 
 NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
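The basicConfig() call is the piece that was missing here: getLogger() only creates a named logger, and without any configured handler the logging module keeps its default WARNING threshold, so the script's debug/info output would simply be dropped. A minimal, self-contained sketch of the pattern this hunk completes (only the logger name is taken from the script):

    import logging

    logging.basicConfig(level=logging.DEBUG)    # install a root handler and lower the threshold
    logger = logging.getLogger("lora-to-gguf")  # named, module-level logger

    logger.debug("dumping adapter tensors")     # emitted now; dropped without basicConfig()
    logger.info("wrote %d tensors", 42)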
scripts/gen-unicode-data.py

@@ -41,20 +41,20 @@ def get_matches(regex_expr):
 
 
 def print_cat(cat, ranges):
-    print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat))
+    print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat)) # noqa: NP100
     cnt = 0
     for start, end in ranges:
         if cnt % 4 != 0:
-            print(" ", end="")
-        print("{{0x{:08X}, 0x{:08X}}},".format(start, end), end="")
+            print(" ", end="") # noqa: NP100
+        print("{{0x{:08X}, 0x{:08X}}},".format(start, end), end="") # noqa: NP100
         if cnt % 4 == 3:
-            print("")
+            print("") # noqa: NP100
         cnt += 1
 
     if cnt % 4 != 0:
-        print("")
-    print("};")
-    print("")
+        print("") # noqa: NP100
+    print("};") # noqa: NP100
+    print("") # noqa: NP100
 
 
 print_cat("number", get_matches(r'\p{N}'))
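The prints here are kept and merely tagged because the script's whole purpose is to emit C++ source on stdout. As a rough usage illustration, calling the function above on a made-up two-range category prints something like this (hypothetical input; output format follows the code above):

    print_cat("example", [(0x30, 0x39), (0x660, 0x669)])

    # stdout (roughly):
    #   const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_example = {
    #   {0x00000030, 0x00000039}, {0x00000660, 0x00000669},
    #   };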
tests/test-tokenizer-0.py

@@ -13,7 +13,7 @@ fname_tok = args.fname_tok
 
 tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
 
-print('tokenizing file: ', fname_tok)
+print('tokenizing file: ', fname_tok) # noqa: NP100
 fname_out = fname_tok + '.tok'
 with open(fname_tok, 'r', encoding='utf-8') as f:
     lines = f.readlines()
@@ -21,7 +21,7 @@ with open(fname_tok, 'r', encoding='utf-8') as f:
     t_start = time.time()
     res = tokenizer.encode(s, add_special_tokens=False)
     t_end = time.time()
-    print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)')
+    print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)') # noqa: NP100
     with open(fname_out, 'w', encoding='utf-8') as f:
         for x in res:
             # LLaMA v3 for some reason strips the space for these tokens (and others)
@@ -41,6 +41,6 @@ with open(fname_tok, 'r', encoding='utf-8') as f:
             # f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
             # f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
             f.write(str(x) + '\n')
-    print('len(res): ', len(res))
-    print('len(lines): ', len(lines))
-print('results written to: ', fname_out)
+    print('len(res): ', len(res)) # noqa: NP100
+    print('len(lines): ', len(lines)) # noqa: NP100
+print('results written to: ', fname_out) # noqa: NP100
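For reference, a condensed, self-contained sketch of what the tagged section does: tokenize a text file with a Hugging Face tokenizer, time the call, and dump one token id per line to <input>.tok. The paths below are placeholders; the real script takes dir_tokenizer and fname_tok from argparse:

    import time

    from transformers import AutoTokenizer

    dir_tokenizer = "path/to/hf/tokenizer"   # placeholder
    fname_tok = "input.txt"                  # placeholder

    tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)

    with open(fname_tok, 'r', encoding='utf-8') as f:
        s = ''.join(f.readlines())

    t_start = time.time()
    res = tokenizer.encode(s, add_special_tokens=False)
    t_end = time.time()
    print('tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)')  # noqa: NP100

    # one id per line keeps the dump easy to diff against another tokenizer's output
    with open(fname_tok + '.tok', 'w', encoding='utf-8') as fout:
        for x in res:
            fout.write(str(x) + '\n')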