diff --git a/convert-gptneox-h5-to-gguf.py b/convert-gptneox-h5-to-gguf.py
new file mode 100644
index 000000000..cbade7079
--- /dev/null
+++ b/convert-gptneox-h5-to-gguf.py
@@ -0,0 +1,173 @@
+# Quick and dirty HF gptneox -> gguf conversion
+
+import gguf
+import sys
+import struct
+import json
+import numpy as np
+from typing import Any, List
+from pathlib import Path
+from transformers import AutoModelForCausalLM
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-gptneox-h5-to-gguf.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+fname_out = sys.argv[1] + "/ggml-model.bin"
+
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+#
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+        sys.exit(1)
+    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+
+model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True)
+list_vars = model.state_dict()
+
+# count tensors to be converted
+tensor_count = 0
+for name in list_vars.keys():
+    # we don't need these
+    if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
+        continue
+    tensor_count += 1
+
+gguf_writer = gguf.GGUFWriter.open(fname_out)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+# This must be changed when adding/deleting kv
+kv_count = 14
+
+print("tensors " + str(tensor_count) + " kv " + str(kv_count))
+
+print("write gguf header")
+
+gguf_writer.write_header(tensor_count, kv_count)
+
+print("write gguf hparams")
+
+llm_arch = "gptneox"
+
+gguf_writer.write_name("pythia-70b-deduped")
+gguf_writer.write_description("gguf test model")
+gguf_writer.write_architecture(llm_arch)
+gguf_writer.write_context_length(llm_arch, hparams["max_position_embeddings"])
+gguf_writer.write_embedding_length(llm_arch, hparams["hidden_size"])
+gguf_writer.write_layer_count(llm_arch, hparams["num_hidden_layers"])
+gguf_writer.write_feed_forward_length(llm_arch, hparams["intermediate_size"])
+gguf_writer.write_rope_dimension_count(llm_arch, int(hparams["rotary_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
+gguf_writer.write_head_count(llm_arch, hparams["num_attention_heads"])
+gguf_writer.write_parallel_residual(llm_arch, hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
+gguf_writer.write_layer_norm_eps(llm_arch, hparams["layer_norm_eps"])
+
+# TOKENIZATION
+
+print("write gguf tokenizer")
+
+tokens: List[str] = []
+merges: List[str] = []
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    # vocab type gpt2
+    print("Adding gpt2 tokenizer vocab")
+
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer = json.load(f)
+
+    for key in tokenizer["model"]["vocab"]:
+        tokens.append(key)
+
+    merges = tokenizer["model"]["merges"]
+
+gguf_writer.write_tokenizer_model("gpt2")
+gguf_writer.write_token_list(tokens)
+gguf_writer.write_token_merges(merges)
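+
+# Note (sketch of the HF "fast" tokenizer format; exact fields vary by version):
+#   tokenizer.json = {"model": {"type": "BPE", "vocab": {"<token>": <id>, ...},
+#                               "merges": ["<a> <b>", ...]}, ...}
+# The vocab keys and merge rules are passed through to gguf unchanged.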
name.endswith(".attention.rotary_emb.inv_freq"): + continue + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if ftype != 0: + if name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + ftype_cur = 1 + else: + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + data = data.astype(np.float32) + ftype_cur = 0 + + gguf_writer.write_tensor_info(name, data) + + +# tensor data +print("write gguf tensor data") + +for name in list_vars.keys(): + data = list_vars[name].squeeze().numpy() + print("Process tensor: " + name + " with shape: ", data.shape) + + # we don't need these + if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"): + print(" Skip tensor: " + name) + continue + + n_dims = len(data.shape) + + # ftype == 0 -> float32, ftype == 1 -> float16 + ftype_cur = 0 + if ftype != 0: + if name.endswith(".weight") and n_dims == 2: + print(" Converting to float16") + data = data.astype(np.float16) + ftype_cur = 1 + else: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + else: + if data.dtype != np.float32: + print(" Converting to float32") + data = data.astype(np.float32) + ftype_cur = 0 + + gguf_writer.write_tensor(data) + +gguf_writer.close() + + +print("Done. Output file: " + fname_out) +print("") diff --git a/gptneox-common.cpp b/gptneox-common.cpp new file mode 100644 index 000000000..9dee0cb9c --- /dev/null +++ b/gptneox-common.cpp @@ -0,0 +1,601 @@ +#include "gptneox-common.h" + +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +// Function to check if the next argument exists +std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) { + if (i + 1 < argc && argv[i + 1][0] != '-') { + return argv[++i]; + } else { + fprintf(stderr, "error: %s requires one argument.\n", flag.c_str()); + gpt_print_usage(argc, argv, params); + exit(0); + } +} + +bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + + if (arg == "-s" || arg == "--seed") { + params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-t" || arg == "--threads") { + params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { + params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-p" || arg == "--prompt") { + params.prompt = get_next_arg(i, argc, argv, arg, params); + } else if (arg == "-n" || arg == "--n_predict") { + params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--top_k") { + params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--top_p") { + params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--temp") { + params.temp = std::stof(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--repeat-last-n") { + params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "--repeat-penalty") { + params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params)); + } else if (arg == "-b" || arg == "--batch_size") { + params.n_batch= std::stoi(get_next_arg(i, 
diff --git a/gptneox-common.cpp b/gptneox-common.cpp
new file mode 100644
index 000000000..9dee0cb9c
--- /dev/null
+++ b/gptneox-common.cpp
@@ -0,0 +1,601 @@
+#include "gptneox-common.h"
+
+#include <cmath>
+#include <cstring>
+#include <fstream>
+#include <regex>
+#include <locale>
+#include <codecvt>
+#include <sstream>
+#include <algorithm>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+// Fetch the argument following a flag, or print usage and exit if it is missing
+std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
+    if (i + 1 < argc && argv[i + 1][0] != '-') {
+        return argv[++i];
+    } else {
+        fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
+        gpt_print_usage(argc, argv, params);
+        exit(0);
+    }
+}
+
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+
+        if (arg == "-s" || arg == "--seed") {
+            params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-t" || arg == "--threads") {
+            params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
+            params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-p" || arg == "--prompt") {
+            params.prompt = get_next_arg(i, argc, argv, arg, params);
+        } else if (arg == "-n" || arg == "--n_predict") {
+            params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--top_k") {
+            params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--top_p") {
+            params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--temp") {
+            params.temp = std::stof(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--repeat-last-n") {
+            params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "--repeat-penalty") {
+            params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-b" || arg == "--batch_size") {
+            params.n_batch = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-m" || arg == "--model") {
+            params.model = get_next_arg(i, argc, argv, arg, params);
+        } else if (arg == "-i" || arg == "--interactive") {
+            params.interactive = true;
+        } else if (arg == "-ip" || arg == "--interactive-port") {
+            params.interactive = true;
+            params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params));
+        } else if (arg == "-h" || arg == "--help") {
+            gpt_print_usage(argc, argv, params);
+            exit(0);
+        } else if (arg == "-f" || arg == "--file") {
+            get_next_arg(i, argc, argv, arg, params);
+            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                break;
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+            if (params.prompt.back() == '\n') {
+                params.prompt.pop_back();
+            }
+        } else if (arg == "-tt" || arg == "--token_test") {
+            params.token_test = get_next_arg(i, argc, argv, arg, params);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            gpt_print_usage(argc, argv, params);
+            exit(0);
+        }
+    }
+
+    return true;
+}
+
+void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
+    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -ngl N, --gpu-layers N  number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers);
+    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
+    fprintf(stderr, "                        prompt to start generation with (default: random)\n");
+    fprintf(stderr, "  -f FNAME, --file FNAME\n");
+    fprintf(stderr, "                        load prompt from a file\n");
+    fprintf(stderr, "  -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
+    fprintf(stderr, "                        test tokenization\n");
+    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
+    fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
+    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
+    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", params.temp);
+    fprintf(stderr, "  --repeat-last-n N     last n tokens to consider for penalization (default: %d, 0 = disabled)\n", params.repeat_last_n);
+    fprintf(stderr, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "\n");
+}
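+
+// Example invocation of a binary built on these helpers (sketch; the binary
+// name and model path are hypothetical):
+//   ./gptneox -m ./pythia-70m-deduped/ggml-model-f16.gguf -p "Once upon a time" -n 64 --temp 0.8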
+
+std::string gpt_random_prompt(std::mt19937 & rng) {
+    const int r = rng() % 10;
+    switch (r) {
+        case 0: return "So";
+        case 1: return "Once upon a time";
+        case 2: return "When";
+        case 3: return "The";
+        case 4: return "After";
+        case 5: return "If";
+        case 6: return "import";
+        case 7: return "He";
+        case 8: return "She";
+        case 9: return "They";
+        default: return "To";
+    }
+
+    return "The";
+}
+
+std::string trim(const std::string & s) {
+    std::regex e("^\\s+|\\s+$");
+    return std::regex_replace(s, e, "");
+}
+
+std::string replace(const std::string & s, const std::string & from, const std::string & to) {
+    std::string result = s;
+    size_t pos = 0;
+    while ((pos = result.find(from, pos)) != std::string::npos) {
+        result.replace(pos, from.length(), to);
+        pos += to.length();
+    }
+    return result;
+}
+
+void gpt_vocab::add_special_token(const std::string & token) {
+    special_tokens.push_back(token);
+}
+
+std::map<std::string, int32_t> json_parse(const std::string & fname) {
+    std::map<std::string, int32_t> result;
+
+    // read file into string
+    std::string json;
+    {
+        std::ifstream ifs(fname);
+        if (!ifs) {
+            fprintf(stderr, "Failed to open %s\n", fname.c_str());
+            exit(1);
+        }
+
+        json = std::string((std::istreambuf_iterator<char>(ifs)),
+                           (std::istreambuf_iterator<char>()));
+    }
+
+    if (json[0] != '{') {
+        return result;
+    }
+
+    // parse json
+    {
+        bool has_key  = false;
+        bool in_token = false;
+
+        std::string str_key = "";
+        std::string str_val = "";
+
+        int n = json.size();
+        for (int i = 1; i < n; ++i) {
+            if (!in_token) {
+                if (json[i] == ' ') continue;
+                if (json[i] == '"') {
+                    in_token = true;
+                    continue;
+                }
+            } else {
+                if (json[i] == '\\' && i+1 < n) {
+                    if (has_key == false) {
+                        str_key += json[i];
+                    } else {
+                        str_val += json[i];
+                    }
+                    ++i;
+                } else if (json[i] == '"') {
+                    if (has_key == false) {
+                        has_key = true;
+                        ++i;
+                        while (json[i] == ' ') ++i;
+                        ++i; // :
+                        while (json[i] == ' ') ++i;
+                        if (json[i] != '\"') {
+                            while (json[i] != ',' && json[i] != '}') {
+                                str_val += json[i++];
+                            }
+                            has_key = false;
+                        } else {
+                            in_token = true;
+                            continue;
+                        }
+                    } else {
+                        has_key = false;
+                    }
+
+                    str_key = ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
+                    str_key = ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
+                    str_key = ::replace(str_key, "\\\"",    "\""); // \\\"   -> "
+
+                    try {
+                        result[str_key] = std::stoi(str_val);
+                    } catch (...) {
+                        //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
+                    }
+                    str_key = "";
+                    str_val = "";
+                    in_token = false;
+                    continue;
+                }
+                if (has_key == false) {
+                    str_key += json[i];
+                } else {
+                    str_val += json[i];
+                }
+            }
+        }
+    }
+
+    return result;
+}
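+
+// json_parse expects a flat GPT-2 style encoder.json mapping token -> id,
+// e.g. (sketch): {"!": 0, "\"": 1, ..., "\u0120the": 262, ...}
+// where \u0120 encodes the GPT-2 leading-space marker.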
+
+std::string convert_to_utf8(const std::wstring & input) {
+    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
+    return converter.to_bytes(input);
+}
+
+std::wstring convert_to_wstring(const std::string & input) {
+    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
+    return converter.from_bytes(input);
+}
+
+void gpt_split_words(std::string str, std::vector<std::string>& words) {
+    const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
+    const std::regex re(pattern);
+    std::smatch m;
+
+    while (std::regex_search(str, m, re)) {
+        for (auto x : m) {
+            words.push_back(x);
+        }
+        str = m.suffix();
+    }
+}
+
+std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
+    std::vector<std::string> words;
+
+    // first split the text into words
+    {
+        std::string str = text;
+
+        // Generate the subpattern from the special_tokens vector if it's not empty
+        if (!vocab.special_tokens.empty()) {
+            const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
+            std::string special_tokens_subpattern;
+            for (const auto & token : vocab.special_tokens) {
+                if (!special_tokens_subpattern.empty()) {
+                    special_tokens_subpattern += "|";
+                }
+                special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
+            }
+
+            std::regex re(special_tokens_subpattern);
+            std::smatch m;
+            // Split the text by special tokens.
+            while (std::regex_search(str, m, re)) {
+                // Split the substrings in-between special tokens into words.
+                gpt_split_words(m.prefix(), words);
+                // Add matched special tokens as words.
+                for (auto x : m) {
+                    words.push_back(x);
+                }
+                str = m.suffix();
+            }
+            // Remaining text without special tokens will be handled below.
+        }
+
+        gpt_split_words(str, words);
+    }
+
+    // find the longest token that forms each word in words:
+    std::vector<gpt_vocab::id> tokens;
+    for (const auto & word : words) {
+        for (int i = 0; i < (int) word.size(); ) {
+            for (int j = word.size() - 1; j >= i; j--) {
+                auto cand = word.substr(i, j-i+1);
+                auto it = vocab.token_to_id.find(cand);
+                if (it != vocab.token_to_id.end()) { // word.substr(i, j-i+1) in vocab
+                    tokens.push_back(it->second);
+                    i = j + 1;
+                    break;
+                } else if (j == i) { // word.substr(i, 1) has no matching
+                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
+                    i++;
+                }
+            }
+        }
+    }
+
+    return tokens;
+}
+
+std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) {
+    std::vector<gpt_vocab::id> output;
+    std::stringstream ss(input);
+    std::string token;
+
+    while (std::getline(ss, token, delimiter)) {
+        output.push_back(std::stoi(token));
+    }
+
+    return output;
+}
+
+std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test) {
+    if (fpath_test.empty()) {
+        fprintf(stderr, "%s : No test file found.\n", __func__);
+        return std::map<std::string, std::vector<gpt_vocab::id>>();
+    }
+
+    std::map<std::string, std::vector<gpt_vocab::id>> tests;
+
+    auto fin = std::ifstream(fpath_test, std::ios_base::in);
+    const char * delimiter = " => ";
+    const char del_tok = ',';
+    std::string line;
+    while (std::getline(fin, line)) {
+        size_t delimiterPos = line.find(delimiter);
+        if (delimiterPos != std::string::npos) {
+            std::string text = line.substr(0, delimiterPos);
+            std::string s_tokens = line.substr(delimiterPos + std::strlen(delimiter));
+            tests[text] = parse_tokens_from_string(s_tokens, del_tok);
+        }
+    }
+    return tests;
+}
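+
+// Each line of the test file has the form "<text> => <id>,<id>,...", e.g.
+// (token ids hypothetical):
+//   Hello world => 12092,1533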
+
+void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test) {
+    std::map<std::string, std::vector<gpt_vocab::id>> tests = extract_tests_from_file(fpath_test);
+
+    size_t n_fails = 0;
+
+    for (const auto & test : tests) {
+        std::vector<gpt_vocab::id> tokens = gpt_tokenize(vocab, test.first);
+
+        if (tokens != test.second) {
+            n_fails++;
+
+            // print out failure cases
+            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test.first.c_str());
+            fprintf(stderr, "%s : tokens in hf:   ", __func__);
+            for (const auto & t : test.second) {
+                fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
+            }
+            fprintf(stderr, "\n");
+            fprintf(stderr, "%s : tokens in ggml: ", __func__);
+            for (const auto & t : tokens) {
+                fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
+            }
+            fprintf(stderr, "\n");
+        }
+    }
+
+    fprintf(stderr, "%s : %zu tests failed out of %zu tests.\n", __func__, n_fails, tests.size());
+}
+
+bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
+    printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
+
+    vocab.token_to_id = ::json_parse(fname);
+
+    for (const auto & kv : vocab.token_to_id) {
+        vocab.id_to_token[kv.second] = kv.first;
+    }
+
+    printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
+
+    // print the vocabulary
+    //for (auto kv : vocab.token_to_id) {
+    //    printf("'%s' -> %d\n", kv.first.data(), kv.second);
+    //}
+
+    return true;
+}
+
+gpt_vocab::id gpt_sample_top_k_top_p(
+        const gpt_vocab & vocab,
+        const float * logits,
+        int    top_k,
+        double top_p,
+        double temp,
+        std::mt19937 & rng) {
+    int n_logits = vocab.id_to_token.size();
+
+    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
+    logits_id.reserve(n_logits);
+
+    {
+        const double scale = 1.0/temp;
+        for (int i = 0; i < n_logits; ++i) {
+            logits_id.push_back(std::make_pair(logits[i]*scale, i));
+        }
+    }
+
+    // find the top K tokens
+    std::partial_sort(
+            logits_id.begin(),
+            logits_id.begin() + top_k, logits_id.end(),
+            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
+        return a.first > b.first;
+    });
+
+    logits_id.resize(top_k);
+
+    double maxl = -INFINITY;
+    for (const auto & kv : logits_id) {
+        maxl = std::max(maxl, kv.first);
+    }
+
+    // compute probs for the top K tokens
+    std::vector<double> probs;
+    probs.reserve(logits_id.size());
+
+    double sum = 0.0;
+    for (const auto & kv : logits_id) {
+        double p = exp(kv.first - maxl);
+        probs.push_back(p);
+        sum += p;
+    }
+
+    // normalize the probs
+    for (auto & p : probs) {
+        p /= sum;
+    }
+
+    if (top_p < 1.0f) {
+        double cumsum = 0.0f;
+        for (int i = 0; i < top_k; i++) {
+            cumsum += probs[i];
+            if (cumsum >= top_p) {
+                top_k = i + 1;
+                probs.resize(top_k);
+                logits_id.resize(top_k);
+                break;
+            }
+        }
+
+        cumsum = 1.0/cumsum;
+        for (int i = 0; i < (int) probs.size(); i++) {
+            probs[i] *= cumsum;
+        }
+    }
+
+    //printf("\n");
+    //for (int i = 0; i < (int) probs.size(); i++) {
+    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+    //}
+    //exit(0);
+
+    std::discrete_distribution<> dist(probs.begin(), probs.end());
+    int idx = dist(rng);
+
+    return logits_id[idx].second;
+}
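+
+// Worked example (sketch): temp = 0.8, top_k = 3, top three logits {2.0, 1.0, 0.5}:
+//   scaled  = logit/temp        -> {2.50, 1.25, 0.625}
+//   softmax = exp(x - max)/sum  -> {0.694, 0.199, 0.107}
+// With top_p = 0.85 the cumulative sum crosses 0.85 after two tokens, so the
+// tail is cut and the remaining probs renormalize to {0.777, 0.223}.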
+
+gpt_vocab::id gpt_sample_top_k_top_p_repeat(
+        const gpt_vocab & vocab,
+        const float * logits,
+        const int32_t * last_n_tokens_data,
+        size_t last_n_tokens_data_size,
+        int    top_k,
+        double top_p,
+        double temp,
+        int repeat_last_n,
+        float repeat_penalty,
+        std::mt19937 & rng) {
+
+    int n_logits = vocab.id_to_token.size();
+
+    const auto * plogits = logits;
+
+    const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size);
+
+    if (temp <= 0) {
+        // select the token with the highest logit directly
+        float max_logit = plogits[0];
+        gpt_vocab::id max_id = 0;
+
+        for (int i = 1; i < n_logits; ++i) {
+            if (plogits[i] > max_logit) {
+                max_logit = plogits[i];
+                max_id = i;
+            }
+        }
+        return max_id;
+    }
+
+    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
+    logits_id.reserve(n_logits);
+
+    {
+        const float scale = 1.0f/temp;
+        for (int i = 0; i < n_logits; ++i) {
+            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
+            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
+            if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) {
+                // if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
+                if (plogits[i] < 0.0f) {
+                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
+                } else {
+                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
+                }
+            } else {
+                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
+            }
+        }
+    }
+
+    // find the top K tokens
+    std::partial_sort(
+            logits_id.begin(),
+            logits_id.begin() + top_k, logits_id.end(),
+            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
+        return a.first > b.first;
+    });
+
+    logits_id.resize(top_k);
+
+    double maxl = -INFINITY;
+    for (const auto & kv : logits_id) {
+        maxl = std::max(maxl, kv.first);
+    }
+
+    // compute probs for the top K tokens
+    std::vector<double> probs;
+    probs.reserve(logits_id.size());
+
+    double sum = 0.0;
+    for (const auto & kv : logits_id) {
+        double p = exp(kv.first - maxl);
+        probs.push_back(p);
+        sum += p;
+    }
+
+    // normalize the probs
+    for (auto & p : probs) {
+        p /= sum;
+    }
+
+    if (top_p < 1.0f) {
+        double cumsum = 0.0f;
+        for (int i = 0; i < top_k; i++) {
+            cumsum += probs[i];
+            if (cumsum >= top_p) {
+                top_k = i + 1;
+                probs.resize(top_k);
+                logits_id.resize(top_k);
+                break;
+            }
+        }
+
+        cumsum = 1.0/cumsum;
+        for (int i = 0; i < (int) probs.size(); i++) {
+            probs[i] *= cumsum;
+        }
+    }
+
+    //printf("\n");
+    //for (int i = 0; i < (int) probs.size(); i++) {
+    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+    //}
+
+    std::discrete_distribution<> dist(probs.begin(), probs.end());
+    int idx = dist(rng);
+
+    return logits_id[idx].second;
+}
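+
+// Usage sketch (variable names hypothetical): with `logits` for the current
+// step and a ring buffer `last_n_tokens` of recent token ids:
+//   gpt_vocab::id id = gpt_sample_top_k_top_p_repeat(vocab, logits,
+//       last_n_tokens.data(), last_n_tokens.size(), 40, 0.9, 0.8, 64, 1.1, rng);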
diff --git a/gptneox-common.h b/gptneox-common.h
new file mode 100644
index 000000000..60e5650c1
--- /dev/null
+++ b/gptneox-common.h
@@ -0,0 +1,125 @@
+// Various helper functions and utilities
+
+#pragma once
+
+#include <string>
+#include <map>
+#include <vector>
+#include <random>
+#include <thread>
+#include <algorithm>
+
+//
+// CLI argument parsing
+//
+
+struct gpt_params {
+    int32_t seed      = -1;  // RNG seed
+    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_predict = 200; // new tokens to predict
+    int32_t n_batch   = 8;   // batch size for prompt processing
+
+    // sampling parameters
+    int32_t top_k          = 40;
+    float   top_p          = 0.9f;
+    float   temp           = 0.9f;
+    int32_t repeat_last_n  = 64;
+    float   repeat_penalty = 1.00f;
+
+    std::string model      = "models/gpt-2-117M/ggml-model.bin"; // model path
+    std::string prompt     = "";
+    std::string token_test = "";
+
+    bool    interactive      = false;
+    int32_t interactive_port = -1;
+
+    int32_t n_gpu_layers     = 0;
+};
+
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+
+void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+
+std::string gpt_random_prompt(std::mt19937 & rng);
+
+//
+// Vocab utils
+//
+
+std::string trim(const std::string & s);
+
+std::string replace(
+        const std::string & s,
+        const std::string & from,
+        const std::string & to);
+
+struct gpt_vocab {
+    using id    = int32_t;
+    using token = std::string;
+
+    std::map<token, id> token_to_id;
+    std::map<id, token> id_to_token;
+    std::vector<std::string> special_tokens;
+
+    void add_special_token(const std::string & token);
+};
+
+// poor-man's JSON parsing
+std::map<std::string, int32_t> json_parse(const std::string & fname);
+
+std::string convert_to_utf8(const std::wstring & input);
+
+std::wstring convert_to_wstring(const std::string & input);
+
+void gpt_split_words(std::string str, std::vector<std::string>& words);
+
+// split text into tokens
+//
+//   ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
+//
+//   Regex (Python):
+//   r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
+//
+//   Regex (C++):
+//   R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
+//
+std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
+
+// test outputs of gpt_tokenize
+//
+//   - compare with tokens generated by the huggingface tokenizer
+//   - test cases are chosen based on the model's main language (under 'prompt' directory)
+//   - if all sentences are tokenized identically, print 'All tests passed.'
+//   - otherwise, print sentence, huggingface tokens, ggml tokens
+//
+void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test);
+
+// load the tokens from encoder.json
+bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
+
+// sample the next token given the logits over the vocabulary
+//
+//   - divide the logits by the temperature
+//   - consider only the top K tokens
+//   - from them, consider only the top tokens with cumulative probability > P
+//
+// TODO: not sure if this implementation is correct
+//
+gpt_vocab::id gpt_sample_top_k_top_p(
+        const gpt_vocab & vocab,
+        const float * logits,
+        int    top_k,
+        double top_p,
+        double temp,
+        std::mt19937 & rng);
+
+// same as above, but with a repetition penalty applied to the logits of the
+// last repeat_last_n tokens before sampling
+gpt_vocab::id gpt_sample_top_k_top_p_repeat(
+        const gpt_vocab & vocab,
+        const float * logits,
+        const int32_t * last_n_tokens_data,
+        size_t last_n_tokens_data_size,
+        int    top_k,
+        double top_p,
+        double temp,
+        int repeat_last_n,
+        float repeat_penalty,
+        std::mt19937 & rng);
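+
+// Typical flow (sketch):
+//   gpt_vocab vocab;
+//   gpt_vocab_init("models/gpt-2-117M/encoder.json", vocab);
+//   std::vector<gpt_vocab::id> ids = gpt_tokenize(vocab, "Hello world");
+//   // ...evaluate the model on `ids`, then sample with gpt_sample_top_k_top_p()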