Compare commits

...

5 Commits

Author SHA1 Message Date
Xuan Son Nguyen
4813080d7c
Merge 242135eca4 into 9a483999a6 2025-01-12 13:45:26 +01:00
Xuan Son Nguyen
9a483999a6
llama : fix chat template gguf key (#11201) 2025-01-12 13:45:14 +01:00
Xuan Son Nguyen
242135eca4 various fixes 2025-01-11 21:35:10 +01:00
Xuan Son Nguyen
ef089ca105 fix build 2025-01-11 20:35:10 +01:00
Xuan Son Nguyen
803031665a common : support tag-based hf_repo like on ollama 2025-01-11 19:44:12 +01:00
4 changed files with 136 additions and 31 deletions

View File

@ -128,18 +128,105 @@ std::string common_arg::to_string() {
// utils // utils
// //
#if defined(LLAMA_USE_CURL)
/**
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
* Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
* Return pair of <repo, file> (with "repo" already having tag removed)
*/
static std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
std::string tag = parts.size() > 1 ? parts.back() : "latest";
std::string hf_repo = parts[0];
if (string_split<std::string>(hf_repo, '/').size() != 2) {
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
}
// fetch model info from Hugging Face Hub API
json model_info;
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
std::string res_str;
std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
return size * nmemb;
};
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
#if defined(_WIN32)
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
#endif
if (!hf_token.empty()) {
std::string auth_header = "Authorization: Bearer " + hf_token;
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
}
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
CURLcode res = curl_easy_perform(curl.get());
if (res != CURLE_OK) {
throw std::runtime_error("error: cannot make GET request to HF API");
}
long res_code;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
if (res_code == 200) {
model_info = json::parse(res_str);
} else if (res_code == 401) {
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
} else {
throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
}
// check response
if (!model_info.contains("ggufFile")) {
throw std::runtime_error("error: model does not have ggufFile");
}
json & gguf_file = model_info.at("ggufFile");
if (!gguf_file.contains("rfilename")) {
throw std::runtime_error("error: ggufFile does not have rfilename");
}
return std::make_pair(hf_repo, gguf_file.at("rfilename"));
}
#else
// Fallback stub for builds without libcurl: resolving a tag-based HF repo
// requires a network request to the Hugging Face Hub API, so all we can do
// here is fail loudly with an explanatory error.
static std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
throw std::runtime_error("error: llama.cpp built without libcurl");
}
#endif
static void common_params_handle_model_default( static void common_params_handle_model_default(
std::string & model, std::string & model,
std::string & model_url, const std::string & model_url,
std::string & hf_repo, std::string & hf_repo,
std::string & hf_file) { std::string & hf_file,
const std::string & hf_token) {
if (!hf_repo.empty()) { if (!hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model // short-hand to avoid specifying --hf-file -> default it to --model
if (hf_file.empty()) { if (hf_file.empty()) {
if (model.empty()) { if (model.empty()) {
throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); try {
auto auto_detected = common_get_hf_file(hf_repo, hf_token);
hf_repo = auto_detected.first;
hf_file = auto_detected.second;
printf("%s: using hf_file = %s\n", __func__, hf_file.c_str());
} catch (std::exception & e) {
fprintf(stderr, "%s: %s\n", __func__, e.what());
exit(1);
} }
} else {
hf_file = model; hf_file = model;
}
} else if (model.empty()) { } else if (model.empty()) {
// this is to avoid different repo having same file name, or same file name in different subdirs // this is to avoid different repo having same file name, or same file name in different subdirs
std::string filename = hf_repo + "_" + hf_file; std::string filename = hf_repo + "_" + hf_file;
@ -290,8 +377,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
} }
// TODO: refactor model params in a common struct // TODO: refactor model params in a common struct
common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file); common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token);
common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file); common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token);
if (params.escape) { if (params.escape) {
string_process_escapes(params.prompt); string_process_escapes(params.prompt);
@ -1583,21 +1670,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
} }
).set_env("LLAMA_ARG_MODEL_URL")); ).set_env("LLAMA_ARG_MODEL_URL"));
add_opt(common_arg( add_opt(common_arg(
{"-hfr", "--hf-repo"}, "REPO", {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
"Hugging Face model repository (default: unused)", "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
"example: unsloth/phi-4-GGUF:q4_k_m\n"
"(default: unused)",
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.hf_repo = value; params.hf_repo = value;
} }
).set_env("LLAMA_ARG_HF_REPO")); ).set_env("LLAMA_ARG_HF_REPO"));
add_opt(common_arg( add_opt(common_arg(
{"-hff", "--hf-file"}, "FILE", {"-hff", "--hf-file"}, "FILE",
"Hugging Face model file (default: unused)", "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.hf_file = value; params.hf_file = value;
} }
).set_env("LLAMA_ARG_HF_FILE")); ).set_env("LLAMA_ARG_HF_FILE"));
add_opt(common_arg( add_opt(common_arg(
{"-hfrv", "--hf-repo-v"}, "REPO", {"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
"Hugging Face model repository for the vocoder model (default: unused)", "Hugging Face model repository for the vocoder model (default: unused)",
[](common_params & params, const std::string & value) { [](common_params & params, const std::string & value) {
params.vocoder.hf_repo = value; params.vocoder.hf_repo = value;

View File

@ -52,11 +52,6 @@
#include <sys/stat.h> #include <sys/stat.h>
#include <unistd.h> #include <unistd.h>
#endif #endif
#if defined(LLAMA_USE_CURL)
#include <curl/curl.h>
#include <curl/easy.h>
#include <future>
#endif
#if defined(_MSC_VER) #if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
@ -1130,7 +1125,8 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) { static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
// Initialize libcurl // Initialize libcurl
std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup); curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
curl_slist_ptr http_headers;
if (!curl) { if (!curl) {
LOG_ERR("%s: error initializing libcurl\n", __func__); LOG_ERR("%s: error initializing libcurl\n", __func__);
return false; return false;
@ -1144,11 +1140,9 @@ static bool common_download_file(const std::string & url, const std::string & pa
// Check if hf-token or bearer-token was specified // Check if hf-token or bearer-token was specified
if (!hf_token.empty()) { if (!hf_token.empty()) {
std::string auth_header = "Authorization: Bearer "; std::string auth_header = "Authorization: Bearer " + hf_token;
auth_header += hf_token.c_str(); http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
struct curl_slist *http_headers = NULL; curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
http_headers = curl_slist_append(http_headers, auth_header.c_str());
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
} }
#if defined(_WIN32) #if defined(_WIN32)
@ -1636,15 +1630,8 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
// //
std::string common_get_builtin_chat_template(const struct llama_model * model) { std::string common_get_builtin_chat_template(const struct llama_model * model) {
static const char * template_key = "tokenizer.chat_template"; const char * ptr_tmpl = llama_model_chat_template(model);
// call with NULL buffer to get the total size of the string return ptr_tmpl == nullptr ? "" : ptr_tmpl;
int32_t res = llama_model_meta_val_str(model, template_key, NULL, 0);
if (res > 0) {
std::vector<char> model_template(res + 1, 0);
llama_model_meta_val_str(model, template_key, model_template.data(), model_template.size());
return std::string(model_template.data(), model_template.size() - 1);
}
return "";
} }
bool common_chat_verify_template(const std::string & tmpl) { bool common_chat_verify_template(const std::string & tmpl) {

View File

@ -8,6 +8,12 @@
#include <vector> #include <vector>
#include <sstream> #include <sstream>
#if defined(LLAMA_USE_CURL)
#include <curl/curl.h>
#include <curl/easy.h>
#include <future>
#endif
#ifdef _WIN32 #ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\' #define DIRECTORY_SEPARATOR '\\'
#else #else
@ -454,6 +460,11 @@ static bool string_starts_with(const std::string & str,
return str.rfind(prefix, 0) == 0; return str.rfind(prefix, 0) == 0;
} }
static bool string_ends_with(const std::string & str,
const std::string & suffix) { // While we wait for C++20's std::string::ends_with...
return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
}
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides); bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input); void string_process_escapes(std::string & input);
@ -656,4 +667,22 @@ const char * const LLM_KV_SPLIT_NO = "split.no";
const char * const LLM_KV_SPLIT_COUNT = "split.count"; const char * const LLM_KV_SPLIT_COUNT = "split.count";
const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
#if defined(LLAMA_USE_CURL)
//
// CURL utils
//
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
struct curl_slist_ptr {
struct curl_slist * ptr = nullptr;
~curl_slist_ptr() {
if (ptr) {
curl_slist_free_all(ptr);
}
}
};
#endif
} }

View File

@ -178,7 +178,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" }, { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat.template" }, { LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" }, { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" }, { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" }, { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },