common : move arg parser code to arg.cpp (#9388)

* common : move arg parser to arg.cpp

* better categorize args

* add cmake

* missing climits

* missing cstdarg

* common : more explicit includes

* fix build

* refactor gpt_params_parse

* update server readme

* fix test

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Author: Xuan Son Nguyen, 2024-09-09 23:36:09 +02:00 (committed by GitHub)
parent 293bebe077
commit bfe76d4a17
36 changed files with 2281 additions and 2210 deletions


@@ -925,6 +925,7 @@ OBJ_LLAMA = \
OBJ_COMMON = \
common/common.o \
+common/arg.o \
common/console.o \
common/ngram-cache.o \
common/sampling.o \
@@ -1157,6 +1158,11 @@ common/common.o: \
include/llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
+common/arg.o: \
+common/arg.cpp \
+common/arg.h
+$(CXX) $(CXXFLAGS) -c $< -o $@
common/sampling.o: \
common/sampling.cpp \
common/sampling.h \


@@ -54,6 +54,8 @@ add_library(${TARGET} STATIC
base64.hpp
common.h
common.cpp
+arg.h
+arg.cpp
sampling.h
sampling.cpp
console.h

common/arg.cpp (new file, 1994 lines added): diff suppressed because it is too large.

common/arg.h (new file, 77 lines added):

@@ -0,0 +1,77 @@
#pragma once
#include "common.h"
#include <set>
#include <string>
#include <vector>
//
// CLI argument parsing
//
struct llama_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::vector<const char *> args;
const char * value_hint = nullptr; // help text or example for arg value
const char * value_hint_2 = nullptr; // for second arg value
const char * env = nullptr;
std::string help;
bool is_sparam = false; // is current arg a sampling param?
void (*handler_void) (gpt_params & params) = nullptr;
void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
void (*handler_int) (gpt_params & params, int) = nullptr;
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
void (*handler)(gpt_params & params, const std::string &)
) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
void (*handler)(gpt_params & params, int)
) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
llama_arg(
const std::initializer_list<const char *> & args,
const std::string & help,
void (*handler)(gpt_params & params)
) : args(args), help(help), handler_void(handler) {}
// support 2 values for arg
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const char * value_hint_2,
const std::string & help,
void (*handler)(gpt_params & params, const std::string &, const std::string &)
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
llama_arg & set_env(const char * env);
llama_arg & set_sparam();
bool in_example(enum llama_example ex);
bool get_value_from_env(std::string & output);
bool has_value_from_env();
std::string to_string();
};
struct gpt_params_context {
enum llama_example ex = LLAMA_EXAMPLE_COMMON;
gpt_params & params;
std::vector<llama_arg> options;
void(*print_usage)(int, char **) = nullptr;
gpt_params_context(gpt_params & params) : params(params) {}
};
// parse input arguments from CLI
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
// function to be used by test-arg-parser
gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
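To make the new flow concrete, here is a minimal sketch (not part of the commit) of how an example program drives the parser declared above, together with a commented illustration of how an option could be registered with the llama_arg constructors and setters. The `-c, --ctx-size` flag and its `LLAMA_ARG_CTX_SIZE` environment variable appear in the server README further down; the handler body and the usage callback shown here are illustrative only.

#include "arg.h"
#include "common.h"

#include <cstdio>

// one plausible way an option is registered inside gpt_params_parser_init();
// a captureless lambda decays to the plain function pointer llama_arg expects:
//
//     llama_arg(
//         {"-c", "--ctx-size"}, "N",
//         "size of the prompt context",
//         [](gpt_params & params, int value) { params.n_ctx = value; }
//     ).set_env("LLAMA_ARG_CTX_SIZE");

static void print_usage(int, char ** argv) {
    printf("usage: %s [options]\n", argv[0]);
}

int main(int argc, char ** argv) {
    gpt_params params;

    // reads CLI arguments and any LLAMA_ARG_* environment variables; an invalid
    // value prints the usage of the offending argument only, not the full help
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;
    }

    // params.model, params.n_ctx, ... are now populated
    return 0;
}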

File diff suppressed because it is too large.


@@ -4,20 +4,11 @@
#include "llama.h"
-#include "sampling.h"
#define LOG_NO_FILE_LINE_FUNCTION
#include "log.h"
-#include <cmath>
#include <string>
#include <vector>
-#include <random>
-#include <thread>
-#include <set>
-#include <unordered_map>
-#include <tuple>
-#include <functional>
#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
@@ -56,11 +47,20 @@ struct llama_control_vector_load_info;
// CPU utils
//
+struct cpu_params {
+int n_threads = -1;
+bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+bool mask_valid = false; // Default: any CPU
+enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+bool strict_cpu = false; // Use strict CPU placement
+uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+};
int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();
//
-// CLI argument parsing
+// Common params
//
enum llama_example {
@@ -78,28 +78,71 @@ enum llama_example {
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
LLAMA_EXAMPLE_EXPORT_LORA,
LLAMA_EXAMPLE_LLAVA,
+LLAMA_EXAMPLE_LOOKUP,
+LLAMA_EXAMPLE_PARALLEL,
LLAMA_EXAMPLE_COUNT,
};
+enum gpt_sampler_type {
+GPT_SAMPLER_TYPE_NONE = 0,
+GPT_SAMPLER_TYPE_TOP_K = 1,
+GPT_SAMPLER_TYPE_TOP_P = 2,
+GPT_SAMPLER_TYPE_MIN_P = 3,
+GPT_SAMPLER_TYPE_TFS_Z = 4,
+GPT_SAMPLER_TYPE_TYPICAL_P = 5,
+GPT_SAMPLER_TYPE_TEMPERATURE = 6,
+};
// dimensionality reduction methods, used by cvector-generator
enum dimre_method {
DIMRE_METHOD_PCA,
DIMRE_METHOD_MEAN,
};
-struct cpu_params {
-int n_threads = -1;
-bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
-bool mask_valid = false; // Default: any CPU
-enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
-bool strict_cpu = false; // Use strict CPU placement
-uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+// sampler parameters
+struct gpt_sampler_params {
+uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
+int32_t n_prev = 64; // number of previous tokens to remember
+int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
+int32_t top_k = 40; // <= 0 to use vocab size
+float top_p = 0.95f; // 1.0 = disabled
+float min_p = 0.05f; // 0.0 = disabled
+float tfs_z = 1.00f; // 1.0 = disabled
+float typ_p = 1.00f; // typical_p, 1.0 = disabled
+float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+float dynatemp_range = 0.00f; // 0.0 = disabled
+float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+float penalty_repeat = 1.00f; // 1.0 = disabled
+float penalty_freq = 0.00f; // 0.0 = disabled
+float penalty_present = 0.00f; // 0.0 = disabled
+int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+float mirostat_tau = 5.00f; // target entropy
+float mirostat_eta = 0.10f; // learning rate
+bool penalize_nl = false; // consider newlines as a repeatable token
+bool ignore_eos = false;
+std::vector<enum gpt_sampler_type> samplers = {
+GPT_SAMPLER_TYPE_TOP_K,
+GPT_SAMPLER_TYPE_TFS_Z,
+GPT_SAMPLER_TYPE_TYPICAL_P,
+GPT_SAMPLER_TYPE_TOP_P,
+GPT_SAMPLER_TYPE_MIN_P,
+GPT_SAMPLER_TYPE_TEMPERATURE
+};
+std::string grammar; // optional BNF-like grammar to constrain sampling
+std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+// print the parameters into a string
+std::string print() const;
};
struct gpt_params {
+enum llama_example curr_ex = LLAMA_EXAMPLE_COMMON;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -143,23 +186,23 @@ struct gpt_params {
struct gpt_sampler_params sparams;
-std::string model = ""; // model path
-std::string model_draft = ""; // draft model for speculative decoding
-std::string model_alias = "unknown"; // model alias
-std::string model_url = ""; // model url to download
-std::string hf_token = ""; // HF token
-std::string hf_repo = ""; // HF repo
-std::string hf_file = ""; // HF file
-std::string prompt = "";
-std::string prompt_file = ""; // store the external prompt file name
-std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
-std::string input_prefix = ""; // string to prefix user inputs with
-std::string input_suffix = ""; // string to suffix user inputs with
-std::string logdir = ""; // directory in which to save YAML log files
-std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
-std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
-std::string logits_file = ""; // file for saving *all* logits
-std::string rpc_servers = ""; // comma separated list of RPC servers
+std::string model = ""; // model path // NOLINT
+std::string model_draft = ""; // draft model for speculative decoding // NOLINT
+std::string model_alias = "unknown"; // model alias // NOLINT
+std::string model_url = ""; // model url to download // NOLINT
+std::string hf_token = ""; // HF token // NOLINT
+std::string hf_repo = ""; // HF repo // NOLINT
+std::string hf_file = ""; // HF file // NOLINT
+std::string prompt = ""; // NOLINT
+std::string prompt_file = ""; // store the external prompt file name // NOLINT
+std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
+std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
+std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
+std::string logdir = ""; // directory in which to save YAML log files // NOLINT
+std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
+std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
+std::string logits_file = ""; // file for saving *all* logits // NOLINT
+std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
@@ -189,7 +232,6 @@ struct gpt_params {
bool kl_divergence = false; // compute KL divergence
-std::function<void(int, char **)> print_usage = nullptr; // print example-specific usage and example
bool usage = false; // print usage
bool use_color = false; // use color to distinguish generations and inputs
bool special = false; // enable special token output
@@ -220,7 +262,7 @@ struct gpt_params {
std::string cache_type_v = "f16"; // KV cache data type for the V
// multimodal models (see examples/llava)
-std::string mmproj = ""; // path to multimodal projector
+std::string mmproj = ""; // path to multimodal projector // NOLINT
std::vector<std::string> image; // path to image file(s)
// embedding
@@ -236,15 +278,15 @@ struct gpt_params {
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
std::string hostname = "127.0.0.1";
-std::string public_path = "";
-std::string chat_template = "";
-std::string system_prompt = "";
+std::string public_path = ""; // NOLINT
+std::string chat_template = ""; // NOLINT
+std::string system_prompt = ""; // NOLINT
bool enable_chat_template = true;
std::vector<std::string> api_keys;
-std::string ssl_file_key = "";
-std::string ssl_file_cert = "";
+std::string ssl_file_key = ""; // NOLINT
+std::string ssl_file_cert = ""; // NOLINT
bool endpoint_slots = true;
bool endpoint_metrics = false;
@@ -299,92 +341,6 @@ struct gpt_params {
bool batched_bench_output_jsonl = false;
};
struct llama_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::vector<const char *> args;
const char * value_hint = nullptr; // help text or example for arg value
const char * value_hint_2 = nullptr; // for second arg value
const char * env = nullptr;
std::string help;
void (*handler_void) (gpt_params & params) = nullptr;
void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
void (*handler_int) (gpt_params & params, int) = nullptr;
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
void (*handler)(gpt_params & params, const std::string &)
) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const std::string & help,
void (*handler)(gpt_params & params, int)
) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
llama_arg(
const std::initializer_list<const char *> & args,
const std::string & help,
void (*handler)(gpt_params & params)
) : args(args), help(help), handler_void(handler) {}
// support 2 values for arg
llama_arg(
const std::initializer_list<const char *> & args,
const char * value_hint,
const char * value_hint_2,
const std::string & help,
void (*handler)(gpt_params & params, const std::string &, const std::string &)
) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
llama_arg & set_examples(std::initializer_list<enum llama_example> examples) {
this->examples = std::move(examples);
return *this;
}
llama_arg & set_env(const char * env) {
help = help + "\n(env: " + env + ")";
this->env = env;
return *this;
}
bool in_example(enum llama_example ex) {
return examples.find(ex) != examples.end();
}
bool get_value_from_env(std::string & output) const {
if (env == nullptr) return false;
char * value = std::getenv(env);
if (value) {
output = value;
return true;
}
return false;
}
bool has_value_from_env() const {
return env != nullptr && std::getenv(env);
}
std::string to_string();
};
// initialize list of options (arguments) that can be used by the current example
std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex);
// optionally, we can provide "print_usage" to print example usage
std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex, std::function<void(int, char **)> print_usage);
// parse input arguments from CLI
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set
void gpt_params_print_usage(gpt_params & params, std::vector<llama_arg> & options);
std::string gpt_params_get_system_info(const gpt_params & params);
bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
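Since gpt_sampler_params is now defined in common.h and embedded in gpt_params as `sparams`, sampling settings can be adjusted directly on the common params object. A small sketch (the values chosen are arbitrary, not taken from the commit):

#include "common.h"

#include <cstdio>

// tweak the sampler parameters carried inside gpt_params; field names and
// defaults come from the gpt_sampler_params struct shown above
static void tune_sampling(gpt_params & params) {
    gpt_sampler_params & sparams = params.sparams;

    sparams.temp  = 0.7f;   // default is 0.80f
    sparams.top_k = 20;     // default is 40
    sparams.top_p = 0.90f;  // default is 0.95f

    // restrict the sampler chain to top-k -> top-p -> temperature
    sparams.samplers = {
        GPT_SAMPLER_TYPE_TOP_K,
        GPT_SAMPLER_TYPE_TOP_P,
        GPT_SAMPLER_TYPE_TEMPERATURE,
    };

    // print() returns a human-readable summary of the effective parameters
    printf("%s\n", sparams.print().c_str());
}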


@@ -2,6 +2,9 @@
#include "common.h"
+#include <cmath>
+#include <unordered_map>
// the ring buffer works similarly to std::deque, but with a fixed capacity
// TODO: deduplicate with llama-impl.h
template<typename T>
@@ -420,7 +423,7 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
}
std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
-std::unordered_map<char, gpt_sampler_type> sampler_name_map {
+std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z },
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },


@@ -2,61 +2,11 @@
#include "llama.h"
+#include "common.h"
#include <string>
#include <vector>
enum gpt_sampler_type {
GPT_SAMPLER_TYPE_NONE = 0,
GPT_SAMPLER_TYPE_TOP_K = 1,
GPT_SAMPLER_TYPE_TOP_P = 2,
GPT_SAMPLER_TYPE_MIN_P = 3,
GPT_SAMPLER_TYPE_TFS_Z = 4,
GPT_SAMPLER_TYPE_TYPICAL_P = 5,
GPT_SAMPLER_TYPE_TEMPERATURE = 6,
};
// sampling parameters
struct gpt_sampler_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typ_p = 1.00f; // typical_p, 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.00f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
bool ignore_eos = false;
std::vector<enum gpt_sampler_type> samplers = {
GPT_SAMPLER_TYPE_TOP_K,
GPT_SAMPLER_TYPE_TFS_Z,
GPT_SAMPLER_TYPE_TYPICAL_P,
GPT_SAMPLER_TYPE_TOP_P,
GPT_SAMPLER_TYPE_MIN_P,
GPT_SAMPLER_TYPE_TEMPERATURE
};
std::string grammar; // optional BNF-like grammar to constrain sampling
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
// print the parameters into a string
std::string print() const;
};
// gpt_sampler extends llama_sampler with additional functionality:
//
// - grammar support


@@ -1,3 +1,4 @@
+#include "arg.h"
#include "common.h"
#include "llama.h"
@@ -37,8 +38,7 @@ static void print_usage(int, char ** argv) {
int main(int argc, char ** argv) {
gpt_params params;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_BENCH, print_usage);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
return 1;
}


@@ -1,3 +1,4 @@
+#include "arg.h"
#include "common.h"
#include "llama.h"
@@ -18,8 +19,7 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is";
params.n_predict = 32;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
return 1;
}


@@ -1,3 +1,4 @@
+#include "arg.h"
#include "common.h"
#include "llama.h"
#include "ggml.h"
@@ -388,8 +389,7 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
int main(int argc, char ** argv) {
gpt_params params;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
return 1;
}


@@ -12,12 +12,9 @@
#include <cstdio>
#include <ctime>
#include <random>
#include <string>
#include <tuple>
#include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#define DEBUG_POS 5


@@ -1,3 +1,4 @@
+#include "arg.h"
#include "common.h"
#include "llama.h"
@@ -79,8 +80,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
int main(int argc, char ** argv) {
gpt_params params;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EMBEDDING);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
return 1;
}


@@ -1,3 +1,4 @@
+#include "arg.h"
#include "common.h"
#include "llama.h"
#include "ggml.h"
@@ -144,8 +145,7 @@ int main(int argc, char ** argv) {
gpt_params params;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}


@@ -1,3 +1,4 @@
+#include "arg.h"
#include "common.h"
#include "ggml.h"
#include "ggml-alloc.h"
@@ -401,8 +402,7 @@ static void print_usage(int, char ** argv) {
int main(int argc, char ** argv) {
gpt_params params;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
return 1;
}


@@ -1,3 +1,4 @@
+#include "arg.h"
#include "common.h"
#include <fstream>
@@ -9,11 +10,11 @@ static void export_md(std::string fname, llama_example ex) {
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
gpt_params params;
-auto options = gpt_params_parser_init(params, ex);
+auto ctx_arg = gpt_params_parser_init(params, ex);
file << "| Argument | Explanation |\n";
file << "| -------- | ----------- |\n";
-for (auto & opt : options) {
+for (auto & opt : ctx_arg.options) {
file << "| `";
// args
for (const auto & arg : opt.args) {
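The export_md() hunk above is the simplest consumer of the new gpt_params_parser_init(): it only iterates ctx_arg.options. The other llama_arg helpers declared in arg.h (in_example, get_value_from_env, to_string) can be used the same way; a hedged sketch, not code from the commit:

#include "arg.h"
#include "common.h"

#include <cstdio>
#include <string>

// list the options visible to a given example, flagging any whose value is
// currently supplied through an environment variable
static void dump_options(enum llama_example ex) {
    gpt_params params;
    auto ctx_arg = gpt_params_parser_init(params, ex);

    for (auto & opt : ctx_arg.options) {
        if (!opt.in_example(ex)) {
            continue; // not exposed by this example
        }
        std::string value;
        if (opt.get_value_from_env(value)) {
            printf("# currently overridden by %s=%s\n", opt.env, value.c_str());
        }
        printf("%s\n", opt.to_string().c_str());
    }
}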


@@ -1,3 +1,4 @@
+#include "arg.h"
#include "common.h"
#include "llama.h"
@@ -153,8 +154,7 @@ static std::string gritlm_instruction(const std::string & instruction) {
int main(int argc, char * argv[]) {
gpt_params params;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}


@@ -1,3 +1,4 @@
+#include "arg.h"
#include "common.h"
#include "llama.h"
@@ -577,8 +578,7 @@ int main(int argc, char ** argv) {
params.logits_all = true;
params.verbosity = 1;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_IMATRIX, print_usage);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
return 1;
}


@@ -1,6 +1,7 @@
+#include "arg.h"
#include "common.h"
#include "console.h"
+#include "sampling.h"
#include "llama.h"
#include <cassert>
@@ -105,8 +106,7 @@ int main(int argc, char ** argv) {
gpt_params params;
g_params = &params;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_INFILL);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
return 1;
}


@@ -1,11 +1,12 @@
-#include "ggml.h"
-#include "base64.hpp"
+#include "arg.h"
#include "log.h"
#include "common.h"
+#include "sampling.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"
+#include "ggml.h"
+#include "base64.hpp"
#include <cstdio>
#include <cstdlib>
@@ -278,8 +279,7 @@ int main(int argc, char ** argv) {
gpt_params params;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
return 1;
}


@@ -1,9 +1,11 @@
-#include "ggml.h"
+#include "arg.h"
#include "log.h"
#include "common.h"
+#include "sampling.h"
#include "clip.h"
#include "llava.h"
#include "llama.h"
+#include "ggml.h"
#include <cstdio>
#include <cstdlib>
@@ -253,8 +255,7 @@ int main(int argc, char ** argv) {
gpt_params params;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, show_additional_info);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, show_additional_info)) {
return 1;
}


@@ -1,4 +1,6 @@
+#include "arg.h"
#include "common.h"
+#include "sampling.h"
#include "llama.h"
#include <cstdio>
@@ -36,8 +38,7 @@ struct ngram_container {
int main(int argc, char ** argv) {
gpt_params params;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}


@@ -1,7 +1,8 @@
-#include "ggml.h"
-#include "llama.h"
+#include "arg.h"
#include "common.h"
#include "ngram-cache.h"
+#include "ggml.h"
+#include "llama.h"
#include <cstdint>
#include <fstream>
@@ -13,8 +14,7 @@
int main(int argc, char ** argv){
gpt_params params;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1;
}
@@ -40,4 +40,6 @@ int main(int argc, char ** argv){
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
+return 0;
}


@@ -1,8 +1,9 @@
-#include "ggml.h"
+#include "arg.h"
#include "common.h"
-#include "llama.h"
#include "log.h"
#include "ngram-cache.h"
+#include "llama.h"
+#include "ggml.h"
#include <cmath>
#include <cstdint>
@@ -15,8 +16,7 @@
int main(int argc, char ** argv){
gpt_params params;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1;
}


@@ -1,7 +1,9 @@
+#include "arg.h"
#include "ggml.h"
-#include "llama.h"
#include "common.h"
#include "ngram-cache.h"
+#include "sampling.h"
+#include "llama.h"
#include <cstdint>
#include <cstdio>
@@ -12,8 +14,7 @@
int main(int argc, char ** argv){
gpt_params params;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
return 1;
}


@@ -1,6 +1,7 @@
+#include "arg.h"
#include "common.h"
#include "console.h"
+#include "sampling.h"
#include "llama.h"
#include <cassert>
@@ -138,9 +139,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
int main(int argc, char ** argv) {
gpt_params params;
g_params = &params;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN, print_usage);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
return 1;
}


@@ -1,7 +1,9 @@
// A basic application simulating a server with multiple clients.
// The clients submit requests to the server and they are processed in parallel.
+#include "arg.h"
#include "common.h"
+#include "sampling.h"
#include "llama.h"
#include <cmath>
@@ -100,8 +102,7 @@ int main(int argc, char ** argv) {
gpt_params params;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
return 1;
}


@@ -1,3 +1,4 @@
+#include "arg.h"
#include "common.h"
#include "llama.h"
@@ -19,8 +20,7 @@ int main(int argc, char ** argv) {
params.n_keep = 32;
params.i_pos = -1;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PASSKEY, print_usage);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
return 1;
}


@@ -1,18 +1,19 @@
+#include "arg.h"
#include "common.h"
#include "llama.h"
+#include <array>
+#include <atomic>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
+#include <fstream>
+#include <mutex>
+#include <random>
#include <sstream>
#include <thread>
-#include <mutex>
-#include <atomic>
#include <vector>
-#include <array>
-#include <fstream>
-#include <sstream>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@@ -1967,8 +1968,7 @@ int main(int argc, char ** argv) {
params.n_ctx = 512;
params.logits_all = true;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PERPLEXITY);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
return 1;
}


@@ -1,3 +1,4 @@
+#include "arg.h"
#include "common.h"
#include "llama.h"
@@ -111,8 +112,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
int main(int argc, char ** argv) {
gpt_params params;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_RETRIEVAL, print_usage);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
return 1;
}


@@ -1,3 +1,4 @@
+#include "arg.h"
#include "common.h"
#include "llama.h"
@@ -10,8 +11,7 @@ int main(int argc, char ** argv) {
params.prompt = "The quick brown fox";
params.sparams.seed = 1234;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}


@@ -23,36 +23,32 @@ The project is under active development, and we are [looking for feedback and co
| `--version` | show version and build info |
| `-v, --verbose` | print verbose information |
| `--verbosity N` | set specific verbosity level (default: 0) |
-| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
-| `--no-display-prompt` | don't print prompt at generation (default: false) |
-| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
| `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
| `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
| `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
| `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
| `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)<br/> |
+| `--prio N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
| `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)<br/> |
| `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
| `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
+| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
-| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) |
-| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) |
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
| `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
-| `--chunks N` | max number of chunks to process (default: -1, -1 = all) |
| `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
| `-p, --prompt PROMPT` | prompt to start generation with |
| `-f, --file FNAME` | a file containing the prompt (default: none) |
-| `--in-file FNAME` | an input file (repeat to specify multiple files) |
| `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
| `--no-escape` | do not process escape sequences |
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
-| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typical_p;top_p;min_p;temperature) |
+| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
+| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
| `--penalize-nl` | penalize newline tokens (default: false) |
@@ -92,13 +88,12 @@ The project is under active development, and we are [looking for feedback and co
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1) |
-| `-ns, --sequences N` | number of sequences to decode (default: 1) |
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing |
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
-| `-ngl, --gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
+| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
| `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
| `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 |
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) |
@@ -109,7 +104,7 @@ The project is under active development, and we are [looking for feedback and co
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
| `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
| `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
-| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_MODEL) |
+| `-a, --alias STRING` | set alias for model name (to be used by REST API) |
| `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
| `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
@@ -123,7 +118,7 @@ The project is under active development, and we are [looking for feedback and co
| `--api-key-file FNAME` | path to file containing API keys (default: none) |
| `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key |
| `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate |
-| `--timeout N` | server read/write timeout in seconds (default: 600) |
+| `-to, --timeout N` | server read/write timeout in seconds (default: 600) |
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
| `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
| `--log-format {text, json}` | log output format: json or text (default: json) |


@@ -1,6 +1,8 @@
#include "utils.hpp"
+#include "arg.h"
#include "common.h"
+#include "sampling.h"
#include "json-schema-to-grammar.h"
#include "llama.h"
@@ -2423,8 +2425,7 @@ int main(int argc, char ** argv) {
// own arguments required by this example
gpt_params params;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SERVER);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
return 1;
}


@@ -1,3 +1,4 @@
+#include "arg.h"
#include "common.h"
#include "llama.h"
@@ -18,8 +19,7 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is";
params.n_predict = 32;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
return 1;
}


@@ -1,11 +1,13 @@
+#include "arg.h"
#include "common.h"
+#include "sampling.h"
#include "llama.h"
+#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
#include <set>
-#include <random>
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
@@ -27,8 +29,7 @@ struct seq_draft {
int main(int argc, char ** argv) {
gpt_params params;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SPECULATIVE);
-if (!gpt_params_parse(argc, argv, params, options)) {
+if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
return 1;
}


@@ -1,3 +1,6 @@
+#include "arg.h"
+#include "common.h"
#include <string>
#include <vector>
#include <sstream>
@@ -6,18 +9,16 @@
#undef NDEBUG
#include <cassert>
-#include "common.h"
int main(void) {
gpt_params params;
printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
try {
-auto options = gpt_params_parser_init(params, (enum llama_example)ex);
+auto ctx_arg = gpt_params_parser_init(params, (enum llama_example)ex);
std::unordered_set<std::string> seen_args;
std::unordered_set<std::string> seen_env_vars;
-for (const auto & opt : options) {
+for (const auto & opt : ctx_arg.options) {
// check for args duplications
for (const auto & arg : opt.args) {
if (seen_args.find(arg) == seen_args.end()) {
@@ -52,40 +53,51 @@ int main(void) {
};
std::vector<std::string> argv;
-auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
printf("test-arg-parser: test invalid usage\n\n");
+// missing value
argv = {"binary_name", "-m"};
-assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+// wrong value (int)
argv = {"binary_name", "-ngl", "hello"};
-assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+// wrong value (enum)
argv = {"binary_name", "-sm", "hello"};
-assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+// non-existence arg in specific example (--draft cannot be used outside llama-speculative)
+argv = {"binary_name", "--draft", "123"};
+assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
printf("test-arg-parser: test valid usage\n\n");
argv = {"binary_name", "-m", "model_file.gguf"};
-assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.model == "model_file.gguf");
argv = {"binary_name", "-t", "1234"};
-assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.cpuparams.n_threads == 1234);
argv = {"binary_name", "--verbose"};
-assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.verbosity == 1);
argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
-assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.model == "abc.gguf");
assert(params.n_predict == 6789);
assert(params.n_batch == 9090);
+// --draft cannot be used outside llama-speculative
+argv = {"binary_name", "--draft", "123"};
+assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
+assert(params.n_draft == 123);
// skip this part on windows, because setenv is not supported
#ifdef _WIN32
printf("test-arg-parser: skip on windows build\n");
@@ -94,12 +106,12 @@ int main(void) {
setenv("LLAMA_ARG_THREADS", "blah", true);
argv = {"binary_name"};
-assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
setenv("LLAMA_ARG_THREADS", "1010", true);
argv = {"binary_name"};
-assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.model == "blah.gguf");
assert(params.cpuparams.n_threads == 1010);
@@ -109,7 +121,7 @@ int main(void) {
setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
setenv("LLAMA_ARG_THREADS", "1010", true);
argv = {"binary_name", "-m", "overwritten.gguf"};
-assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
assert(params.model == "overwritten.gguf");
assert(params.cpuparams.n_threads == 1010);
#endif // _WIN32