imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
Georgi Gerganov 2024-06-06 16:30:58 +03:00 committed by GitHub
parent ad675e1c67
commit f83351f9a6
5 changed files with 213 additions and 215 deletions
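
By way of illustration (not part of the commit itself), an invocation using only the flags handled by the common parser after this change could look like the following; the model and file names are placeholders:

```sh
# hypothetical paths; every flag below is now parsed by gpt_params_find_arg
./imatrix -m model.gguf -f train-data.txt -o imatrix.dat \
    --output-frequency 10 --save-frequency 100 \
    --chunk 0 --process-output --no-ppl \
    --in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat
```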

View File

@@ -273,6 +273,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         }
     } catch (const std::invalid_argument & ex) {
         fprintf(stderr, "%s\n", ex.what());
+        params = params_org;
         return false;
     }
@@ -408,6 +409,20 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
+    if (arg == "--in-file") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        std::ifstream file(argv[i]);
+        if (!file) {
+            fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+            invalid_param = true;
+            return true;
+        }
+        params.in_files.push_back(argv[i]);
+        return true;
+    }
     if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
         if (++i >= argc) {
             invalid_param = true;
@@ -1081,7 +1096,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         return true;
     }
     if (arg == "-v" || arg == "--verbose") {
-        params.verbose = true;
+        params.verbosity = 1;
+        return true;
+    }
+    if (arg == "--verbosity") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.verbosity = std::stoi(argv[i]);
         return true;
     }
     if (arg == "--verbose-prompt") {
@@ -1537,6 +1560,46 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.i_pos = std::stoi(argv[i]);
         return true;
     }
+    if (arg == "-o" || arg == "--output" || arg == "--output-file") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.out_file = argv[i];
+        return true;
+    }
+    if (arg == "-ofreq" || arg == "--output-frequency") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_out_freq = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--save-frequency") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_save_freq = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--process-output") {
+        params.process_output = true;
+        return true;
+    }
+    if (arg == "--no-ppl") {
+        params.compute_ppl = false;
+        return true;
+    }
+    if (arg == "--chunk" || arg == "--from-chunk") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.i_chunk = std::stoi(argv[i]);
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
@@ -1612,6 +1675,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", "-h, --help, --usage", "print usage and exit" });
     options.push_back({ "*", " --version", "show version and build info" });
     options.push_back({ "*", "-v, --verbose", "print verbose information" });
+    options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity });
     options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
     options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
     options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
@@ -1637,6 +1701,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
     options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() });
     options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
+    options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" });
     options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
     options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" });
     options.push_back({ "*", " --no-escape", "do not process escape sequences" });
@@ -1804,6 +1869,14 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk });
     options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos });
 
+    options.push_back({ "imatrix" });
+    options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() });
+    options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq });
+    options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq });
+    options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" });
+    options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" });
+    options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk });
+
     options.push_back({ "bench" });
     options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
     options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" });

View File

@@ -56,43 +56,42 @@ struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
 
     int32_t n_threads = cpu_get_num_math();
     int32_t n_threads_draft = -1;
     int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
     int32_t n_threads_batch_draft = -1;
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 0; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_draft = 5; // number of tokens to draft during speculative decoding
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
     float p_split = 0.1f; // speculative decoding split probability
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
     int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
-    llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     int32_t n_beams = 0; // if non-zero then use beam search of given width.
     int32_t grp_attn_n = 1; // group-attention factor
     int32_t grp_attn_w = 512; // group-attention width
     int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
     float rope_freq_base = 0.0f; // RoPE base frequency
     float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
     float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
     float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
     float yarn_beta_fast = 32.0f; // YaRN low correction dim
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
     float defrag_thold = -1.0f; // KV cache defragmentation threshold
-    std::string rpc_servers = ""; // comma separated list of RPC servers
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
 
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
 
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
@@ -114,7 +113,9 @@ struct gpt_params {
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
     std::string logits_file = ""; // file for saving *all* logits
+    std::string rpc_servers = ""; // comma separated list of RPC servers
 
+    std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
@@ -124,23 +125,24 @@ struct gpt_params {
     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
 
+    int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end = -1; // layer range for control vector
 
     int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                                  // (which is more convenient to use for plotting)
     //
     bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
     size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
 
     bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
-    size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
+    size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
 
     bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
     size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
 
     bool kl_divergence = false; // compute KL divergence
 
     bool usage = false; // print usage
     bool use_color = false; // use color to distinguish generations and inputs
@@ -163,7 +165,6 @@ struct gpt_params {
     bool logits_all = false; // return logits for all tokens in the batch
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
-    bool verbose = false;
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
     bool infill = false; // use infill mode
@@ -180,10 +181,10 @@ struct gpt_params {
     std::vector<std::string> image; // path to image file(s)
 
     // server params
-    int32_t port = 8080;
-    int32_t timeout_read = 600;
-    int32_t timeout_write = timeout_read;
-    int32_t n_threads_http = -1;
+    int32_t port = 8080;                  // server listens on this network port
+    int32_t timeout_read = 600;           // http read timeout in seconds
+    int32_t timeout_write = timeout_read; // http write timeout in seconds
+    int32_t n_threads_http = -1;          // number of threads to use for http server (-1 = use n_threads)
 
     std::string hostname = "127.0.0.1";
     std::string public_path = "";
@@ -219,6 +220,16 @@ struct gpt_params {
     // passkey params
     int32_t n_junk = 250; // number of times to repeat the junk text
     int32_t i_pos = -1; // position of the passkey in the junk text
+
+    // imatrix params
+    std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
+
+    int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
+    int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
+    int32_t i_chunk = 0; // start processing from this chunk
+
+    bool process_output = false; // collect data for the output tensor
+    bool compute_ppl = true; // whether to compute perplexity
 };
 
 void gpt_params_handle_model_default(gpt_params & params);

View File

@@ -6,16 +6,19 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
 
 ## Usage
 
 ```
-./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>]
-          [-ofreq num_chunks] [-ow <0 or 1>] [other common params]
+./imatrix \
+    -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
+    [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
+    [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
 ```
 
 Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
 The parameters in square brackets are optional and have the following meaning:
 * `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
 * `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
-* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
-* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
+* `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
+* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
+* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
 
 For faster computation, make sure to use GPU offloading via the `-ngl` argument
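
As a usage sketch of the `--in-file` flow described above (file names are hypothetical): previously saved imatrix data is loaded into the collector first, the combined state is written out when more than one file is given, and the chunks from `-f` are then accumulated on top and saved to `-o`.

```sh
# resume from earlier runs and add more data (hypothetical file names)
./imatrix -m model.gguf -f more-text.txt \
    --in-file imatrix-part1.dat --in-file imatrix-part2.dat \
    -o imatrix-combined.dat
```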

View File

@@ -17,39 +17,37 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n %s \\\n"
+            " -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
+            " [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
+            " [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
+    LOG_TEE("\n");
+}
+
 struct Stats {
     std::vector<float> values;
     std::vector<int> counts;
     int ncall = 0;
 };
 
-struct StatParams {
-    std::string dataset;
-    std::string ofile = "imatrix.dat";
-    int n_output_frequency = 10;
-    int verbosity = 1;
-    int keep_every = 0;
-    bool collect_output_weight = false;
-};
-
 class IMatrixCollector {
 public:
     IMatrixCollector() = default;
-    void set_parameters(StatParams&& params) { m_params = std::move(params); }
+    void set_params(gpt_params params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
-    void save_imatrix() const;
-    bool load_imatrix(const char * file_name, bool add);
-    static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
+    void save_imatrix(int ncall = -1) const;
+    bool load_imatrix(const char * file_name);
 private:
     std::unordered_map<std::string, Stats> m_stats;
-    StatParams m_params;
+    gpt_params m_params;
     std::mutex m_mutex;
     int m_last_call = 0;
     std::vector<float> m_src1_data;
     std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
-    //
-    void save_imatrix(const char * file_name, const char * dataset) const;
-    void keep_imatrix(int ncall) const;
 };
 
 // remove any prefix and suffixes from the name
@@ -85,7 +83,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         if (t->op != GGML_OP_MUL_MAT) return false;
         // why are small batches ignored (<16 tokens)?
         if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
-        if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
+        if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
         return true;
     }
@@ -158,16 +156,16 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             }
             if (e.ncall > m_last_call) {
                 m_last_call = e.ncall;
-                if (m_last_call % m_params.n_output_frequency == 0) {
+                if (m_last_call % m_params.n_out_freq == 0) {
                     save_imatrix();
                 }
-                if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
-                    keep_imatrix(m_last_call);
+                if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+                    save_imatrix(m_last_call);
                 }
             }
         }
     } else {
-        auto& e = m_stats[wname];
+        auto & e = m_stats[wname];
         if (e.values.empty()) {
             e.values.resize(src1->ne[0], 0);
             e.counts.resize(src1->ne[0], 0);
@@ -189,11 +187,11 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         }
         if (e.ncall > m_last_call) {
             m_last_call = e.ncall;
-            if (m_last_call % m_params.n_output_frequency == 0) {
+            if (m_last_call % m_params.n_out_freq == 0) {
                 save_imatrix();
             }
-            if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
-                keep_imatrix(m_last_call);
+            if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
+                save_imatrix(m_last_call);
             }
         }
     }
@@ -201,19 +199,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     return true;
 }
 
-void IMatrixCollector::save_imatrix() const {
-    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
-}
-
-void IMatrixCollector::keep_imatrix(int ncall) const {
-    auto file_name = m_params.ofile;
-    if (file_name.empty()) file_name = "imatrix.dat";
-    file_name += ".at_";
-    file_name += std::to_string(ncall);
-    save_imatrix(file_name.c_str(), m_params.dataset.c_str());
-}
-
-void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
+void IMatrixCollector::save_imatrix(int ncall) const {
+    auto fname = m_params.out_file;
+    if (fname.empty()) {
+        fname = "imatrix.dat";
+    }
+
+    if (ncall > 0) {
+        fname += ".at_";
+        fname += std::to_string(ncall);
+    }
+
     std::ofstream out(fname, std::ios::binary);
     int n_entries = m_stats.size();
     out.write((const char *) &n_entries, sizeof(n_entries));
@@ -236,26 +232,28 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
     // Write the number of call the matrix was computed with
     out.write((const char *) &m_last_call, sizeof(m_last_call));
 
-    // Write the dataset name at the end of the file to later on specify it in quantize
-    int n_dataset = strlen(dataset);
-    out.write((const char *) &n_dataset, sizeof(n_dataset));
-    out.write(dataset, n_dataset);
+    // Write the input filename at the end of the file to later on specify it in quantize
+    {
+        int len = m_params.prompt_file.size();
+        out.write((const char *) &len, sizeof(len));
+        out.write(m_params.prompt_file.c_str(), len);
+    }
 
     if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
+        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
     }
 }
 
-bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map<std::string, Stats>& imatrix_data) {
-    std::ifstream in(imatrix_file, std::ios::binary);
+bool IMatrixCollector::load_imatrix(const char * fname) {
+    std::ifstream in(fname, std::ios::binary);
     if (!in) {
-        printf("%s: failed to open %s\n",__func__,imatrix_file);
+        printf("%s: failed to open %s\n",__func__, fname);
         return false;
     }
     int n_entries;
     in.read((char*)&n_entries, sizeof(n_entries));
     if (in.fail() || n_entries < 1) {
-        printf("%s: no data in file %s\n", __func__, imatrix_file);
+        printf("%s: no data in file %s\n", __func__, fname);
         return false;
     }
     for (int i = 0; i < n_entries; ++i) {
@@ -263,23 +261,22 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
         std::vector<char> name_as_vec(len+1);
         in.read((char *)name_as_vec.data(), len);
         if (in.fail()) {
-            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file);
+            printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
             return false;
         }
         name_as_vec[len] = 0;
         std::string name{name_as_vec.data()};
-        auto& e = imatrix_data[std::move(name)];
+        auto & e = m_stats[std::move(name)];
         int ncall;
         in.read((char*)&ncall, sizeof(ncall));
         int nval;
         in.read((char *)&nval, sizeof(nval));
         if (in.fail() || nval < 1) {
             printf("%s: failed reading number of values for entry %d\n",__func__,i);
-            imatrix_data = {};
+            m_stats = {};
             return false;
         }
-        // When re-called from load_imatrix() with add set, this will already be created.
         if (e.values.empty()) {
             e.values.resize(nval, 0);
             e.counts.resize(nval, 0);
@@ -289,7 +286,7 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
         in.read((char*)tmp.data(), nval*sizeof(float));
         if (in.fail()) {
             printf("%s: failed reading data for entry %d\n",__func__,i);
-            imatrix_data = {};
+            m_stats = {};
             return false;
         }
@@ -304,13 +301,6 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
     return true;
 }
 
-bool IMatrixCollector::load_imatrix(const char * file_name, bool add) {
-    if (!add) {
-        m_stats.clear();
-    }
-    return load_imatrix(file_name, m_stats);
-}
-
 static IMatrixCollector g_collector;
 
 static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@@ -324,7 +314,7 @@ struct results_log_softmax {
     float prob;
 };
 
-static std::vector<float> softmax(const std::vector<float>& logits) {
+static std::vector<float> softmax(const std::vector<float> & logits) {
     std::vector<float> probs(logits.size());
     float max_logit = logits[0];
     for (float v : logits) {
@@ -358,8 +348,7 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
 
 static void process_logits(
     int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
-    double & nll, double & nll2, float * logit_history, float * prob_history
-) {
+    double & nll, double & nll2, float * logit_history, float * prob_history) {
     std::mutex mutex;
     int counter = 0;
     auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
@@ -391,8 +380,7 @@ static void process_logits(
     }
 }
 
-static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
+static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
     GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
     const int n_ctx = llama_n_ctx(ctx);
@@ -405,13 +393,13 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
     auto tim2 = std::chrono::high_resolution_clock::now();
     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
 
-    if (from_chunk > 0) {
-        if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) {
-            fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk);
+    if (params.i_chunk > 0) {
+        if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
+            fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
             return false;
         }
-        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx);
-        tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx);
+        fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
+        tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
     }
 
     if (int(tokens.size()) < 2*n_ctx) {
@@ -424,7 +412,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
     std::vector<float> logit_history;
     std::vector<float> prob_history;
 
-    if (compute_ppl) {
+    if (params.compute_ppl) {
         logit_history.resize(tokens.size());
         prob_history.resize(tokens.size());
     }
@@ -446,7 +434,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
     const int num_batches = (n_ctx + n_batch - 1) / n_batch;
 
     std::vector<float> logits;
-    if (compute_ppl && num_batches > 1) {
+    if (params.compute_ppl && num_batches > 1) {
         logits.reserve((size_t)n_ctx * n_vocab);
     }
@@ -482,7 +470,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
             // restore the original token in case it was set to BOS
             tokens[batch_start] = token_org;
 
-            if (compute_ppl && num_batches > 1) {
+            if (params.compute_ppl && num_batches > 1) {
                 const auto * batch_logits = llama_get_logits(ctx);
                 logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
             }
@@ -501,7 +489,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
             fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
         }
 
-        if (compute_ppl) {
+        if (params.compute_ppl) {
             const int first = n_ctx/2;
             const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
             process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
@@ -516,7 +504,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
     }
     printf("\n");
 
-    if (compute_ppl) {
+    if (params.compute_ppl) {
         nll2 /= count;
         nll /= count;
         const double ppl = exp(nll);
@@ -533,109 +521,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
 }
 
 int main(int argc, char ** argv) {
-    StatParams sparams;
-    std::string prev_result_file;
-    std::string combine_files;
-    bool compute_ppl = true;
-    int from_chunk = 0;
-    std::vector<char*> args;
-    args.push_back(argv[0]);
-    int iarg = 1;
-    for (; iarg < argc-1; ++iarg) {
-        std::string arg{argv[iarg]};
-        if (arg == "-o" || arg == "--output-file") {
-            sparams.ofile = argv[++iarg];
-        }
-        else if (arg == "-ofreq" || arg == "--output-frequency") {
-            sparams.n_output_frequency = std::stoi(argv[++iarg]);
-        }
-        else if (arg == "-ow" || arg == "--output-weight") {
-            sparams.collect_output_weight = std::stoi(argv[++iarg]);
-        }
-        else if (arg == "--verbosity") {
-            sparams.verbosity = std::stoi(argv[++iarg]);
-        } else if (arg == "--no-ppl") {
-            compute_ppl = false;
-        } else if (arg == "--keep-imatrix") {
-            sparams.keep_every = std::stoi(argv[++iarg]);
-        } else if (arg == "--continue-from") {
-            prev_result_file = argv[++iarg];
-        } else if (arg == "--combine") {
-            combine_files = argv[++iarg];
-        }
-        else if (arg == "--from-chunk") {
-            from_chunk = std::stoi(argv[++iarg]);
-        } else {
-            args.push_back(argv[iarg]);
-        }
-    }
-    if (iarg < argc) {
-        std::string arg{argv[iarg]};
-        if (arg == "--no-ppl") {
-            compute_ppl = false;
-        } else {
-            args.push_back(argv[iarg]);
-        }
-    }
-
     gpt_params params;
-    params.n_batch = 512;
+
+    params.n_ctx = 512;
+    params.logits_all = true;
+    params.verbosity = 1;
+
     if (!gpt_params_parse(argc, argv, params)) {
-        gpt_params_print_usage(argc, argv, params);
+        print_usage(argc, argv, params);
         return 1;
     }
 
-    params.logits_all = true;
     params.n_batch = std::min(params.n_batch, params.n_ctx);
 
-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-
-    sparams.dataset = params.prompt_file;
-    g_collector.set_parameters(std::move(sparams));
-
-    if (!combine_files.empty()) {
-        std::vector<std::string> files;
-        size_t pos = 0;
-        while (true) {
-            auto new_pos = combine_files.find(',', pos);
-            if (new_pos != std::string::npos) {
-                files.emplace_back(combine_files.substr(pos, new_pos - pos));
-                pos = new_pos + 1;
-            } else {
-                files.emplace_back(combine_files.substr(pos));
-                break;
-            }
-        }
-        if (files.size() < 2) {
-            fprintf(stderr, "You must provide at least two comma separated files to use --combine\n");
+    g_collector.set_params(params);
+
+    for (const auto & in_file : params.in_files) {
+        printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
+        if (!g_collector.load_imatrix(in_file.c_str())) {
+            fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
             return 1;
         }
-        printf("Combining the following %d files\n", int(files.size()));
-        for (auto& file : files) {
-            printf(" %s\n", file.c_str());
-            if (!g_collector.load_imatrix(file.c_str(), true)) {
-                fprintf(stderr, "Failed to load %s\n", file.c_str());
-                return 1;
-            }
-        }
-        g_collector.save_imatrix();
-        return 0;
-    }
-
-    if (!prev_result_file.empty()) {
-        if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) {
-            fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str());
-            return 1;
-        }
+    }
+
+    if (params.in_files.size() > 1) {
+        printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
+        g_collector.save_imatrix();
     }
 
     llama_backend_init();
@@ -650,6 +561,7 @@ int main(int argc, char ** argv) {
     // init
     llama_model * model;
     llama_context * ctx;
+
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
     if (model == nullptr || ctx == nullptr) {
         fprintf(stderr, "%s : failed to init\n", __func__);
@@ -668,8 +580,7 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
     }
 
-    bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
-    if (!OK) {
+    if (!compute_imatrix(ctx, params)) {
         return 1;
     }

View File

@@ -2360,7 +2360,7 @@ int main(int argc, char ** argv) {
     // TODO: not great to use extern vars
     server_log_json = params.log_json;
-    server_verbose = params.verbose;
+    server_verbose = params.verbosity > 0;
 
     // struct that contains llama context and inference
     server_context ctx_server;