llama : add ability to cancel model loading (#4462)

* llama : Add ability to cancel model load Updated llama_progress_callback so that if it returns false, the model loading is aborted. * llama : Add test for model load cancellation * Fix bool return in llama_model_load, remove std::ignore use * Update llama.cpp Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com> * Fail test if model file is missing * Revert "Fail test if model file is missing" This reverts commit 32ebd525bf. * Add test-model-load-cancel to Makefile * Revert "Revert "Fail test if model file is missing"" This reverts commit 2796953257. * Simplify .gitignore for tests, clang-tidy fixes * Label all ctest tests * ci : ctest uses -L main * Attempt at writing ctest_with_model * ci : get ci/run.sh working with test-model-load-cancel * ci : restrict .github/workflows/build.yml ctest to -L main * update requirements.txt * Disable test-model-load-cancel in make * Remove venv before creation * Restructure requirements.txt Top-level now imports the specific additional requirements for each python file. Using `pip install -r requirements.txt` will fail if versions become mismatched in the per-file requirements. * Make per-python-script requirements work alone This doesn't break the main requirements.txt. * Add comment * Add convert-persimmon-to-gguf.py to new requirements.txt scheme * Add check-requirements.sh script and GitHub workflow * Remove shellcheck installation step from workflow * Add nocleanup special arg * Fix merge see: https://github.com/ggerganov/llama.cpp/pull/4462#discussion_r1434593573 * reset to upstream/master * Redo changes for cancelling model load --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
2024-12-26 11:24:35 +00:00 · 2023-12-22 01:19:36 -05:00 · 2023-12-22 01:19:36 -05:00 · c7e9701f86
commit c7e9701f86
parent afefa319f1
2 changed files with 37 additions and 15 deletions
--- a/llama.cpp
+++ b/llama.cpp
@ -2372,7 +2372,8 @@ struct llama_model_loader {
        }
    }
-    void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
+    // Returns false if cancelled by progress_callback
    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
        size_t size_data = 0;
        for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
@ -2404,7 +2405,9 @@ struct llama_model_loader {
            GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
            if (progress_callback) {
-                progress_callback((float) size_done / size_data, progress_callback_user_data);
+                if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
                    return false;
                }
            }
            const size_t offs = file_offset(ggml_get_name(cur));
@ -2466,8 +2469,11 @@ struct llama_model_loader {
        }
        if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
+            // Even though the model is done loading, we still honor
            // cancellation since we need to free allocations.
            return progress_callback(1.0f, progress_callback_user_data);
        }
        return true;
    }
 };
@ -3044,7 +3050,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].text.c_str() );    }
 }
-static void llm_load_tensors(
+// Returns false if cancelled by progress_callback
 static bool llm_load_tensors(
        llama_model_loader & ml,
        llama_model & model,
        int n_gpu_layers,
@ -3722,16 +3729,20 @@ static void llm_load_tensors(
        model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
    }
-    ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL);
+    if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
        return false;
    }
    model.mapping = std::move(ml.mapping);
    // loading time will be recalculate after the first eval, so
    // we take page faults deferred by mmap() into consideration
    model.t_load_us = ggml_time_us() - model.t_start_us;
    return true;
 }
-static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
+// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
    try {
        llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
@ -3749,19 +3760,21 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con
        if (params.vocab_only) {
            LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
-            return true;
+            return 0;
        }
-        llm_load_tensors(
+        if (!llm_load_tensors(
            ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
            params.progress_callback, params.progress_callback_user_data
-        );
+        )) {
            return -2;
        }
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
-        return false;
+        return -1;
    }
-    return true;
+    return 0;
 }
 //
@ -9141,11 +9154,18 @@ struct llama_model * llama_load_model_from_file(
                    LLAMA_LOG_INFO("\n");
                }
            }
            return true;
        };
    }
-    if (!llama_model_load(path_model, *model, params)) {
+    int status = llama_model_load(path_model, *model, params);
    GGML_ASSERT(status <= 0);
    if (status < 0) {
        if (status == -1) {
            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
        } else if (status == -2) {
            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
        }
        delete model;
        return nullptr;
    }
--- a/llama.h
+++ b/llama.h
@ -127,7 +127,7 @@ extern "C" {
        bool sorted;
    } llama_token_data_array;
-    typedef void (*llama_progress_callback)(float progress, void *ctx);
+    typedef bool (*llama_progress_callback)(float progress, void *ctx);
    // Input data for llama_decode
    // A llama_batch object can contain input about one or many sequences
@ -180,7 +180,9 @@ extern "C" {
        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
-        // called with a progress value between 0 and 1, pass NULL to disable
+        // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
        // If the provided progress_callback returns true, model loading continues.
        // If it returns false, model loading is immediately aborted.
        llama_progress_callback progress_callback;
        // context pointer passed to the progress callback