mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 10:54:36 +00:00
llama : quantize up to 31% faster on Linux and Windows with mmap (#3206)
* llama : enable mmap in quantize on Linux -> 31% faster * also enable mmap on Windows --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
0a4a4a0982
commit
2777a84be4
21
llama.cpp
21
llama.cpp
@@ -6027,7 +6027,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||||||
nthread = std::thread::hardware_concurrency();
|
nthread = std::thread::hardware_concurrency();
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_model_loader ml(fname_inp, /*use_mmap*/ false);
|
// mmap consistently increases speed on Linux, and also increases speed on Windows with
|
||||||
|
// hot cache. It may cause a slowdown on macOS, possibly related to free memory.
|
||||||
|
#if defined(__linux__) || defined(_WIN32)
|
||||||
|
constexpr bool use_mmap = true;
|
||||||
|
#else
|
||||||
|
constexpr bool use_mmap = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
llama_model_loader ml(fname_inp, use_mmap);
|
||||||
|
if (ml.use_mmap) {
|
||||||
|
ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
|
||||||
|
}
|
||||||
|
|
||||||
llama_model model;
|
llama_model model;
|
||||||
llm_load_arch(ml, model);
|
llm_load_arch(ml, model);
|
||||||
@@ -6105,10 +6116,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|||||||
|
|
||||||
const std::string name = ggml_get_name(tensor);
|
const std::string name = ggml_get_name(tensor);
|
||||||
|
|
||||||
if (read_data.size() < ggml_nbytes(tensor)) {
|
if (!ml.use_mmap) {
|
||||||
read_data.resize(ggml_nbytes(tensor));
|
if (read_data.size() < ggml_nbytes(tensor)) {
|
||||||
|
read_data.resize(ggml_nbytes(tensor));
|
||||||
|
}
|
||||||
|
tensor->data = read_data.data();
|
||||||
}
|
}
|
||||||
tensor->data = read_data.data();
|
|
||||||
ml.load_data_for(tensor);
|
ml.load_data_for(tensor);
|
||||||
|
|
||||||
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
|
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
|
||||||
|
Loading…
Reference in New Issue
Block a user