llama : add llama_init_backend() API (close #1527)

2024-12-24 10:24:35 +00:00 · 2023-05-20 11:06:11 +03:00 · 2023-05-20 11:06:11 +03:00 · ec2e10c444
commit ec2e10c444
parent d2c59b8ba4
7 changed files with 48 additions and 29 deletions
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@ -1,6 +1,7 @@
-#include <locale.h>
 #include "ggml.h"
 #include "build-info.h"
+
+#include <locale.h>
 #include <assert.h>
 #include <math.h>
 #include <cstring>
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -31,6 +31,8 @@ int main(int argc, char ** argv) {
        params.prompt = gpt_random_prompt(rng);
    }

+    llama_init_backend();
+
    llama_context * ctx;

    // load the model
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -96,8 +96,7 @@ int main(int argc, char ** argv) {
        params.prompt = gpt_random_prompt(rng);
    }

-//    params.prompt = R"(// this function checks if the number n is prime
-//bool is_prime(int n) {)";
+    llama_init_backend();

    llama_context * ctx;
    g_ctx = &ctx;
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -143,6 +143,8 @@ int main(int argc, char ** argv) {
        params.prompt = gpt_random_prompt(rng);
    }

+    llama_init_backend();
+
    llama_context * ctx;

    // load the model and apply lora adapter, if any
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -1,7 +1,7 @@
-#include "ggml.h"
-#include "llama.h"
 #include "build-info.h"

+#include "llama.h"
+
 #include <cstdio>
 #include <map>
 #include <string>
@ -42,8 +42,6 @@ bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::st
 //  ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
 //
 int main(int argc, char ** argv) {
-    ggml_time_init();
-
    if (argc < 3) {
        fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
        for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
@ -52,12 +50,7 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
+    llama_init_backend();

    // parse command line arguments
    const std::string fname_inp = argv[1];
@ -116,25 +109,25 @@ int main(int argc, char ** argv) {
    }
    fprintf(stderr, "\n");

-    const int64_t t_main_start_us = ggml_time_us();
+    const int64_t t_main_start_us = llama_time_us();

    int64_t t_quantize_us = 0;

    // load the model
    {
-        const int64_t t_start_us = ggml_time_us();
+        const int64_t t_start_us = llama_time_us();

        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
            return 1;
        }

-        t_quantize_us = ggml_time_us() - t_start_us;
+        t_quantize_us = llama_time_us() - t_start_us;
    }

    // report timing
    {
-        const int64_t t_main_end_us = ggml_time_us();
+        const int64_t t_main_end_us = llama_time_us();

        printf("\n");
        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
--- a/llama.cpp
+++ b/llama.cpp
@ -839,6 +839,21 @@ bool llama_mlock_supported() {
    return llama_mlock::SUPPORTED;
 }

+void llama_init_backend() {
+    ggml_time_init();
+
+    // create and immediately free a temporary ggml context - needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+}
+
+int64_t llama_time_us() {
+    return ggml_time_us();
+}
+
 //
 // model loading
 //
--- a/llama.h
+++ b/llama.h
@ -79,7 +79,7 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
@ -90,6 +90,13 @@ extern "C" {
    LLAMA_API bool llama_mmap_supported();
    LLAMA_API bool llama_mlock_supported();

+    // TODO: this is not a great API - it is very likely to change
+    // Initialize the llama + ggml backend
+    // Call once at the start of the program
+    LLAMA_API void llama_init_backend();
+
+    LLAMA_API int64_t llama_time_us();
+
    // Various functions for loading a ggml llama model.
    // Allocate (almost) all memory needed for the model.
    // Return NULL on failure