llama-run : improve progress bar (#10821)

Upstream: https://github.com/ggerganov/llama.cpp.git, commit 7909e8588d (parent 9177484f58)

Set the default progress-bar width to whatever the terminal width is. Also fixes a small bug around the default n_gpu_layers value.

Signed-off-by: Eric Curtin <ecurtin@redhat.com>
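The n_gpu_layers part of the fix boils down to a sentinel default: -1 means "the flag was not given, fall back to the library default", and the help text queries that default instead of hardcoding one. A minimal standalone sketch of the pattern; `lib_params` and `lib_default_params` are hypothetical stand-ins for llama.cpp's `llama_model_default_params()`:

```cpp
#include <cstdio>

// Hypothetical stand-in for llama_model_default_params(); the real default
// lives in the library, not in the CLI front end.
struct lib_params { int n_gpu_layers; };
static lib_params lib_default_params() { return { 0 }; }

int main() {
    int ngl_cli = -1;  // parsed from the command line; -1 = flag not given

    lib_params params   = lib_default_params();
    params.n_gpu_layers = ngl_cli >= 0 ? ngl_cli : params.n_gpu_layers;

    // The help text reports whatever the library really defaults to.
    printf("  -n, --ngl <value>\n      Number of GPU layers (default: %d)\n",
           lib_default_params().n_gpu_layers);
    return 0;
}
```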
--- a/README.md
+++ b/README.md
@@ -448,7 +448,7 @@ To learn more about model quantization, [read this documentation](examples/quant
 
 </details>
 
-[^3]: [https://github.com/containers/ramalama](RamaLama)
+[^3]: [RamaLama](https://github.com/containers/ramalama)
 
 ## [`llama-simple`](examples/simple)
 
--- a/examples/run/README.md
+++ b/examples/run/README.md
@@ -4,7 +4,7 @@ The purpose of this example is to demonstrate a minimal usage of llama.cpp for r
 
 ```bash
 llama-run granite-code
-...
+```
 
 ```bash
 llama-run -h
@@ -19,6 +19,8 @@ Options:
       Context size (default: 2048)
   -n, --ngl <value>
       Number of GPU layers (default: 0)
+  -v, --verbose, --log-verbose
+      Set verbosity level to infinity (i.e. log all messages, useful for debugging)
   -h, --help
       Show help message
 
@@ -42,6 +44,6 @@ Examples:
   llama-run https://example.com/some-file1.gguf
   llama-run some-file2.gguf
   llama-run file://some-file3.gguf
-  llama-run --ngl 99 some-file4.gguf
-  llama-run --ngl 99 some-file5.gguf Hello World
-...
+  llama-run --ngl 999 some-file4.gguf
+  llama-run --ngl 999 some-file5.gguf Hello World
+```
--- a/examples/run/run.cpp
+++ b/examples/run/run.cpp
@@ -1,6 +1,8 @@
 #if defined(_WIN32)
 #    include <windows.h>
 #else
+#    include <sys/file.h>
+#    include <sys/ioctl.h>
 #    include <unistd.h>
 #endif
 
@@ -8,6 +10,7 @@
 #    include <curl/curl.h>
 #endif
 
+#include <climits>
 #include <cstdarg>
 #include <cstdio>
 #include <cstring>
@@ -21,15 +24,37 @@
 #include "json.hpp"
 #include "llama-cpp.h"
 
-#define printe(...)                   \
-    do {                              \
-        fprintf(stderr, __VA_ARGS__); \
-    } while (0)
+GGML_ATTRIBUTE_FORMAT(1, 2)
+static std::string fmt(const char * fmt, ...) {
+    va_list ap;
+    va_list ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    const int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX);  // NOLINT
+    std::string buf;
+    buf.resize(size);
+    const int size2 = vsnprintf(const_cast<char *>(buf.data()), buf.size() + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+
+    return buf;
+}
+
+GGML_ATTRIBUTE_FORMAT(1, 2)
+static int printe(const char * fmt, ...) {
+    va_list args;
+    va_start(args, fmt);
+    const int ret = vfprintf(stderr, fmt, args);
+    va_end(args);
+
+    return ret;
+}
 
 class Opt {
   public:
     int init(int argc, const char ** argv) {
-        construct_help_str_();
         // Parse arguments
         if (parse(argc, argv)) {
            printe("Error: Failed to parse arguments.\n");
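The new `fmt()` helper above is the classic two-pass vsnprintf idiom: the first call measures, the second writes into a correctly sized buffer, and `va_copy` is required because the first pass consumes the `va_list`. A self-contained sketch of the same idiom (renamed `format_string` to make clear it is an illustration, not the patch code):

```cpp
#include <cstdarg>
#include <cstdio>
#include <string>

static std::string format_string(const char * f, ...) {
    va_list ap;
    va_list ap2;
    va_start(ap, f);
    va_copy(ap2, ap);                               // ap is consumed by the first pass
    const int size = vsnprintf(nullptr, 0, f, ap);  // pass 1: compute the length
    std::string buf(size + 1, '\0');                // room for the terminating NUL
    vsnprintf(&buf[0], size + 1, f, ap2);           // pass 2: actually format
    buf.resize(size);                               // drop the NUL from the std::string
    va_end(ap2);
    va_end(ap);
    return buf;
}

int main() {
    printf("%s\n", format_string("%3d%% |%s|", 42, "####").c_str());  // " 42% |####|"
    return 0;
}
```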
@@ -48,14 +73,64 @@ class Opt {
 
     std::string model_;
     std::string user_;
-    int context_size_ = 2048, ngl_ = -1;
+    int  context_size_ = -1, ngl_ = -1;
+    bool verbose_      = false;
 
   private:
-    std::string help_str_;
     bool help_ = false;
 
-    void construct_help_str_() {
-        help_str_ =
+    bool parse_flag(const char ** argv, int i, const char * short_opt, const char * long_opt) {
+        return strcmp(argv[i], short_opt) == 0 || strcmp(argv[i], long_opt) == 0;
+    }
+
+    int handle_option_with_value(int argc, const char ** argv, int & i, int & option_value) {
+        if (i + 1 >= argc) {
+            return 1;
+        }
+
+        option_value = std::atoi(argv[++i]);
+        return 0;
+    }
+
+    int parse(int argc, const char ** argv) {
+        bool options_parsing = true;
+        for (int i = 1, positional_args_i = 0; i < argc; ++i) {
+            if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
+                if (handle_option_with_value(argc, argv, i, context_size_) == 1) {
+                    return 1;
+                }
+            } else if (options_parsing && (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0)) {
+                if (handle_option_with_value(argc, argv, i, ngl_) == 1) {
+                    return 1;
+                }
+            } else if (options_parsing &&
+                       (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
+                verbose_ = true;
+            } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
+                help_ = true;
+                return 0;
+            } else if (options_parsing && strcmp(argv[i], "--") == 0) {
+                options_parsing = false;
+            } else if (positional_args_i == 0) {
+                if (!argv[i][0] || argv[i][0] == '-') {
+                    return 1;
+                }
+
+                ++positional_args_i;
+                model_ = argv[i];
+            } else if (positional_args_i == 1) {
+                ++positional_args_i;
+                user_ = argv[i];
+            } else {
+                user_ += " " + std::string(argv[i]);
+            }
+        }
+
+        return 0;
+    }
+
+    void help() const {
+        printf(
             "Description:\n"
             "  Runs a llm\n"
             "\n"
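To see what the reworked parser buys: options are recognized only until a literal "--", the first positional argument is the model, and every later argument is concatenated into the prompt. A trimmed-down, standalone sketch of that control flow (illustrative only; the real parse() above also handles -n, -v, and -h):

```cpp
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>

int main() {
    const char * argv[] = { "llama-run", "-c", "4096", "--", "some-file2.gguf", "Hello", "World" };
    const int    argc   = sizeof(argv) / sizeof(argv[0]);

    bool        options_parsing = true;
    int         context_size    = -1;
    std::string model, user;
    for (int i = 1, positional = 0; i < argc; ++i) {
        if (options_parsing && strcmp(argv[i], "-c") == 0) {
            context_size = std::atoi(argv[++i]);
        } else if (options_parsing && strcmp(argv[i], "--") == 0) {
            options_parsing = false;  // everything from here on is positional
        } else if (positional++ == 0) {
            model = argv[i];          // first positional: the model
        } else {
            user += (user.empty() ? "" : " ") + std::string(argv[i]);
        }
    }

    std::cout << context_size << " " << model << " '" << user << "'\n";
    // prints: 4096 some-file2.gguf 'Hello World'
}
```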
@@ -64,15 +139,11 @@ class Opt {
             "\n"
             "Options:\n"
             "  -c, --context-size <value>\n"
-            "      Context size (default: " +
-            std::to_string(context_size_);
-        help_str_ +=
-            ")\n"
+            "      Context size (default: %d)\n"
             "  -n, --ngl <value>\n"
-            "      Number of GPU layers (default: " +
-            std::to_string(ngl_);
-        help_str_ +=
-            ")\n"
+            "      Number of GPU layers (default: %d)\n"
+            "  -v, --verbose, --log-verbose\n"
+            "      Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
             "  -h, --help\n"
             "      Show help message\n"
             "\n"
@@ -92,47 +163,15 @@ class Opt {
             "  llama-run ollama://granite-code\n"
             "  llama-run ollama://smollm:135m\n"
             "  llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf\n"
-            "  llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf\n"
+            "  llama-run "
+            "huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf\n"
             "  llama-run https://example.com/some-file1.gguf\n"
             "  llama-run some-file2.gguf\n"
             "  llama-run file://some-file3.gguf\n"
-            "  llama-run --ngl 99 some-file4.gguf\n"
-            "  llama-run --ngl 99 some-file5.gguf Hello World\n";
+            "  llama-run --ngl 999 some-file4.gguf\n"
+            "  llama-run --ngl 999 some-file5.gguf Hello World\n",
+            llama_context_default_params().n_batch, llama_model_default_params().n_gpu_layers);
     }
 
-    int parse(int argc, const char ** argv) {
-        int positional_args_i = 0;
-        for (int i = 1; i < argc; ++i) {
-            if (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0) {
-                if (i + 1 >= argc) {
-                    return 1;
-                }
-
-                context_size_ = std::atoi(argv[++i]);
-            } else if (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0) {
-                if (i + 1 >= argc) {
-                    return 1;
-                }
-
-                ngl_ = std::atoi(argv[++i]);
-            } else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
-                help_ = true;
-                return 0;
-            } else if (!positional_args_i) {
-                ++positional_args_i;
-                model_ = argv[i];
-            } else if (positional_args_i == 1) {
-                ++positional_args_i;
-                user_ = argv[i];
-            } else {
-                user_ += " " + std::string(argv[i]);
-            }
-        }
-
-        return model_.empty();  // model_ is the only required value
-    }
-
-    void help() const { printf("%s", help_str_.c_str()); }
 };
 
 struct progress_data {
@@ -141,18 +180,85 @@ struct progress_data {
     bool printed = false;
 };
 
-struct FileDeleter {
-    void operator()(FILE * file) const {
+static int get_terminal_width() {
+#if defined(_WIN32)
+    CONSOLE_SCREEN_BUFFER_INFO csbi;
+    GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi);
+    return csbi.srWindow.Right - csbi.srWindow.Left + 1;
+#else
+    struct winsize w;
+    ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
+    return w.ws_col;
+#endif
+}
+
+#ifdef LLAMA_USE_CURL
+class File {
+  public:
+    FILE * file = nullptr;
+
+    FILE * open(const std::string & filename, const char * mode) {
+        file = fopen(filename.c_str(), mode);
+
+        return file;
+    }
+
+    int lock() {
+        if (file) {
+#    ifdef _WIN32
+            fd    = _fileno(file);
+            hFile = (HANDLE) _get_osfhandle(fd);
+            if (hFile == INVALID_HANDLE_VALUE) {
+                fd = -1;
+
+                return 1;
+            }
+
+            OVERLAPPED overlapped = { 0 };
+            if (!LockFileEx(hFile, LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 0, MAXDWORD, MAXDWORD,
+                            &overlapped)) {
+                fd = -1;
+
+                return 1;
+            }
+#    else
+            fd = fileno(file);
+            if (flock(fd, LOCK_EX | LOCK_NB) != 0) {
+                fd = -1;
+
+                return 1;
+            }
+#    endif
+        }
+
+        return 0;
+    }
+
+    ~File() {
+        if (fd >= 0) {
+#    ifdef _WIN32
+            if (hFile != INVALID_HANDLE_VALUE) {
+                OVERLAPPED overlapped = { 0 };
+                UnlockFileEx(hFile, 0, MAXDWORD, MAXDWORD, &overlapped);
+            }
+#    else
+            flock(fd, LOCK_UN);
+#    endif
+        }
+
         if (file) {
             fclose(file);
         }
     }
+
+  private:
+    int fd = -1;
+#    ifdef _WIN32
+    HANDLE hFile;
+#    endif
 };
 
-typedef std::unique_ptr<FILE, FileDeleter> FILE_ptr;
-
-#ifdef LLAMA_USE_CURL
-class CurlWrapper {
+class HttpClient {
   public:
     int init(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
              const bool progress, std::string * response_str = nullptr) {
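The `File` class takes an exclusive advisory lock on the `.partial` download so that two concurrent `llama-run` processes cannot append to the same file. On the POSIX side this rests on `flock(LOCK_EX | LOCK_NB)` failing immediately when another open file description already holds the lock; a Linux-only demonstration of that behaviour (the file name is a placeholder):

```cpp
#include <cstdio>
#include <fcntl.h>
#include <sys/file.h>
#include <unistd.h>

int main() {
    // Two independent open file descriptions of the same file, as two
    // concurrent downloader processes would have.
    const int fd1 = open("demo.partial", O_CREAT | O_RDWR, 0644);
    const int fd2 = open("demo.partial", O_RDWR);

    printf("first lock:  %d\n", flock(fd1, LOCK_EX | LOCK_NB));  // 0: acquired
    printf("second lock: %d\n", flock(fd2, LOCK_EX | LOCK_NB));  // -1: EWOULDBLOCK

    flock(fd1, LOCK_UN);
    close(fd1);
    close(fd2);
    unlink("demo.partial");
    return 0;
}
```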
@@ -163,10 +269,20 @@ class CurlWrapper {
         }
 
         progress_data data;
-        FILE_ptr      out;
+        File          out;
         if (!output_file.empty()) {
             output_file_partial = output_file + ".partial";
-            out.reset(fopen(output_file_partial.c_str(), "ab"));
+            if (!out.open(output_file_partial, "ab")) {
+                printe("Failed to open file\n");
+
+                return 1;
+            }
+
+            if (out.lock()) {
+                printe("Failed to exclusively lock file\n");
+
+                return 1;
+            }
         }
 
         set_write_options(response_str, out);
@@ -181,7 +297,7 @@ class CurlWrapper {
         return 0;
     }
 
-    ~CurlWrapper() {
+    ~HttpClient() {
         if (chunk) {
             curl_slist_free_all(chunk);
         }
@@ -195,13 +311,13 @@ class CurlWrapper {
     CURL *              curl  = nullptr;
     struct curl_slist * chunk = nullptr;
 
-    void set_write_options(std::string * response_str, const FILE_ptr & out) {
+    void set_write_options(std::string * response_str, const File & out) {
         if (response_str) {
             curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, capture_data);
             curl_easy_setopt(curl, CURLOPT_WRITEDATA, response_str);
         } else {
             curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data);
-            curl_easy_setopt(curl, CURLOPT_WRITEDATA, out.get());
+            curl_easy_setopt(curl, CURLOPT_WRITEDATA, out.file);
         }
     }
 
@@ -219,7 +335,7 @@ class CurlWrapper {
         if (progress) {
             curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
             curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &data);
-            curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_callback);
+            curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, update_progress);
         }
     }
 
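For reference, the progress plumbing configured above needs three pieces: clearing CURLOPT_NOPROGRESS (it defaults to on, which silences callbacks), a user-data pointer, and an xferinfo callback that receives totals as curl_off_t. A minimal self-contained sketch, with a placeholder URL and no error handling:

```cpp
#include <curl/curl.h>
#include <cstdio>

static int on_progress(void *, curl_off_t total, curl_off_t now, curl_off_t, curl_off_t) {
    if (total > 0) {
        fprintf(stderr, "\r%3d%%", (int) ((now * 100) / total));
    }
    return 0;  // returning non-zero would abort the transfer
}

int main() {
    curl_global_init(CURL_GLOBAL_DEFAULT);
    CURL * curl = curl_easy_init();
    curl_easy_setopt(curl, CURLOPT_URL, "https://example.com/some-file1.gguf");
    curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);  // 1L (the default) disables callbacks
    curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, on_progress);
    curl_easy_perform(curl);
    fprintf(stderr, "\n");
    curl_easy_cleanup(curl);
    curl_global_cleanup();
    return 0;
}
```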
@@ -255,17 +371,13 @@ class CurlWrapper {
         int mins = (static_cast<int>(seconds) % 3600) / 60;
         int secs = static_cast<int>(seconds) % 60;
 
-        std::ostringstream out;
         if (hrs > 0) {
-            out << hrs << "h " << std::setw(2) << std::setfill('0') << mins << "m " << std::setw(2) << std::setfill('0')
-                << secs << "s";
+            return fmt("%dh %02dm %02ds", hrs, mins, secs);
         } else if (mins > 0) {
-            out << mins << "m " << std::setw(2) << std::setfill('0') << secs << "s";
+            return fmt("%dm %02ds", mins, secs);
         } else {
-            out << secs << "s";
+            return fmt("%ds", secs);
         }
-
-        return out.str();
     }
 
     static std::string human_readable_size(curl_off_t size) {
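A worked example of the rewritten time formatting: 3725 seconds splits as below, and the %02d widths zero-pad minutes and seconds so the field width stays stable while the ETA counts down:

```cpp
#include <cstdio>

int main() {
    const double seconds = 3725;
    const int hrs  = (int) seconds / 3600;         // 1
    const int mins = ((int) seconds % 3600) / 60;  // 2
    const int secs = (int) seconds % 60;           // 5
    printf("%dh %02dm %02ds\n", hrs, mins, secs);  // "1h 02m 05s"
    return 0;
}
```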
@@ -279,12 +391,10 @@ class CurlWrapper {
             }
         }
 
-        std::ostringstream out;
-        out << std::fixed << std::setprecision(2) << dbl_size << " " << suffix[i];
-        return out.str();
+        return fmt("%.2f %s", dbl_size, suffix[i]);
     }
 
-    static int progress_callback(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
+    static int update_progress(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
                                  curl_off_t) {
         progress_data * data = static_cast<progress_data *>(ptr);
         if (total_to_download <= 0) {
@@ -293,27 +403,68 @@ class CurlWrapper {
 
         total_to_download += data->file_size;
         const curl_off_t now_downloaded_plus_file_size = now_downloaded + data->file_size;
-        const curl_off_t percentage = (now_downloaded_plus_file_size * 100) / total_to_download;
-        const curl_off_t pos        = (percentage / 5);
-        std::string      progress_bar;
-        for (int i = 0; i < 20; ++i) {
-            progress_bar.append((i < pos) ? "█" : " ");
-        }
-
-        // Calculate download speed and estimated time to completion
-        const auto                          now             = std::chrono::steady_clock::now();
-        const std::chrono::duration<double> elapsed_seconds = now - data->start_time;
-        const double                        speed           = now_downloaded / elapsed_seconds.count();
-        const double                        estimated_time  = (total_to_download - now_downloaded) / speed;
-        printe("\r%ld%% |%s| %s/%s %.2f MB/s %s ", percentage, progress_bar.c_str(),
-               human_readable_size(now_downloaded).c_str(), human_readable_size(total_to_download).c_str(),
-               speed / (1024 * 1024), human_readable_time(estimated_time).c_str());
-        fflush(stderr);
+        const curl_off_t percentage      = calculate_percentage(now_downloaded_plus_file_size, total_to_download);
+        std::string      progress_prefix = generate_progress_prefix(percentage);
+        const double     speed           = calculate_speed(now_downloaded, data->start_time);
+        const double     tim             = (total_to_download - now_downloaded) / speed;
+        std::string      progress_suffix =
+            generate_progress_suffix(now_downloaded_plus_file_size, total_to_download, speed, tim);
+
+        int         progress_bar_width = calculate_progress_bar_width(progress_prefix, progress_suffix);
+        std::string progress_bar;
+        generate_progress_bar(progress_bar_width, percentage, progress_bar);
+
+        print_progress(progress_prefix, progress_bar, progress_suffix);
         data->printed = true;
 
         return 0;
     }
 
+    static curl_off_t calculate_percentage(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download) {
+        return (now_downloaded_plus_file_size * 100) / total_to_download;
+    }
+
+    static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", percentage); }
+
+    static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) {
+        const auto                          now             = std::chrono::steady_clock::now();
+        const std::chrono::duration<double> elapsed_seconds = now - start_time;
+
+        return now_downloaded / elapsed_seconds.count();
+    }
+
+    static std::string generate_progress_suffix(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download,
+                                                double speed, double estimated_time) {
+        const int width = 10;
+        return fmt("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(), width,
+                   human_readable_size(total_to_download).c_str(), width, human_readable_size(speed).c_str(), width,
+                   human_readable_time(estimated_time).c_str());
+    }
+
+    static int calculate_progress_bar_width(const std::string & progress_prefix, const std::string & progress_suffix) {
+        int progress_bar_width = get_terminal_width() - progress_prefix.size() - progress_suffix.size() - 3;
+        if (progress_bar_width < 1) {
+            progress_bar_width = 1;
+        }
+
+        return progress_bar_width;
+    }
+
+    static std::string generate_progress_bar(int progress_bar_width, curl_off_t percentage,
+                                             std::string & progress_bar) {
+        const curl_off_t pos = (percentage * progress_bar_width) / 100;
+        for (int i = 0; i < progress_bar_width; ++i) {
+            progress_bar.append((i < pos) ? "█" : " ");
+        }
+
+        return progress_bar;
+    }
+
+    static void print_progress(const std::string & progress_prefix, const std::string & progress_bar,
+                               const std::string & progress_suffix) {
+        printe("\r%*s\r%s%s| %s", get_terminal_width(), " ", progress_prefix.c_str(), progress_bar.c_str(),
+               progress_suffix.c_str());
+    }
+
     // Function to write data to a file
     static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) {
         FILE * out = static_cast<FILE *>(stream);
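To make the layout arithmetic concrete, assume an 80-column terminal at 42%: the prefix fmt("%3ld%% |", 42) renders " 42% |" (6 columns), the suffix is four fields padded to width 10 plus the "/" and "/s" separators (43 columns), so the bar gets 80 - 6 - 43 - 3 = 28 cells, of which 42 * 28 / 100 = 11 are filled. As a standalone check:

```cpp
#include <cstdio>
#include <string>

int main() {
    const int  terminal_width = 80;  // stands in for get_terminal_width()
    const long percentage     = 42;

    const std::string prefix     = " 42% |";                   // fmt("%3ld%% |", 42): 6 columns
    const int         suffix_len = 10 + 1 + 10 + 10 + 2 + 10;  // "%*s/%*s%*s/s%*s" at width 10: 43

    int bar_width = terminal_width - (int) prefix.size() - suffix_len - 3;  // 28
    if (bar_width < 1) {
        bar_width = 1;  // degrade gracefully on very narrow terminals
    }

    const long filled = (percentage * bar_width) / 100;  // 11 cells of "█"
    printf("bar_width=%d filled=%ld\n", bar_width, filled);
    return 0;
}
```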
@@ -357,8 +508,8 @@ class LlamaData {
 #ifdef LLAMA_USE_CURL
     int download(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
                  const bool progress, std::string * response_str = nullptr) {
-        CurlWrapper curl;
-        if (curl.init(url, headers, output_file, progress, response_str)) {
+        HttpClient http;
+        if (http.init(url, headers, output_file, progress, response_str)) {
             return 1;
         }
 
@@ -438,13 +589,17 @@ class LlamaData {
     }
 
     int resolve_model(std::string & model_) {
+        int ret = 0;
+        if (string_starts_with(model_, "file://") || std::filesystem::exists(model_)) {
+            remove_proto(model_);
+
+            return ret;
+        }
+
         const std::string bn = basename(model_);
         const std::vector<std::string> headers = { "--header",
                                                    "Accept: application/vnd.docker.distribution.manifest.v2+json" };
-        int ret = 0;
-        if (string_starts_with(model_, "file://") || std::filesystem::exists(bn)) {
-            remove_proto(model_);
-        } else if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) {
+        if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) {
             remove_proto(model_);
             ret = huggingface_dl(model_, headers, bn);
         } else if (string_starts_with(model_, "ollama://")) {
@@ -467,19 +622,23 @@ class LlamaData {
         llama_model_params model_params = llama_model_default_params();
         model_params.n_gpu_layers       = opt.ngl_ >= 0 ? opt.ngl_ : model_params.n_gpu_layers;
         resolve_model(opt.model_);
+        printe(
+            "\r%*s"
+            "\rLoading model",
+            get_terminal_width(), " ");
         llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), model_params));
         if (!model) {
             printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
         }
 
+        printe("\r%*s\r", static_cast<int>(sizeof("Loading model")), " ");
         return model;
     }
 
     // Initializes the context with the specified parameters
     llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
         llama_context_params ctx_params = llama_context_default_params();
-        ctx_params.n_ctx   = n_ctx;
-        ctx_params.n_batch = n_ctx;
+        ctx_params.n_ctx = ctx_params.n_batch = n_ctx >= 0 ? n_ctx : ctx_params.n_batch;
         llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
         if (!context) {
             printe("%s: error: failed to create the llama_context\n", __func__);
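The transient "Loading model" message above relies on a small carriage-return idiom: "\r%*s\r" prints width spaces over the current line and returns the cursor to column 0, so the next write starts on visually clean ground. Isolated, with a fixed width standing in for the terminal query:

```cpp
#include <cstdio>

int main() {
    const int width = 40;                    // stands in for get_terminal_width()
    fprintf(stderr, "\rLoading model");      // transient status, no newline
    // ... model loading would happen here ...
    fprintf(stderr, "\r%*s\r", width, " ");  // overwrite the line with spaces, rewind cursor
    fprintf(stderr, "done\n");
    return 0;
}
```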
@@ -609,16 +768,20 @@ static int read_user_input(std::string & user) {
 }
 
 // Function to generate a response based on the prompt
-static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response) {
+static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response,
+                             const bool stdout_a_terminal) {
     // Set response color
-    printf("\033[33m");
+    if (stdout_a_terminal) {
+        printf("\033[33m");
+    }
 
     if (generate(llama_data, prompt, response)) {
         printe("failed to generate response\n");
         return 1;
     }
 
     // End response with color reset and newline
-    printf("\n\033[0m");
+    printf("\n%s", stdout_a_terminal ? "\033[0m" : "");
     return 0;
 }
 
@@ -642,15 +805,37 @@ static int handle_user_input(std::string & user_input, const std::string & user_
     }
 
     printf(
-        "\r "
-        "\r\033[32m> \033[0m");
+        "\r%*s"
+        "\r\033[32m> \033[0m",
+        get_terminal_width(), " ");
     return read_user_input(user_input);  // Returns true if input ends the loop
 }
 
+static bool is_stdin_a_terminal() {
+#if defined(_WIN32)
+    HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
+    DWORD  mode;
+    return GetConsoleMode(hStdin, &mode);
+#else
+    return isatty(STDIN_FILENO);
+#endif
+}
+
+static bool is_stdout_a_terminal() {
+#if defined(_WIN32)
+    HANDLE hStdout = GetStdHandle(STD_OUTPUT_HANDLE);
+    DWORD  mode;
+    return GetConsoleMode(hStdout, &mode);
+#else
+    return isatty(STDOUT_FILENO);
+#endif
+}
+
 // Function to tokenize the prompt
 static int chat_loop(LlamaData & llama_data, const std::string & user_) {
     int prev_len = 0;
     llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
+    static const bool stdout_a_terminal = is_stdout_a_terminal();
     while (true) {
         // Get user input
         std::string user_input;
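The color changes above all hang off one test: emit ANSI escapes only when stdout is really a terminal, so piping llama-run's output yields plain text. A POSIX-only sketch of the gating (the real code adds a GetConsoleMode path for Windows):

```cpp
#include <cstdio>
#include <unistd.h>

int main() {
    const bool tty = isatty(STDOUT_FILENO);
    printf("%s", tty ? "\033[33m" : "");   // yellow, only on a terminal
    printf("some response text");
    printf("\n%s", tty ? "\033[0m" : "");  // reset, only on a terminal
    return 0;
}
```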
@@ -665,7 +850,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user_) {
 
         std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len);
         std::string response;
-        if (generate_response(llama_data, prompt, response)) {
+        if (generate_response(llama_data, prompt, response, stdout_a_terminal)) {
             return 1;
         }
 
@@ -682,22 +867,13 @@ static int chat_loop(LlamaData & llama_data, const std::string & user_) {
     return 0;
 }
 
-static void log_callback(const enum ggml_log_level level, const char * text, void *) {
-    if (level == GGML_LOG_LEVEL_ERROR) {
+static void log_callback(const enum ggml_log_level level, const char * text, void * p) {
+    const Opt * opt = static_cast<Opt *>(p);
+    if (opt->verbose_ || level == GGML_LOG_LEVEL_ERROR) {
         printe("%s", text);
     }
 }
 
-static bool is_stdin_a_terminal() {
-#if defined(_WIN32)
-    HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
-    DWORD  mode;
-    return GetConsoleMode(hStdin, &mode);
-#else
-    return isatty(STDIN_FILENO);
-#endif
-}
-
 static std::string read_pipe_data() {
     std::ostringstream result;
     result << std::cin.rdbuf();  // Read all data from std::cin
@@ -721,7 +897,7 @@ int main(int argc, const char ** argv) {
         opt.user_ += read_pipe_data();
     }
 
-    llama_log_set(log_callback, nullptr);
+    llama_log_set(log_callback, &opt);
     LlamaData llama_data;
     if (llama_data.init(opt)) {
         return 1;
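Finally, the logging change threads the Opt instance through llama_log_set's user pointer so that -v flips the filter from errors-only to everything. A sketch of that mechanism, with hypothetical stand-ins for the llama.cpp log types:

```cpp
#include <cstdio>

enum log_level { LOG_DEBUG, LOG_ERROR };  // stand-in for ggml_log_level
struct Opt { bool verbose_ = false; };

static void log_callback(log_level level, const char * text, void * p) {
    const Opt * opt = static_cast<Opt *>(p);
    if (opt->verbose_ || level == LOG_ERROR) {
        fprintf(stderr, "%s", text);
    }
}

int main() {
    Opt opt;
    log_callback(LOG_DEBUG, "dropped\n", &opt);  // suppressed: not verbose
    opt.verbose_ = true;
    log_callback(LOG_DEBUG, "shown\n", &opt);    // printed once -v is set
    log_callback(LOG_ERROR, "always\n", &opt);   // errors print either way
    return 0;
}
```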