llama : re-order functions

2024-12-27 03:44:35 +00:00 · 2023-08-18 14:56:36 +03:00 · 2023-08-18 14:56:36 +03:00 · 660ca9bbca
commit 660ca9bbca
parent dea5be61d7
3 changed files with 206 additions and 205 deletions
--- a/llama.cpp
+++ b/llama.cpp
@ -880,7 +880,6 @@ struct llama_context {
    std::vector<uint8_t> work_buffer;

    // memory buffers used to evaluate the model
-    // TODO: move in llama_state
    llama_buffer buf_compute;

 #ifdef LLAMA_USE_ALLOCATOR
@ -2547,7 +2546,7 @@ private:
    std::map<std::string, std::pair<int, int> > rev_merge;
 };

-static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) {
+static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) {
    llama_tokenizer tokenizer(vocab);
    std::vector<llama_vocab::id> output;

@ -4399,6 +4398,30 @@ void llama_free(struct llama_context * ctx) {
    delete ctx;
 }

+int llama_n_vocab(const struct llama_context * ctx) { // number of entries in the id_to_token table of the model held by ctx
+    return ctx->model.vocab.id_to_token.size(); // size() result is implicitly converted to the int return type
+}
+
+int llama_n_ctx(const struct llama_context * ctx) { // returns the n_ctx hyperparameter of the model held by ctx
+    return ctx->model.hparams.n_ctx; // plain field read, no computation
+}
+
+int llama_n_embd(const struct llama_context * ctx) { // returns the n_embd hyperparameter of the model held by ctx
+    return ctx->model.hparams.n_embd; // plain field read, no computation
+}
+
+int llama_n_vocab_from_model(const struct llama_model * model) { // same value as llama_n_vocab, read directly from a model
+    return model->vocab.id_to_token.size(); // size() result is implicitly converted to the int return type
+}
+
+int llama_n_ctx_from_model(const struct llama_model * model) { // same value as llama_n_ctx, read directly from a model
+    return model->hparams.n_ctx; // plain field read, no computation
+}
+
+int llama_n_embd_from_model(const struct llama_model * model) { // same value as llama_n_embd, read directly from a model
+    return model->hparams.n_embd; // plain field read, no computation
+}
+
 int llama_model_quantize(
        const char * fname_inp,
        const char * fname_out,
@ -4876,114 +4899,20 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {
    return 0;
 }

-int llama_tokenize_with_model(
-    const struct llama_model * model,
-                  const char * text,
-                 llama_token * tokens,
-                         int   n_max_tokens,
-                        bool   add_bos) {
-    auto escape = llama_vocab_type(model->vocab) == "spm";
-    auto res = llama_tokenize(model->vocab, text, add_bos, escape);
-
-    if (n_max_tokens < (int) res.size()) {
-        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
-        return -((int) res.size());
+float * llama_get_logits(struct llama_context * ctx) { // gives the caller access to the logits stored in ctx
+    return ctx->logits.data(); // pointer into ctx->logits itself, not a copy
 }

-    for (size_t i = 0; i < res.size(); i++) {
-        tokens[i] = res[i];
+float * llama_get_embeddings(struct llama_context * ctx) { // gives the caller access to the embedding stored in ctx
+    return ctx->embedding.data(); // pointer into ctx->embedding itself, not a copy
 }

-    return res.size();
-}
-
-int llama_tokenize(
-        struct llama_context * ctx,
-                  const char * text,
-                 llama_token * tokens,
-                         int   n_max_tokens,
-                        bool   add_bos) {
-    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
-}
-
-std::vector<llama_token> llama_tokenize(
-        struct llama_context * ctx,
-           const std::string & text,
-                        bool   add_bos) {
-    int length = text.length() + add_bos;
-    std::vector<llama_token> result(length);
-    length = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
-    if (length < 0) {
-        result.resize(-length);
-        int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
-        assert(check == -length);
-        GGML_UNUSED(check);
-    } else {
-        result.resize(length);
-    }
-    return result;
-}
-
-int llama_tokenize_bpe(
-        struct llama_context * ctx,
-                  const char * text,
-                 llama_token * tokens,
-                         int   n_max_tokens,
-                        bool   add_bos) {
-    auto res = llama_tokenize(ctx->model.vocab, text, add_bos, false);
-
-    if (n_max_tokens < (int) res.size()) {
-        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
-        return -((int) res.size());
-    }
-
-    for (size_t i = 0; i < res.size(); i++) {
-        tokens[i] = res[i];
-    }
-
-    return res.size();
-}
-
-std::vector<llama_token> llama_tokenize_bpe(
-        struct llama_context * ctx,
-           const std::string & text,
-                        bool   add_bos) {
-    int length = text.length() + add_bos;
-    std::vector<llama_token> result(length);
-    length = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
-    if (length < 0) {
-        result.resize(-length);
-        int check = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
-        assert(check == -length);
-        GGML_UNUSED(check);
-    } else {
-        result.resize(length);
-    }
-    return result;
-}
-
-int llama_n_vocab_from_model(const struct llama_model * model) {
-    return model->vocab.id_to_token.size();
-}
-
-int llama_n_ctx_from_model(const struct llama_model * model) {
-    return model->hparams.n_ctx;
-}
-
-int llama_n_embd_from_model(const struct llama_model * model) {
-    return model->hparams.n_embd;
-}
-
-int llama_n_vocab(const struct llama_context * ctx) {
-    return ctx->model.vocab.id_to_token.size();
-}
-
-int llama_n_ctx(const struct llama_context * ctx) {
-    return ctx->model.hparams.n_ctx;
-}
-
-int llama_n_embd(const struct llama_context * ctx) {
-    return ctx->model.hparams.n_embd;
+int llama_get_vocab( // context-based wrapper around llama_get_vocab_from_model
+        const struct llama_context * ctx,
+        const char * * strings,
+        float  * scores,
+        int capacity) {
+    return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity); // output arrays, capacity and result are passed through unchanged
 }

 int llama_get_vocab_from_model(
@ -4999,20 +4928,70 @@ int llama_get_vocab_from_model(
    return n;
 }

-int llama_get_vocab(
-        const struct llama_context * ctx,
-        const char * * strings,
-        float  * scores,
-        int capacity) {
-    return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
+int llama_tokenize( // context-based wrapper around llama_tokenize_with_model
+        struct llama_context * ctx,
+                  const char * text,
+                 llama_token * tokens,
+                         int   n_max_tokens,
+                        bool   add_bos) {
+    return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos); // tokenizes with the vocab of the model held by ctx; result is forwarded as-is
 }

-float * llama_get_logits(struct llama_context * ctx) {
-    return ctx->logits.data();
+int llama_tokenize_bpe( // tokenizes text into the caller-provided tokens buffer, always with escape disabled
+        struct llama_context * ctx,
+                  const char * text,
+                 llama_token * tokens,
+                         int   n_max_tokens,
+                        bool   add_bos) {
+    auto res = llama_tokenize_internal(ctx->model.vocab, text, add_bos, false); // escape argument is fixed to false here
+
+    if (n_max_tokens < (int) res.size()) { // caller's buffer cannot hold every token
+        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+        return -((int) res.size()); // negative result: minus the number of tokens required; nothing is written to tokens
    }

-float * llama_get_embeddings(struct llama_context * ctx) {
-    return ctx->embedding.data();
+    for (size_t i = 0; i < res.size(); i++) { // copy the token ids into the output buffer
+        tokens[i] = res[i];
+    }
+
+    return res.size(); // number of tokens written
+}
+
+int llama_tokenize_with_model( // tokenizes text with the model's vocab into the caller-provided tokens buffer
+    const struct llama_model * model,
+                  const char * text,
+                 llama_token * tokens,
+                         int   n_max_tokens,
+                        bool   add_bos) {
+    auto escape = llama_vocab_type(model->vocab) == "spm"; // escape is enabled only when the vocab type is "spm"
+    auto res = llama_tokenize_internal(model->vocab, text, add_bos, escape);
+
+    if (n_max_tokens < (int) res.size()) { // caller's buffer cannot hold every token
+        LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
+        return -((int) res.size()); // negative result: minus the number of tokens required; nothing is written to tokens
+    }
+
+    for (size_t i = 0; i < res.size(); i++) { // copy the token ids into the output buffer
+        tokens[i] = res[i];
+    }
+
+    return res.size(); // number of tokens written
+}
+
+int llama_token_to_str(const struct llama_context * ctx, llama_token token, char * buf, int length) { // context-based wrapper around llama_token_to_str_with_model
+    return llama_token_to_str_with_model(&ctx->model, token, buf, length); // uses the model held by ctx; result is forwarded as-is
+}
+
+int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token, char * buf, int length) { // copies the stored text of token into buf without any further conversion
+    if (0 <= token && token < llama_n_vocab_from_model(&ctx->model)) { // only ids inside the vocabulary are looked up
+        std::string result = ctx->model.vocab.id_to_token[token].tok; // local copy of the token's stored text
+        if (length < (int) result.length()) { // buf is too small for the text
+            return -result.length(); // NOTE(review): negates an unsigned length before the conversion to int; llama_tokenize_bpe casts first with -((int) res.size()) - confirm the intended value is minus the required size
+        }
+        memcpy(buf, result.c_str(), result.length()); // copies exactly length() bytes, so no null terminator is written to buf
+        return result.length(); // number of bytes written
+    }
+    return 0; // out-of-range token id: nothing is written
 }

 // does not write null-terminator to str
@ -5049,50 +5028,6 @@ int llama_token_to_str_with_model(const struct llama_model * model, llama_token
    return 0;
 }

-int llama_token_to_str(const struct llama_context * ctx, llama_token token, char * buf, int length) {
-    return llama_token_to_str_with_model(&ctx->model, token, buf, length);
-}
-
-std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
-    std::vector<char> result(8, 0);
-    const int length = llama_token_to_str(ctx, token, result.data(), result.size());
-    if (length < 0) {
-        result.resize(-length);
-        int check = llama_token_to_str(ctx, token, result.data(), result.size());
-        GGML_ASSERT(check == -length);
-    } else {
-        result.resize(length);
-    }
-
-    return std::string(result.data(), result.size());
-}
-
-int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token, char * buf, int length) {
-    if (0 <= token && token < llama_n_vocab_from_model(&ctx->model)) {
-        std::string result = ctx->model.vocab.id_to_token[token].tok;
-        if (length < (int) result.length()) {
-            return -result.length();
-        }
-        memcpy(buf, result.c_str(), result.length());
-        return result.length();
-    }
-    return 0;
-}
-
-std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token) {
-    std::vector<char> result(8, 0);
-    const int length = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
-    if (length < 0) {
-        result.resize(-length);
-        const int check = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
-        GGML_ASSERT(check == -length);
-    } else {
-        result.resize(length);
-    }
-
-    return std::string(result.data(), result.size());
-}
-
 llama_token llama_token_bos(void) {
    return 1;
 }
@ -5165,6 +5100,73 @@ const char * llama_print_system_info(void) {
    return s.c_str();
 }

+
+std::vector<llama_token> llama_tokenize( // std::vector convenience overload on top of the buffer-based llama_tokenize
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos) {
+    // upper limit for the number of tokens
+    int n_tokens = text.length() + add_bos; // one slot per byte of text, plus one when add_bos is set
+    std::vector<llama_token> result(n_tokens);
+    n_tokens = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+    if (n_tokens < 0) { // buffer was too small: the negative value is minus the required token count
+        result.resize(-n_tokens); // grow to the required size and tokenize again
+        int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+        assert(check == -n_tokens); // the second pass must produce exactly the announced count
+        GGML_UNUSED(check); // check is referenced only by the assert, which compiles away under NDEBUG
+    } else {
+        result.resize(n_tokens); // shrink to the number of tokens actually written
+    }
+    return result;
+}
+
+std::vector<llama_token> llama_tokenize_bpe( // std::vector convenience overload on top of the buffer-based llama_tokenize_bpe
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos) {
+    int length = text.length() + add_bos; // initial buffer size: one slot per byte of text, plus one when add_bos is set
+    std::vector<llama_token> result(length);
+    length = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
+    if (length < 0) { // buffer was too small: the negative value is minus the required token count
+        result.resize(-length); // grow to the required size and tokenize again
+        int check = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
+        assert(check == -length); // the second pass must produce exactly the announced count
+        GGML_UNUSED(check); // check is referenced only by the assert, which compiles away under NDEBUG
+    } else {
+        result.resize(length); // shrink to the number of tokens actually written
+    }
+    return result;
+}
+
+std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) { // std::string convenience overload on top of the buffer-based llama_token_to_str
+    std::vector<char> result(8, 0); // first attempt uses an 8-byte zero-filled buffer
+    const int length = llama_token_to_str(ctx, token, result.data(), result.size());
+    if (length < 0) { // negative result is treated as minus the buffer size required
+        result.resize(-length); // grow to that size and convert again
+        int check = llama_token_to_str(ctx, token, result.data(), result.size());
+        GGML_ASSERT(check == -length); // the second pass must fill exactly the requested size
+    } else {
+        result.resize(length); // keep only the bytes that were written
+    }
+
+    return std::string(result.data(), result.size()); // explicit size is passed; the string is not built by scanning for a terminator
+}
+
+std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token) { // std::string convenience overload on top of the buffer-based llama_token_to_str_bpe
+    std::vector<char> result(8, 0); // first attempt uses an 8-byte zero-filled buffer
+    const int length = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
+    if (length < 0) { // negative result is treated as minus the buffer size required
+        result.resize(-length); // grow to that size and convert again
+        const int check = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
+        GGML_ASSERT(check == -length); // the second pass must fill exactly the requested size
+    } else {
+        result.resize(length); // keep only the bytes that were written
+    }
+
+    return std::string(result.data(), result.size()); // explicit size is passed because the buffer-based call copies no null terminator
+}
+
+
 // For internal test use
 const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
    return ctx->model.tensors_by_name;
--- a/llama.h
+++ b/llama.h
@ -199,20 +199,15 @@ extern "C" {
    LLAMA_API struct llama_context_params llama_context_default_params(void);
    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);

-    LLAMA_API int  llama_max_devices(void);
-    LLAMA_API bool llama_mmap_supported(void);
-    LLAMA_API bool llama_mlock_supported(void);
-
    // TODO: not great API - very likely to change
    // Initialize the llama + ggml backend
    // If numa is true, use NUMA optimizations
    // Call once at the start of the program
    LLAMA_API void llama_backend_init(bool numa);
+
    // Call once at the end of the program - currently only used for MPI
    LLAMA_API void llama_backend_free(void);

-    LLAMA_API int64_t llama_time_us(void);
-
    LLAMA_API struct llama_model * llama_load_model_from_file(
                             const char * path_model,
            struct llama_context_params   params);
@ -223,10 +218,23 @@ extern "C" {
                     struct llama_model * model,
            struct llama_context_params   params);

-
    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);

+    LLAMA_API int64_t llama_time_us(void);
+
+    LLAMA_API int  llama_max_devices    (void);
+    LLAMA_API bool llama_mmap_supported (void);
+    LLAMA_API bool llama_mlock_supported(void);
+
+    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
+    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+
+    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
+    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+
    // Returns 0 on success
    LLAMA_API int llama_model_quantize(
            const char * fname_inp,
@ -300,6 +308,31 @@ extern "C" {
    // IMPORTANT: do not use for anything else other than debugging and testing!
    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);

+    // Token logits obtained from the last call to llama_eval()
+    // The logits for the last token are stored in the last row
+    // Can be mutated in order to change the probabilities of the next token
+    // Rows: n_tokens
+    // Cols: n_vocab
+    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
+
+    // Get the embeddings for the input
+    // shape: [n_embd] (1-dimensional)
+    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
+
+    // Get the vocabulary as output parameters.
+    // Returns number of results.
+    LLAMA_API int llama_get_vocab(
+            const struct llama_context * ctx,
+                          const char * * strings,
+                                 float * scores,
+                                   int   capacity);
+
+    LLAMA_API int llama_get_vocab_from_model(
+              const struct llama_model * model,
+                          const char * * strings,
+                                 float * scores,
+                                   int   capacity);
+
    // Convert the provided text into tokens.
    // The tokens pointer must be large enough to hold the resulting tokens.
    // Returns the number of tokens on success, no more than n_max_tokens
@ -326,39 +359,6 @@ extern "C" {
                             int   n_max_tokens,
                            bool   add_bos);

-    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
-    LLAMA_API int llama_n_embd (const struct llama_context * ctx);
-
-    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
-    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
-    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
-
-    // Get the vocabulary as output parameters.
-    // Returns number of results.
-    LLAMA_API int llama_get_vocab(
-            const struct llama_context * ctx,
-                          const char * * strings,
-                                 float * scores,
-                                   int   capacity);
-
-    LLAMA_API int llama_get_vocab_from_model(
-              const struct llama_model * model,
-                          const char * * strings,
-                                 float * scores,
-                                   int   capacity);
-
-    // Token logits obtained from the last call to llama_eval()
-    // The logits for the last token are stored in the last row
-    // Can be mutated in order to change the probabilities of the next token
-    // Rows: n_tokens
-    // Cols: n_vocab
-    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
-
-    // Get the embeddings for the input
-    // shape: [n_embd] (1-dimensional)
-    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
-
    // Token Id -> String. Uses the vocabulary in the provided context
    // Does not write null terminator to the buffer
    LLAMA_API int llama_token_to_str(
@ -379,9 +379,9 @@ extern "C" {
                                  char * buf,
                                  int    length);
    // Special tokens
-    LLAMA_API llama_token llama_token_bos(void);  // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos(void);  // end-of-sentence
-    LLAMA_API llama_token llama_token_nl(void);   // next-line
+    LLAMA_API llama_token llama_token_bos(/*struct llama_model * model*/ void);  // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(/*struct llama_model * model*/ void);  // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (/*struct llama_model * model*/ void);  // next-line

    // Grammar
    //
--- a/tests/test-tokenizer-1.cpp
+++ b/tests/test-tokenizer-1.cpp
@ -114,8 +114,7 @@ int main(int argc, char **argv) {
        }
        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str), false);
        if (tokens.size() == 1) {
-            fprintf(stderr, "%s : info: %s tokenized to %d \n",
-                __func__, str.c_str(), tokens[0]);
+            fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
        }
    }