From 035d51145736d44e68ac0ef26eb00a8d159df47d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Fri, 18 Aug 2023 17:06:34 +0300
Subject: [PATCH] llama : minor API updates

---
 llama.cpp | 64 +++++++++++++++++++++++++++++--------------------------
 llama.h   | 37 +++++++++++++++++++++-----------
 2 files changed, 58 insertions(+), 43 deletions(-)
diff --git a/llama.cpp b/llama.cpp
index 87e2d5b11..4f2af4664 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3002,7 +3002,7 @@ void llama_grammar_free(struct llama_grammar * grammar) {
 //
 
 void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
-    assert(candidates->size > 0);
+    GGML_ASSERT(candidates->size > 0);
 
     const int64_t t_start_sample_us = ggml_time_us();
 
@@ -3282,7 +3282,7 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
 }
 
 void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
-    assert(ctx);
+    GGML_ASSERT(ctx);
     const int64_t t_start_sample_us = ggml_time_us();
 
     bool allow_eos = false;
@@ -3342,10 +3342,12 @@ void llama_sample_classifier_free_guidance(
                          float   scale) {
     int64_t t_start_sample_us = ggml_time_us();
 
-    assert(ctx);
+    GGML_ASSERT(ctx);
+
     auto n_vocab = llama_n_vocab(ctx);
-    assert(n_vocab == (int)candidates->size);
-    assert(!candidates->sorted);
+
+    GGML_ASSERT(n_vocab == (int)candidates->size);
+    GGML_ASSERT(!candidates->sorted);
 
     std::vector<float> logits_base;
     logits_base.reserve(candidates->size);
@@ -3369,7 +3371,8 @@ void llama_sample_classifier_free_guidance(
 }
 
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu) {
-    assert(ctx);
+    GGML_ASSERT(ctx);
+
     auto N = float(llama_n_vocab(ctx));
     int64_t t_start_sample_us;
     t_start_sample_us = ggml_time_us();
@@ -3475,7 +3478,8 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
 }
 
 llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) {
-    assert(ctx);
+    GGML_ASSERT(ctx);
+
     const int64_t t_start_sample_us = ggml_time_us();
     llama_sample_softmax(nullptr, candidates);
 
@@ -4463,18 +4467,22 @@ int llama_n_embd(const struct llama_context * ctx) {
     return ctx->model.hparams.n_embd;
 }
 
-int llama_n_vocab_from_model(const struct llama_model * model) {
+int llama_model_n_vocab(const struct llama_model * model) {
     return model->vocab.id_to_token.size();
 }
 
-int llama_n_ctx_from_model(const struct llama_model * model) {
+int llama_model_n_ctx(const struct llama_model * model) {
     return model->hparams.n_ctx;
 }
 
-int llama_n_embd_from_model(const struct llama_model * model) {
+int llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }
 
+int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
+    return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype));
+}
+
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
@@ -4965,14 +4973,10 @@ int llama_get_vocab(
         const char * * strings,
         float  * scores,
         int capacity) {
-    return llama_get_vocab_from_model(&ctx->model, strings, scores, capacity);
+    return llama_model_get_vocab(&ctx->model, strings, scores, capacity);
 }
 
-int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
-    return snprintf(buf, buf_size, "LLaMA %s %s", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype));
-}
-
-int llama_get_vocab_from_model(
+int llama_model_get_vocab(
         const struct llama_model * model,
         const char * * strings,
         float  * scores,
@@ -4985,6 +4989,18 @@ int llama_get_vocab_from_model(
     return n;
 }
 
+llama_token llama_token_bos(void) {
+    return 1;
+}
+
+llama_token llama_token_eos(void) {
+    return 2;
+}
+
+llama_token llama_token_nl(void) {
+    return 13;
+}
+
 int llama_tokenize(
         struct llama_context * ctx,
                   const char * text,
@@ -5040,7 +5056,7 @@ int llama_token_to_str(const struct llama_context * ctx, llama_token token, char
 }
 
 int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token, char * buf, int length) {
-    if (0 <= token && token < llama_n_vocab_from_model(&ctx->model)) {
+    if (0 <= token && token < llama_model_n_vocab(&ctx->model)) {
         std::string result = ctx->model.vocab.id_to_token[token].tok;
         if (length < (int) result.length()) {
             return -result.length();
@@ -5053,7 +5069,7 @@ int llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token,
 
 // does not write null-terminator to str
 int llama_token_to_str_with_model(const struct llama_model * model, llama_token token, char * buf, int length) {
-    if (0 <= token && token < llama_n_vocab_from_model(model)) {
+    if (0 <= token && token < llama_model_n_vocab(model)) {
         if (llama_is_normal_token(model->vocab, token)) {
             std::string result = model->vocab.id_to_token[token].tok;
             if (llama_vocab_type(model->vocab) == "spm") {
@@ -5085,18 +5101,6 @@ int llama_token_to_str_with_model(const struct llama_model * model, llama_token
     return 0;
 }
 
-llama_token llama_token_bos(void) {
-    return 1;
-}
-
-llama_token llama_token_eos(void) {
-    return 2;
-}
-
-llama_token llama_token_nl(void) {
-    return 13;
-}
-
 struct llama_timings llama_get_timings(struct llama_context * ctx) {
     struct llama_timings result = {
         /*.t_start_ms  =*/ 1e-3 * ctx->t_start_us,
diff --git a/llama.h b/llama.h
index 5126b8193..e2b28afbb 100644
--- a/llama.h
+++ b/llama.h
@@ -199,7 +199,6 @@ extern "C" {
     LLAMA_API struct llama_context_params llama_context_default_params(void);
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
 
-    // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
     // If numa is true, use NUMA optimizations
     // Call once at the start of the program
@@ -231,9 +230,9 @@ extern "C" {
     LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
     LLAMA_API int llama_n_embd (const struct llama_context * ctx);
 
-    LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
-    LLAMA_API int llama_n_ctx_from_model  (const struct llama_model * model);
-    LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+    LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
+    LLAMA_API int llama_model_n_ctx  (const struct llama_model * model);
+    LLAMA_API int llama_model_n_embd (const struct llama_model * model);
 
     // Get a string describing the model type
     LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
@@ -259,9 +258,9 @@ extern "C" {
 
     LLAMA_API int llama_model_apply_lora_from_file(
             const struct llama_model * model,
-                      const char * path_lora,
-                      const char * path_base_model,
-                             int   n_threads);
+                          const char * path_lora,
+                          const char * path_base_model,
+                                 int   n_threads);
 
     // Returns the number of tokens in the KV cache
     LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
@@ -322,6 +321,10 @@ extern "C" {
     // shape: [n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
 
+    //
+    // Vocab
+    //
+
     // Get the vocabulary as output parameters.
     // Returns number of results.
     LLAMA_API int llama_get_vocab(
@@ -330,17 +333,25 @@ extern "C" {
                                  float * scores,
                                    int   capacity);
 
-    LLAMA_API int llama_get_vocab_from_model(
+    LLAMA_API int llama_model_get_vocab(
               const struct llama_model * model,
                           const char * * strings,
                                  float * scores,
                                    int   capacity);
 
+    // Special tokens
+    LLAMA_API llama_token llama_token_bos(/*struct llama_model * model*/ void);  // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(/*struct llama_model * model*/ void);  // end-of-sentence
+    LLAMA_API llama_token llama_token_nl (/*struct llama_model * model*/ void);  // new-line
+
+    //
+    // Tokenization
+    //
+
     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
     // Returns the number of tokens on success, no more than n_max_tokens
     // Returns a negative number on failure - the number of tokens that would have been returned
-    // TODO: not sure if correct
     LLAMA_API int llama_tokenize(
             struct llama_context * ctx,
                       const char * text,
@@ -381,13 +392,11 @@ extern "C" {
                            llama_token   token,
                                   char * buf,
                                   int    length);
-    // Special tokens
-    LLAMA_API llama_token llama_token_bos(/*struct llama_model * model*/ void);  // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos(/*struct llama_model * model*/ void);  // end-of-sentence
-    LLAMA_API llama_token llama_token_nl (/*struct llama_model * model*/ void);  // next-line
 
+    //
     // Grammar
     //
+
     LLAMA_API struct llama_grammar * llama_grammar_init(
             const llama_grammar_element ** rules,
                                  size_t    n_rules,
@@ -395,7 +404,9 @@ extern "C" {
 
     LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
 
+    //
     // Sampling functions
+    //
 
     /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
     LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);