simple : minor style changes

2024-12-28 12:24:35 +00:00 · 2023-08-14 12:56:48 +03:00 · 2023-08-14 12:56:48 +03:00 · 0c19ae70d5
commit 0c19ae70d5
parent 5c5a95ba2d
4 changed files with 92 additions and 202 deletions
--- a/convert-llama-h5-to-gguf.py
+++ b/convert-llama-h5-to-gguf.py
@ -2,17 +2,18 @@

 import gguf
 import gguf_namemap as tmap
+
 import os
 import sys
 import struct
 import json
 import numpy as np
+import torch
+
 from typing import Any, List
 from pathlib import Path
-import torch
 from sentencepiece import SentencePieceProcessor

-
 #NDArray = np.ndarray[Any, Any]
 # compatible with python < 3.9
 NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
@ -268,7 +269,6 @@ for part_name in part_names:
    for name in model_part.keys():
        data = model_part[name]

-    
        old_dtype = data.dtype

        # we don't need these
--- a/examples/gguf/gguf-llama-simple.cpp
+++ b/examples/gguf/gguf-llama-simple.cpp
@ -6,65 +6,32 @@
 #include "gguf-llama.h"
 #include "build-info.h"

-#include <cassert>
-#include <cinttypes>
 #include <cmath>
 #include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
 #include <string>
 #include <vector>

-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
-#include <windows.h>
-#include <signal.h>
-#endif
-
-
-
-int main(int argc, char ** argv)
-{
+int main(int argc, char ** argv) {
    gpt_params params;

-    //---------------------------------
-    // Print help :
-    //---------------------------------
-
-    if ( argc == 1 || argv[1][0] == '-' )
-    {
+    if (argc == 1 || argv[1][0] == '-') {
        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
        return 1 ;
    }

-    //---------------------------------
-    // Load parameters :
-    //---------------------------------
-
-    if ( argc >= 2 )
-    {
+    if (argc >= 2) {
        params.model = argv[1];
    }

-    if ( argc >= 3 )
-    {
+    if (argc >= 3) {
        params.prompt = argv[2];
    }

-    if ( params.prompt.empty() )
-    {
+    if (params.prompt.empty()) {
        params.prompt = "Hello my name is";
    }

-    //---------------------------------
-    // Init LLM :
-    //---------------------------------
+    // init LLM

    llama_backend_init(params.numa);

@ -72,17 +39,14 @@ int main(int argc, char ** argv)

    llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);

-    if ( model == NULL )
-    {
+    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);

-    //---------------------------------
-    // Tokenize the prompt :
-    //---------------------------------
+    // tokenize the prompt

    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
@ -90,86 +54,68 @@ int main(int argc, char ** argv)
    const int max_context_size     = llama_n_ctx(ctx);
    const int max_tokens_list_size = max_context_size - 4;

-    if ( (int)tokens_list.size() > max_tokens_list_size )
-    {
-        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
-             __func__ , (int)tokens_list.size() , max_tokens_list_size );
+    if ((int)tokens_list.size() > max_tokens_list_size) {
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
        return 1;
    }

    fprintf(stderr, "\n\n");

-    // Print the tokens from the prompt :
-
-    for( auto id : tokens_list )
-    {
-        printf( "%s" , llama_token_to_str( ctx , id ) );
+    for (auto id : tokens_list) {
+        fprintf(stderr, "%s", llama_token_to_str(ctx, id));
    }

-    fflush(stdout);
+    fflush(stderr);

-
-    //---------------------------------
-    // Main prediction loop :
-    //---------------------------------
+    // main loop

    // The LLM keeps a contextual cache memory of previous token evaluation.
    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
    // example, we will just stop the loop once this cache is full or once an end of stream is detected.

-    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
-    {
-        //---------------------------------
-        // Evaluate the tokens :
-        //---------------------------------
+    while (llama_get_kv_cache_token_count(ctx) < max_context_size) {
+        // evaluate the transformer

-        if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
-        {
+        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return 1;
        }

        tokens_list.clear();

-        //---------------------------------
-        // Select the best prediction :
-        //---------------------------------
+        // sample the next token

        llama_token new_token_id = 0;

        auto logits  = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
+        auto n_vocab = llama_n_vocab(ctx);

        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);

-        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
-        {
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

-        // Select it using the "Greedy sampling" method :
        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);

-
        // is it an end of stream ?
-        if ( new_token_id == llama_token_eos() )
-        {
+        if (new_token_id == llama_token_eos()) {
            fprintf(stderr, " [end of text]\n");
            break;
        }

-        // Print the new token :
+        // print the new token
        printf("%s", llama_token_to_str(ctx, new_token_id));
        fflush(stdout);

-        // Push this new token for next evaluation :
+        // push this new token for next evaluation
        tokens_list.push_back(new_token_id);

-    } // wend of main loop
+    }

    llama_free(ctx);
    llama_free_model(model);
@ -178,5 +124,3 @@ int main(int argc, char ** argv)

    return 0;
 }
-
-// EOF
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@ -2,69 +2,37 @@
 #define _GNU_SOURCE
 #endif

-#include "common.h"
-#include "llama.h"
 #include "build-info.h"

-#include <cassert>
-#include <cinttypes>
+#include "common.h"
+#include "llama.h"
+
 #include <cmath>
 #include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
 #include <string>
 #include <vector>

-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
-#include <windows.h>
-#include <signal.h>
-#endif
-
-
-
-int main(int argc, char ** argv)
-{
+int main(int argc, char ** argv) {
    gpt_params params;

-    //---------------------------------
-    // Print help :
-    //---------------------------------
-
-    if ( argc == 1 || argv[1][0] == '-' )
-    {
+    if (argc == 1 || argv[1][0] == '-') {
        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
        return 1 ;
    }

-    //---------------------------------
-    // Load parameters :
-    //---------------------------------
-
-    if ( argc >= 2 )
-    {
+    if (argc >= 2) {
        params.model = argv[1];
    }

-    if ( argc >= 3 )
-    {
+    if (argc >= 3) {
        params.prompt = argv[2];
    }

-    if ( params.prompt.empty() )
-    {
+    if (params.prompt.empty()) {
        params.prompt = "Hello my name is";
    }

-    //---------------------------------
-    // Init LLM :
-    //---------------------------------
+    // init LLM

    llama_backend_init(params.numa);

@ -73,15 +41,12 @@ int main(int argc, char ** argv)

    std::tie(model, ctx) = llama_init_from_gpt_params(params);

-    if ( model == NULL )
-    {
+    if (model == NULL) {
        fprintf(stderr, "%s: error: unable to load model\n", __func__);
        return 1;
    }

-    //---------------------------------
-    // Tokenize the prompt :
-    //---------------------------------
+    // tokenize the prompt

    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
@ -89,86 +54,68 @@ int main(int argc, char ** argv)
    const int max_context_size     = llama_n_ctx(ctx);
    const int max_tokens_list_size = max_context_size - 4;

-    if ( (int)tokens_list.size() > max_tokens_list_size )
-    {
-        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
-             __func__ , (int)tokens_list.size() , max_tokens_list_size );
+    if ((int)tokens_list.size() > max_tokens_list_size) {
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
        return 1;
    }

    fprintf(stderr, "\n\n");

-    // Print the tokens from the prompt :
-
-    for( auto id : tokens_list )
-    {
-        printf( "%s" , llama_token_to_str( ctx , id ) );
+    for (auto id : tokens_list) {
+        fprintf(stderr, "%s", llama_token_to_str(ctx, id));
    }

-    fflush(stdout);
+    fflush(stderr);

-
-    //---------------------------------
-    // Main prediction loop :
-    //---------------------------------
+    // main loop

    // The LLM keeps a contextual cache memory of previous token evaluation.
    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
    // example, we will just stop the loop once this cache is full or once an end of stream is detected.

-    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
-    {
-        //---------------------------------
-        // Evaluate the tokens :
-        //---------------------------------
+    while (llama_get_kv_cache_token_count( ctx ) < max_context_size) {
+        // evaluate the transformer

-        if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
-        {
+        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return 1;
        }

        tokens_list.clear();

-        //---------------------------------
-        // Select the best prediction :
-        //---------------------------------
+        // sample the next token

        llama_token new_token_id = 0;

        auto logits  = llama_get_logits(ctx);
-        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
+        auto n_vocab = llama_n_vocab(ctx);

        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);

-        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
-        {
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

-        // Select it using the "Greedy sampling" method :
        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);

-
        // is it an end of stream ?
-        if ( new_token_id == llama_token_eos() )
-        {
+        if (new_token_id == llama_token_eos()) {
            fprintf(stderr, " [end of text]\n");
            break;
        }

-        // Print the new token :
+        // print the new token
        printf("%s", llama_token_to_str(ctx, new_token_id));
        fflush(stdout);

-        // Push this new token for next evaluation :
+        // push this new token for next evaluation
        tokens_list.push_back(new_token_id);

-    } // wend of main loop
+    }

    llama_free(ctx);
    llama_free_model(model);
@ -177,5 +124,3 @@ int main(int argc, char ** argv)

    return 0;
 }
-
-// EOF
--- a/gguf-util.h
+++ b/gguf-util.h
@ -5,7 +5,9 @@

 #ifndef GGUF_UTIL_H
 #define GGUF_UTIL_H
+
 #include "ggml.h"
+
 #include <cstdio>
 #include <cstdint>
 #include <cerrno>
@ -62,7 +64,6 @@ static std::string format(const char * fmt, ...) {
    return std::string(buf.data(), size);
 }

-
 template<typename T>
 static std::string to_string(const T & val) {
    std::stringstream ss;