simple : minor style changes

Georgi Gerganov 2023-08-14 12:56:48 +03:00
parent 5c5a95ba2d
commit 0c19ae70d5
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
4 changed files with 92 additions and 202 deletions

View File

@@ -2,17 +2,18 @@
 import gguf
 import gguf_namemap as tmap
 import os
 import sys
 import struct
 import json
 import numpy as np
+import torch
 from typing import Any, List
 from pathlib import Path
-import torch
 from sentencepiece import SentencePieceProcessor
 #NDArray = np.ndarray[Any, Any]
 # compatible with python < 3.9
 NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
@@ -225,7 +226,7 @@ for part_name in part_names:
             sys.exit()

         n_dims = len(data.shape)
         data_dtype = data.dtype

         # if f32 desired, convert any float16 to float32
         if ftype == 0 and data.dtype == np.float16:
@@ -268,7 +269,6 @@ for part_name in part_names:
     for name in model_part.keys():
         data = model_part[name]
         old_dtype = data.dtype

         # we don't need these
@@ -295,7 +295,7 @@ for part_name in part_names:
             sys.exit()

         n_dims = len(data.shape)
         data_dtype = data.dtype

         # if f32 desired, convert any float16 to float32
         if ftype == 0 and data.dtype == np.float16:

View File

@@ -6,177 +2,121 @@
 #include "gguf-llama.h"
 #include "build-info.h"

-#include <cassert>
-#include <cinttypes>
 #include <cmath>
 #include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
 #include <string>
 #include <vector>

-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
-#include <windows.h>
-#include <signal.h>
-#endif
-int main(int argc, char ** argv)
-{
+int main(int argc, char ** argv) {
     gpt_params params;

-    //---------------------------------
-    // Print help :
-    //---------------------------------
-    if ( argc == 1 || argv[1][0] == '-' )
-    {
-        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
+    if (argc == 1 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
         return 1 ;
     }

-    //---------------------------------
-    // Load parameters :
-    //---------------------------------
-    if ( argc >= 2 )
-    {
+    if (argc >= 2) {
         params.model = argv[1];
     }

-    if ( argc >= 3 )
-    {
+    if (argc >= 3) {
         params.prompt = argv[2];
     }

-    if ( params.prompt.empty() )
-    {
+    if (params.prompt.empty()) {
         params.prompt = "Hello my name is";
     }

-    //---------------------------------
-    // Init LLM :
-    //---------------------------------
+    // init LLM

     llama_backend_init(params.numa);

     llama_context_params ctx_params = llama_context_default_params();

     llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);

-    if ( model == NULL )
-    {
-        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
         return 1;
     }

     llama_context * ctx = llama_new_context_with_model(model, ctx_params);

-    //---------------------------------
-    // Tokenize the prompt :
-    //---------------------------------
+    // tokenize the prompt

     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true);

-    const int max_context_size = llama_n_ctx( ctx );
-    const int max_tokens_list_size = max_context_size - 4 ;
+    const int max_context_size = llama_n_ctx(ctx);
+    const int max_tokens_list_size = max_context_size - 4;

-    if ( (int)tokens_list.size() > max_tokens_list_size )
-    {
-        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
-             __func__ , (int)tokens_list.size() , max_tokens_list_size );
+    if ((int)tokens_list.size() > max_tokens_list_size) {
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
         return 1;
     }

-    fprintf( stderr, "\n\n" );
+    fprintf(stderr, "\n\n");

-    // Print the tokens from the prompt :
-    for( auto id : tokens_list )
-    {
-        printf( "%s" , llama_token_to_str( ctx , id ) );
+    for (auto id : tokens_list) {
+        fprintf(stderr, "%s", llama_token_to_str(ctx, id));
     }

-    fflush(stdout);
+    fflush(stderr);

-    //---------------------------------
-    // Main prediction loop :
-    //---------------------------------
+    // main loop

     // The LLM keeps a contextual cache memory of previous token evaluation.
     // Usually, once this cache is full, it is required to recompute a compressed context based on previous
     // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
     // example, we will just stop the loop once this cache is full or once an end of stream is detected.

-    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
-    {
-        //---------------------------------
-        // Evaluate the tokens :
-        //---------------------------------
-        if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
-        {
-            fprintf( stderr, "%s : failed to eval\n" , __func__ );
+    while (llama_get_kv_cache_token_count(ctx) < max_context_size) {
+        // evaluate the transformer
+        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
             return 1;
         }

         tokens_list.clear();

-        //---------------------------------
-        // Select the best prediction :
-        //---------------------------------
+        // sample the next token

         llama_token new_token_id = 0;

-        auto logits = llama_get_logits( ctx );
-        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
+        auto logits = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(ctx);

         std::vector<llama_token_data> candidates;
-        candidates.reserve( n_vocab );
+        candidates.reserve(n_vocab);

-        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
-        {
-            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
         }

         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

-        // Select it using the "Greedy sampling" method :
-        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
+        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);

         // is it an end of stream ?
-        if ( new_token_id == llama_token_eos() )
-        {
+        if (new_token_id == llama_token_eos()) {
             fprintf(stderr, " [end of text]\n");
             break;
         }

-        // Print the new token :
-        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
-        fflush( stdout );
+        // print the new token :
+        printf("%s", llama_token_to_str(ctx, new_token_id));
+        fflush(stdout);

-        // Push this new token for next evaluation :
-        tokens_list.push_back( new_token_id );
-    } // wend of main loop
+        // push this new token for next evaluation
+        tokens_list.push_back(new_token_id);
+    }

-    llama_free( ctx );
-    llama_free_model( model );
+    llama_free(ctx);
+    llama_free_model(model);

     llama_backend_free();

     return 0;
 }
-// EOF
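
Note on the llama_sample_token_greedy call used above: greedy sampling simply picks the candidate with the highest logit, i.e. an argmax over the vocabulary. The following is a minimal standalone sketch of that selection step for illustration only (it is not the llama.cpp implementation, and greedy_argmax is a hypothetical helper operating on a raw logit vector rather than on a llama_token_data_array):

#include <cstdio>
#include <vector>

// greedy selection: return the index of the largest logit
static int greedy_argmax(const std::vector<float> & logits) {
    int best_id = 0;
    for (int i = 1; i < (int) logits.size(); i++) {
        if (logits[i] > logits[best_id]) {
            best_id = i;
        }
    }
    return best_id;
}

int main() {
    // toy "vocabulary" of 5 tokens with made-up logits
    const std::vector<float> logits = { 0.1f, 2.3f, -1.0f, 2.9f, 0.4f };
    printf("greedy token id: %d\n", greedy_argmax(logits)); // prints 3
    return 0;
}

Because greedy decoding is a plain argmax given identical logits, the example above needs no RNG state or sampling parameters such as temperature or top-k.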

View File

@@ -2,180 +2,125 @@
 #define _GNU_SOURCE
 #endif

-#include "common.h"
-#include "llama.h"
 #include "build-info.h"

-#include <cassert>
-#include <cinttypes>
+#include "common.h"
+#include "llama.h"
 #include <cmath>
 #include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
 #include <string>
 #include <vector>

-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
-#include <windows.h>
-#include <signal.h>
-#endif
-int main(int argc, char ** argv)
-{
+int main(int argc, char ** argv) {
     gpt_params params;

-    //---------------------------------
-    // Print help :
-    //---------------------------------
-    if ( argc == 1 || argv[1][0] == '-' )
-    {
-        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
+    if (argc == 1 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
         return 1 ;
     }

-    //---------------------------------
-    // Load parameters :
-    //---------------------------------
-    if ( argc >= 2 )
-    {
+    if (argc >= 2) {
         params.model = argv[1];
     }

-    if ( argc >= 3 )
-    {
+    if (argc >= 3) {
         params.prompt = argv[2];
     }

-    if ( params.prompt.empty() )
-    {
+    if (params.prompt.empty()) {
         params.prompt = "Hello my name is";
     }

-    //---------------------------------
-    // Init LLM :
-    //---------------------------------
+    // init LLM

     llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

-    std::tie(model, ctx) = llama_init_from_gpt_params( params );
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);

-    if ( model == NULL )
-    {
-        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
     }

-    //---------------------------------
-    // Tokenize the prompt :
-    //---------------------------------
+    // tokenize the prompt

     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true);

-    const int max_context_size = llama_n_ctx( ctx );
-    const int max_tokens_list_size = max_context_size - 4 ;
+    const int max_context_size = llama_n_ctx(ctx);
+    const int max_tokens_list_size = max_context_size - 4;

-    if ( (int)tokens_list.size() > max_tokens_list_size )
-    {
-        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
-             __func__ , (int)tokens_list.size() , max_tokens_list_size );
+    if ((int)tokens_list.size() > max_tokens_list_size) {
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
         return 1;
     }

-    fprintf( stderr, "\n\n" );
+    fprintf(stderr, "\n\n");

-    // Print the tokens from the prompt :
-    for( auto id : tokens_list )
-    {
-        printf( "%s" , llama_token_to_str( ctx , id ) );
+    for (auto id : tokens_list) {
+        fprintf(stderr, "%s", llama_token_to_str(ctx, id));
     }

-    fflush(stdout);
+    fflush(stderr);

-    //---------------------------------
-    // Main prediction loop :
-    //---------------------------------
+    // main loop

     // The LLM keeps a contextual cache memory of previous token evaluation.
     // Usually, once this cache is full, it is required to recompute a compressed context based on previous
     // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
     // example, we will just stop the loop once this cache is full or once an end of stream is detected.

-    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
-    {
-        //---------------------------------
-        // Evaluate the tokens :
-        //---------------------------------
-        if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
-        {
-            fprintf( stderr, "%s : failed to eval\n" , __func__ );
+    while (llama_get_kv_cache_token_count( ctx ) < max_context_size) {
+        // evaluate the transformer
+        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
             return 1;
         }

         tokens_list.clear();

-        //---------------------------------
-        // Select the best prediction :
-        //---------------------------------
+        // sample the next token

         llama_token new_token_id = 0;

-        auto logits = llama_get_logits( ctx );
-        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
+        auto logits = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(ctx);

         std::vector<llama_token_data> candidates;
-        candidates.reserve( n_vocab );
+        candidates.reserve(n_vocab);

-        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
-        {
-            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
         }

         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

-        // Select it using the "Greedy sampling" method :
-        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
+        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);

         // is it an end of stream ?
-        if ( new_token_id == llama_token_eos() )
-        {
+        if (new_token_id == llama_token_eos()) {
             fprintf(stderr, " [end of text]\n");
             break;
         }

-        // Print the new token :
-        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
-        fflush( stdout );
+        // print the new token :
+        printf("%s", llama_token_to_str(ctx, new_token_id));
+        fflush(stdout);

-        // Push this new token for next evaluation :
-        tokens_list.push_back( new_token_id );
-    } // wend of main loop
+        // push this new token for next evaluation
+        tokens_list.push_back(new_token_id);
+    }

-    llama_free( ctx );
-    llama_free_model( model );
+    llama_free(ctx);
+    llama_free_model(model);

     llama_backend_free();

     return 0;
 }
-// EOF

View File

@@ -5,7 +5,9 @@
 #ifndef GGUF_UTIL_H
 #define GGUF_UTIL_H

 #include "ggml.h"

 #include <cstdio>
 #include <cstdint>
 #include <cerrno>
@@ -62,7 +64,6 @@ static std::string format(const char * fmt, ...) {
     return std::string(buf.data(), size);
 }

 template<typename T>
 static std::string to_string(const T & val) {
     std::stringstream ss;
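
The format() helper whose tail is visible in the second hunk returns a printf-style formatted std::string. Its body is not shown in the diff, so the following is only a sketch of the usual two-pass vsnprintf pattern that ends in a return std::string(buf.data(), size); line like the one above; it is an assumption about the surrounding code, not a verbatim copy:

#include <cstdarg>
#include <cstdio>
#include <string>
#include <vector>

// printf-style formatting into a std::string:
// pass 1 measures the required length, pass 2 writes the characters
static std::string format(const char * fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    va_list ap2;
    va_copy(ap2, ap);
    const int size = vsnprintf(nullptr, 0, fmt, ap);
    if (size < 0) {
        // formatting error: return an empty string rather than reading garbage
        va_end(ap2);
        va_end(ap);
        return std::string();
    }
    std::vector<char> buf(size + 1);
    vsnprintf(buf.data(), buf.size(), fmt, ap2);
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
}

int main() {
    // example usage with made-up values
    printf("%s\n", format("n_tensors = %d, load time = %.2f s", 291, 1.50).c_str());
    return 0;
}

The templated to_string() right below it in the header covers the complementary case: converting a single value to text through std::stringstream without a format string.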