simple : minor style changes

Georgi Gerganov 2023-08-14 12:56:48 +03:00
parent 5c5a95ba2d
commit 0c19ae70d5
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
4 changed files with 92 additions and 202 deletions

View File

@@ -2,17 +2,18 @@
 import gguf
 import gguf_namemap as tmap
 import os
 import sys
 import struct
 import json
 import numpy as np
+import torch
 from typing import Any, List
 from pathlib import Path
-import torch
 from sentencepiece import SentencePieceProcessor
 #NDArray = np.ndarray[Any, Any]
 # compatible with python < 3.9
 NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
@@ -225,7 +226,7 @@ for part_name in part_names:
             sys.exit()

         n_dims = len(data.shape)
         data_dtype = data.dtype

         # if f32 desired, convert any float16 to float32
         if ftype == 0 and data.dtype == np.float16:
@@ -268,7 +269,6 @@ for part_name in part_names:
     for name in model_part.keys():
         data = model_part[name]
         old_dtype = data.dtype

         # we don't need these
@@ -295,7 +295,7 @@ for part_name in part_names:
             sys.exit()

         n_dims = len(data.shape)
         data_dtype = data.dtype

         # if f32 desired, convert any float16 to float32
         if ftype == 0 and data.dtype == np.float16:

View File

@@ -6,177 +2,121 @@
 #include "gguf-llama.h"
 #include "build-info.h"

-#include <cassert>
-#include <cinttypes>
 #include <cmath>
 #include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
 #include <string>
 #include <vector>

-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
-#include <windows.h>
-#include <signal.h>
-#endif
-int main(int argc, char ** argv)
-{
+int main(int argc, char ** argv) {
     gpt_params params;

-    //---------------------------------
-    // Print help :
-    //---------------------------------
-    if ( argc == 1 || argv[1][0] == '-' )
-    {
-        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
+    if (argc == 1 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
         return 1 ;
     }

-    //---------------------------------
-    // Load parameters :
-    //---------------------------------
-    if ( argc >= 2 )
-    {
+    if (argc >= 2) {
         params.model = argv[1];
     }

-    if ( argc >= 3 )
-    {
+    if (argc >= 3) {
         params.prompt = argv[2];
     }

-    if ( params.prompt.empty() )
-    {
+    if (params.prompt.empty()) {
         params.prompt = "Hello my name is";
     }

-    //---------------------------------
-    // Init LLM :
-    //---------------------------------
+    // init LLM

     llama_backend_init(params.numa);

     llama_context_params ctx_params = llama_context_default_params();

     llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);

-    if ( model == NULL )
-    {
-        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
         return 1;
     }

     llama_context * ctx = llama_new_context_with_model(model, ctx_params);

-    //---------------------------------
-    // Tokenize the prompt :
-    //---------------------------------
+    // tokenize the prompt

     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true);

-    const int max_context_size = llama_n_ctx( ctx );
-    const int max_tokens_list_size = max_context_size - 4 ;
+    const int max_context_size = llama_n_ctx(ctx);
+    const int max_tokens_list_size = max_context_size - 4;

-    if ( (int)tokens_list.size() > max_tokens_list_size )
-    {
-        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
-             __func__ , (int)tokens_list.size() , max_tokens_list_size );
+    if ((int)tokens_list.size() > max_tokens_list_size) {
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
         return 1;
     }

-    fprintf( stderr, "\n\n" );
+    fprintf(stderr, "\n\n");

-    // Print the tokens from the prompt :
-    for( auto id : tokens_list )
-    {
-        printf( "%s" , llama_token_to_str( ctx , id ) );
+    for (auto id : tokens_list) {
+        fprintf(stderr, "%s", llama_token_to_str(ctx, id));
     }

-    fflush(stdout);
+    fflush(stderr);

-    //---------------------------------
-    // Main prediction loop :
-    //---------------------------------
+    // main loop

     // The LLM keeps a contextual cache memory of previous token evaluation.
     // Usually, once this cache is full, it is required to recompute a compressed context based on previous
     // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
     // example, we will just stop the loop once this cache is full or once an end of stream is detected.

-    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
-    {
-        //---------------------------------
-        // Evaluate the tokens :
-        //---------------------------------
-        if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
-        {
-            fprintf( stderr, "%s : failed to eval\n" , __func__ );
+    while (llama_get_kv_cache_token_count(ctx) < max_context_size) {
+        // evaluate the transformer
+        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
             return 1;
         }

         tokens_list.clear();

-        //---------------------------------
-        // Select the best prediction :
-        //---------------------------------
+        // sample the next token

         llama_token new_token_id = 0;

-        auto logits = llama_get_logits( ctx );
-        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
+        auto logits = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(ctx);

         std::vector<llama_token_data> candidates;
-        candidates.reserve( n_vocab );
+        candidates.reserve(n_vocab);

-        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
-        {
-            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
         }

         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

-        // Select it using the "Greedy sampling" method :
-        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
+        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);

         // is it an end of stream ?
-        if ( new_token_id == llama_token_eos() )
-        {
+        if (new_token_id == llama_token_eos()) {
             fprintf(stderr, " [end of text]\n");
             break;
         }

-        // Print the new token :
-        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
-        fflush( stdout );
+        // print the new token :
+        printf("%s", llama_token_to_str(ctx, new_token_id));
+        fflush(stdout);

-        // Push this new token for next evaluation :
-        tokens_list.push_back( new_token_id );
-    } // wend of main loop
+        // push this new token for next evaluation
+        tokens_list.push_back(new_token_id);
+    }

-    llama_free( ctx );
-    llama_free_model( model );
+    llama_free(ctx);
+    llama_free_model(model);

     llama_backend_free();

     return 0;
 }
-// EOF
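
Note on the llama_sample_token_greedy call used above: greedy sampling simply picks the candidate with the highest logit, i.e. an argmax over the vocabulary. The following is a minimal standalone sketch of that selection step for illustration only (it is not the llama.cpp implementation, and greedy_argmax is a hypothetical helper operating on a raw logit vector rather than on a llama_token_data_array):

#include <cstdio>
#include <vector>

// greedy selection: return the index of the largest logit
static int greedy_argmax(const std::vector<float> & logits) {
    int best_id = 0;
    for (int i = 1; i < (int) logits.size(); i++) {
        if (logits[i] > logits[best_id]) {
            best_id = i;
        }
    }
    return best_id;
}

int main() {
    // toy "vocabulary" of 5 tokens with made-up logits
    const std::vector<float> logits = { 0.1f, 2.3f, -1.0f, 2.9f, 0.4f };
    printf("greedy token id: %d\n", greedy_argmax(logits)); // prints 3
    return 0;
}

Because greedy decoding is a plain argmax given identical logits, the example above needs no RNG state or sampling parameters such as temperature or top-k.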

View File

@@ -2,180 +2,125 @@
 #define _GNU_SOURCE
 #endif

-#include "common.h"
-#include "llama.h"
 #include "build-info.h"

-#include <cassert>
-#include <cinttypes>
+#include "common.h"
+#include "llama.h"
 #include <cmath>
 #include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
 #include <string>
 #include <vector>

-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
-#include <windows.h>
-#include <signal.h>
-#endif
-int main(int argc, char ** argv)
-{
+int main(int argc, char ** argv) {
     gpt_params params;

-    //---------------------------------
-    // Print help :
-    //---------------------------------
-    if ( argc == 1 || argv[1][0] == '-' )
-    {
-        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
+    if (argc == 1 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
         return 1 ;
     }

-    //---------------------------------
-    // Load parameters :
-    //---------------------------------
-    if ( argc >= 2 )
-    {
+    if (argc >= 2) {
         params.model = argv[1];
     }

-    if ( argc >= 3 )
-    {
+    if (argc >= 3) {
         params.prompt = argv[2];
     }

-    if ( params.prompt.empty() )
-    {
+    if (params.prompt.empty()) {
         params.prompt = "Hello my name is";
     }

-    //---------------------------------
-    // Init LLM :
-    //---------------------------------
+    // init LLM

     llama_backend_init(params.numa);

     llama_model * model;
     llama_context * ctx;

-    std::tie(model, ctx) = llama_init_from_gpt_params( params );
+    std::tie(model, ctx) = llama_init_from_gpt_params(params);

-    if ( model == NULL )
-    {
-        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
     }

-    //---------------------------------
-    // Tokenize the prompt :
-    //---------------------------------
+    // tokenize the prompt

     std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true);

-    const int max_context_size = llama_n_ctx( ctx );
-    const int max_tokens_list_size = max_context_size - 4 ;
+    const int max_context_size = llama_n_ctx(ctx);
+    const int max_tokens_list_size = max_context_size - 4;

-    if ( (int)tokens_list.size() > max_tokens_list_size )
-    {
-        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
-             __func__ , (int)tokens_list.size() , max_tokens_list_size );
+    if ((int)tokens_list.size() > max_tokens_list_size) {
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
         return 1;
     }

-    fprintf( stderr, "\n\n" );
+    fprintf(stderr, "\n\n");

-    // Print the tokens from the prompt :
-    for( auto id : tokens_list )
-    {
-        printf( "%s" , llama_token_to_str( ctx , id ) );
+    for (auto id : tokens_list) {
+        fprintf(stderr, "%s", llama_token_to_str(ctx, id));
     }

-    fflush(stdout);
+    fflush(stderr);

-    //---------------------------------
-    // Main prediction loop :
-    //---------------------------------
+    // main loop

     // The LLM keeps a contextual cache memory of previous token evaluation.
     // Usually, once this cache is full, it is required to recompute a compressed context based on previous
     // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
     // example, we will just stop the loop once this cache is full or once an end of stream is detected.

-    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
-    {
-        //---------------------------------
-        // Evaluate the tokens :
-        //---------------------------------
-        if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
-        {
-            fprintf( stderr, "%s : failed to eval\n" , __func__ );
+    while (llama_get_kv_cache_token_count( ctx ) < max_context_size) {
+        // evaluate the transformer
+        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
             return 1;
         }

         tokens_list.clear();

-        //---------------------------------
-        // Select the best prediction :
-        //---------------------------------
+        // sample the next token

         llama_token new_token_id = 0;

-        auto logits = llama_get_logits( ctx );
-        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
+        auto logits = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(ctx);

         std::vector<llama_token_data> candidates;
-        candidates.reserve( n_vocab );
+        candidates.reserve(n_vocab);

-        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
-        {
-            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
         }

         llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

-        // Select it using the "Greedy sampling" method :
-        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
+        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);

         // is it an end of stream ?
-        if ( new_token_id == llama_token_eos() )
-        {
+        if (new_token_id == llama_token_eos()) {
             fprintf(stderr, " [end of text]\n");
             break;
         }

-        // Print the new token :
-        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
-        fflush( stdout );
+        // print the new token :
+        printf("%s", llama_token_to_str(ctx, new_token_id));
+        fflush(stdout);

-        // Push this new token for next evaluation :
-        tokens_list.push_back( new_token_id );
-    } // wend of main loop
+        // push this new token for next evaluation
+        tokens_list.push_back(new_token_id);
+    }

-    llama_free( ctx );
-    llama_free_model( model );
+    llama_free(ctx);
+    llama_free_model(model);

     llama_backend_free();

     return 0;
 }
-// EOF

View File

@@ -5,7 +5,9 @@
 #ifndef GGUF_UTIL_H
 #define GGUF_UTIL_H

 #include "ggml.h"

 #include <cstdio>
 #include <cstdint>
 #include <cerrno>
@@ -62,7 +64,6 @@ static std::string format(const char * fmt, ...) {
     return std::string(buf.data(), size);
 }

 template<typename T>
 static std::string to_string(const T & val) {
     std::stringstream ss;
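
The format() helper whose tail is visible in the second hunk returns a printf-style formatted std::string. Its body is not shown in the diff, so the following is only a sketch of the usual two-pass vsnprintf pattern that ends in a return std::string(buf.data(), size); line like the one above; it is an assumption about the surrounding code, not a verbatim copy:

#include <cstdarg>
#include <cstdio>
#include <string>
#include <vector>

// printf-style formatting into a std::string:
// pass 1 measures the required length, pass 2 writes the characters
static std::string format(const char * fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    va_list ap2;
    va_copy(ap2, ap);
    const int size = vsnprintf(nullptr, 0, fmt, ap);
    if (size < 0) {
        // formatting error: return an empty string rather than reading garbage
        va_end(ap2);
        va_end(ap);
        return std::string();
    }
    std::vector<char> buf(size + 1);
    vsnprintf(buf.data(), buf.size(), fmt, ap2);
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
}

int main() {
    // example usage with made-up values
    printf("%s\n", format("n_tensors = %d, load time = %.2f s", 291, 1.50).c_str());
    return 0;
}

The templated to_string() right below it in the header covers the complementary case: converting a single value to text through std::stringstream without a format string.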