simple : minor style changes

This commit is contained in:
Georgi Gerganov 2023-08-14 12:56:48 +03:00
parent 5c5a95ba2d
commit 0c19ae70d5
No known key found for this signature in database
GPG Key ID: 449E073F9DC10735
4 changed files with 92 additions and 202 deletions

View File

@ -2,17 +2,18 @@
import gguf
import gguf_namemap as tmap
import os
import sys
import struct
import json
import numpy as np
import torch
from typing import Any, List
from pathlib import Path
import torch
from sentencepiece import SentencePieceProcessor
#NDArray = np.ndarray[Any, Any]
# compatible with python < 3.9
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
@ -225,7 +226,7 @@ for part_name in part_names:
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data.dtype == np.float16:
@ -268,7 +269,6 @@ for part_name in part_names:
for name in model_part.keys():
data = model_part[name]
old_dtype = data.dtype
# we don't need these
@ -295,7 +295,7 @@ for part_name in part_names:
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data.dtype == np.float16:

View File

@ -6,177 +6,121 @@
#include "gguf-llama.h"
#include "build-info.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif
int main(int argc, char ** argv)
{
int main(int argc, char ** argv) {
gpt_params params;
//---------------------------------
// Print help :
//---------------------------------
if ( argc == 1 || argv[1][0] == '-' )
{
printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
return 1 ;
}
//---------------------------------
// Load parameters :
//---------------------------------
if ( argc >= 2 )
{
if (argc >= 2) {
params.model = argv[1];
}
if ( argc >= 3 )
{
if (argc >= 3) {
params.prompt = argv[2];
}
if ( params.prompt.empty() )
{
if (params.prompt.empty()) {
params.prompt = "Hello my name is";
}
//---------------------------------
// Init LLM :
//---------------------------------
// init LLM
llama_backend_init(params.numa);
llama_context_params ctx_params = llama_context_default_params();
llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
if ( model == NULL )
{
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return 1;
}
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
//---------------------------------
// Tokenize the prompt :
//---------------------------------
// tokenize the prompt
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize( ctx , params.prompt , true );
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
const int max_context_size = llama_n_ctx( ctx );
const int max_tokens_list_size = max_context_size - 4 ;
const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;
if ( (int)tokens_list.size() > max_tokens_list_size )
{
fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
__func__ , (int)tokens_list.size() , max_tokens_list_size );
if ((int)tokens_list.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
return 1;
}
fprintf( stderr, "\n\n" );
fprintf(stderr, "\n\n");
// Print the tokens from the prompt :
for( auto id : tokens_list )
{
printf( "%s" , llama_token_to_str( ctx , id ) );
for (auto id : tokens_list) {
fprintf(stderr, "%s", llama_token_to_str(ctx, id));
}
fflush(stdout);
fflush(stderr);
//---------------------------------
// Main prediction loop :
//---------------------------------
// main loop
// The LLM keeps a contextual cache memory of previous token evaluation.
// Usually, once this cache is full, it is required to recompute a compressed context based on previous
// tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
// example, we will just stop the loop once this cache is full or once an end of stream is detected.
while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
{
//---------------------------------
// Evaluate the tokens :
//---------------------------------
while (llama_get_kv_cache_token_count(ctx) < max_context_size) {
// evaluate the transformer
if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
{
fprintf( stderr, "%s : failed to eval\n" , __func__ );
if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
tokens_list.clear();
//---------------------------------
// Select the best prediction :
//---------------------------------
// sample the next token
llama_token new_token_id = 0;
auto logits = llama_get_logits( ctx );
auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
auto logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(ctx);
std::vector<llama_token_data> candidates;
candidates.reserve( n_vocab );
candidates.reserve(n_vocab);
for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
{
candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// Select it using the "Greedy sampling" method :
new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
new_token_id = llama_sample_token_greedy(ctx , &candidates_p);
// is it an end of stream ?
if ( new_token_id == llama_token_eos() )
{
if (new_token_id == llama_token_eos()) {
fprintf(stderr, " [end of text]\n");
break;
}
// Print the new token :
printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
fflush( stdout );
// print the new token :
printf("%s", llama_token_to_str(ctx, new_token_id));
fflush(stdout);
// Push this new token for next evaluation :
tokens_list.push_back( new_token_id );
// push this new token for next evaluation
tokens_list.push_back(new_token_id);
} // wend of main loop
}
llama_free( ctx );
llama_free_model( model );
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
return 0;
}
// EOF

View File

@ -2,180 +2,125 @@
#define _GNU_SOURCE
#endif
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <cassert>
#include <cinttypes>
#include "common.h"
#include "llama.h"
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif
int main(int argc, char ** argv)
{
int main(int argc, char ** argv) {
gpt_params params;
//---------------------------------
// Print help :
//---------------------------------
if ( argc == 1 || argv[1][0] == '-' )
{
printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
return 1 ;
}
//---------------------------------
// Load parameters :
//---------------------------------
if ( argc >= 2 )
{
if (argc >= 2) {
params.model = argv[1];
}
if ( argc >= 3 )
{
if (argc >= 3) {
params.prompt = argv[2];
}
if ( params.prompt.empty() )
{
if (params.prompt.empty()) {
params.prompt = "Hello my name is";
}
//---------------------------------
// Init LLM :
//---------------------------------
// init LLM
llama_backend_init(params.numa);
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params( params );
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if ( model == NULL )
{
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
//---------------------------------
// Tokenize the prompt :
//---------------------------------
// tokenize the prompt
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize( ctx , params.prompt , true );
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
const int max_context_size = llama_n_ctx( ctx );
const int max_tokens_list_size = max_context_size - 4 ;
const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;
if ( (int)tokens_list.size() > max_tokens_list_size )
{
fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
__func__ , (int)tokens_list.size() , max_tokens_list_size );
if ((int)tokens_list.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
return 1;
}
fprintf( stderr, "\n\n" );
fprintf(stderr, "\n\n");
// Print the tokens from the prompt :
for( auto id : tokens_list )
{
printf( "%s" , llama_token_to_str( ctx , id ) );
for (auto id : tokens_list) {
fprintf(stderr, "%s", llama_token_to_str(ctx, id));
}
fflush(stdout);
fflush(stderr);
//---------------------------------
// Main prediction loop :
//---------------------------------
// main loop
// The LLM keeps a contextual cache memory of previous token evaluation.
// Usually, once this cache is full, it is required to recompute a compressed context based on previous
// tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
// example, we will just stop the loop once this cache is full or once an end of stream is detected.
while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
{
//---------------------------------
// Evaluate the tokens :
//---------------------------------
while (llama_get_kv_cache_token_count( ctx ) < max_context_size) {
// evaluate the transformer
if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
{
fprintf( stderr, "%s : failed to eval\n" , __func__ );
if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
tokens_list.clear();
//---------------------------------
// Select the best prediction :
//---------------------------------
// sample the next token
llama_token new_token_id = 0;
auto logits = llama_get_logits( ctx );
auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
auto logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(ctx);
std::vector<llama_token_data> candidates;
candidates.reserve( n_vocab );
candidates.reserve(n_vocab);
for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
{
candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// Select it using the "Greedy sampling" method :
new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
new_token_id = llama_sample_token_greedy(ctx , &candidates_p);
// is it an end of stream ?
if ( new_token_id == llama_token_eos() )
{
if (new_token_id == llama_token_eos()) {
fprintf(stderr, " [end of text]\n");
break;
}
// Print the new token :
printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
fflush( stdout );
// print the new token :
printf("%s", llama_token_to_str(ctx, new_token_id));
fflush(stdout);
// Push this new token for next evaluation :
tokens_list.push_back( new_token_id );
// push this new token for next evaluation
tokens_list.push_back(new_token_id);
} // wend of main loop
}
llama_free( ctx );
llama_free_model( model );
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
return 0;
}
// EOF

View File

@ -5,7 +5,9 @@
#ifndef GGUF_UTIL_H
#define GGUF_UTIL_H
#include "ggml.h"
#include <cstdio>
#include <cstdint>
#include <cerrno>
@ -62,7 +64,6 @@ static std::string format(const char * fmt, ...) {
return std::string(buf.data(), size);
}
template<typename T>
static std::string to_string(const T & val) {
std::stringstream ss;