mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-28 12:24:35 +00:00
0d2b66c638
refactor ggml-cuda
239 lines
6.6 KiB
C++
239 lines
6.6 KiB
C++
#include <stdio.h>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include "llama.h"
|
|
|
|
|
|
void generate_sequence(llama_context * ctx, int n_ctx, const std::vector<llama_token>& prompt_tokens, float temperature) {
|
|
// print the tokens from the prompt
|
|
for (llama_token id : prompt_tokens) {
|
|
printf("%s", llama_token_to_str(ctx, id));
|
|
}
|
|
|
|
//---------------------------------
|
|
// Load parameters :
|
|
//---------------------------------
|
|
|
|
if ( argc >= 2 )
|
|
{
|
|
params.model = argv[1];
|
|
}
|
|
|
|
if ( argc >= 3 )
|
|
{
|
|
params.prompt = argv[2];
|
|
}
|
|
|
|
if ( params.prompt.empty() )
|
|
{
|
|
params.prompt = "Hello my name is";
|
|
}
|
|
|
|
//---------------------------------
|
|
// Init LLM :
|
|
//---------------------------------
|
|
|
|
llama_backend_init(params.numa);
|
|
|
|
llama_model * model;
|
|
llama_context * ctx;
|
|
|
|
std::tie(model, ctx) = llama_init_from_gpt_params( params );
|
|
|
|
if ( model == NULL )
|
|
{
|
|
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
|
|
return 1;
|
|
}
|
|
|
|
//---------------------------------
|
|
// Tokenize the prompt :
|
|
//---------------------------------
|
|
|
|
std::vector<llama_token> tokens_list;
|
|
tokens_list = ::llama_tokenize( ctx , params.prompt , true );
|
|
|
|
const int max_context_size = llama_n_ctx( ctx );
|
|
const int max_tokens_list_size = max_context_size - 4 ;
|
|
|
|
if ( (int)tokens_list.size() > max_tokens_list_size )
|
|
{
|
|
fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
|
|
__func__ , (int)tokens_list.size() , max_tokens_list_size );
|
|
return 1;
|
|
}
|
|
|
|
fprintf( stderr, "\n\n" );
|
|
|
|
// Print the tokens from the prompt :
|
|
|
|
for( auto id : tokens_list )
|
|
{
|
|
printf( "%s" , llama_token_to_str( ctx , id ) );
|
|
}
|
|
|
|
fflush(stdout);
|
|
|
|
// the maximum number of tokens to generate at a time
|
|
// TODO: not supported, remove
|
|
const int CUDA_MAX_TOKENS = 1;
|
|
llama_token tokens_out[CUDA_MAX_TOKENS];
|
|
|
|
// current position in the context window
|
|
int n_past = 0;
|
|
|
|
// number of tokens to generate
|
|
int n_tokens_out;
|
|
|
|
// list of tokens to evaluate
|
|
// note that at most llama_context_params::n_batch tokens can be evaluated at a time
|
|
std::vector<llama_token> token_list = prompt_tokens;
|
|
|
|
while (n_past < n_ctx) {
|
|
// evaluate the tokens
|
|
|
|
// llama_eval generates one token at a time
|
|
n_tokens_out = 1;
|
|
|
|
// number of threads to use for CPU evaluation - ignored if compiled with CUDA support
|
|
const int n_threads = 4;
|
|
// note: llama_eval is not compatible with GPU sampling
|
|
if (llama_eval(ctx, token_list.data(), token_list.size(), n_past, n_threads)) {
|
|
fprintf(stderr, "%s : failed to eval\n", __func__ );
|
|
exit(1);
|
|
}
|
|
|
|
// perform sampling on the CPU
|
|
float * logits = llama_get_logits(ctx);
|
|
auto n_vocab = llama_n_vocab(ctx);
|
|
|
|
// initialize candidate array from logits
|
|
std::vector<llama_token_data> candidates;
|
|
candidates.reserve(n_vocab);
|
|
for(llama_token token_id = 0 ; token_id < n_vocab ; token_id++) {
|
|
candidates.push_back(llama_token_data{ token_id, logits[token_id], 0.0f});
|
|
}
|
|
|
|
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
|
|
|
// sample token
|
|
llama_sample_temperature(ctx, &candidates_p, temperature);
|
|
tokens_out[0] = llama_sample_token(ctx, &candidates_p);
|
|
|
|
// increment the position in the context window
|
|
n_past += token_list.size() + n_tokens_out - 1;
|
|
|
|
token_list.clear();
|
|
|
|
// print the new tokens
|
|
for (int i = 0; i < n_tokens_out; i++) {
|
|
llama_token new_token_id = tokens_out[i];
|
|
|
|
// is it an end of stream ?
|
|
if (new_token_id == llama_token_eos()) {
|
|
fprintf(stderr, " [end of text]\n");
|
|
//return;
|
|
}
|
|
|
|
// print the new token :
|
|
printf("%s", llama_token_to_str(ctx, new_token_id));
|
|
}
|
|
fflush(stdout);
|
|
|
|
// push the last new token for the next evaluation
|
|
token_list.push_back(tokens_out[n_tokens_out - 1]);
|
|
}
|
|
}
|
|
|
|
int main(int argc, char ** argv) {
|
|
if (argc < 2 || argv[1][0] == '-') {
|
|
printf("usage: %s <model> <n_ctx> <n_gens> <temp> [prompt]\n", argv[0]);
|
|
printf(" note: passing a temp parameter will enable GPU sampling\n");
|
|
return 1 ;
|
|
}
|
|
|
|
std::string model = argv[1];
|
|
struct llama_context_params lparams = llama_context_default_params();
|
|
|
|
if (argc >= 3) {
|
|
lparams.n_ctx = std::stoi(argv[2]);
|
|
} else {
|
|
lparams.n_ctx = 512;
|
|
}
|
|
|
|
int n_gens;
|
|
if (argc >= 4) {
|
|
n_gens = std::stoi(argv[3]);
|
|
} else {
|
|
n_gens = 1;
|
|
}
|
|
|
|
float temperature;
|
|
|
|
if (argc >= 5) {
|
|
temperature = std::stof(argv[4]);
|
|
} else {
|
|
temperature = 0.8f;
|
|
}
|
|
|
|
std::string prompt;
|
|
if (argc >= 6) {
|
|
prompt = argv[5];
|
|
} else {
|
|
prompt = "Hello my name is";
|
|
}
|
|
|
|
// initialize llama.cpp
|
|
bool numa = false;
|
|
llama_init_backend(numa);
|
|
|
|
llama_model * lmodel = llama_load_model_from_file(model.c_str(), lparams);
|
|
if (lmodel == NULL) {
|
|
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, model.c_str());
|
|
return 1;
|
|
}
|
|
|
|
llama_context * ctx = llama_new_context_with_model(lmodel, lparams);
|
|
if (ctx == NULL) {
|
|
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, model.c_str());
|
|
llama_free_model(lmodel);
|
|
return 1;
|
|
}
|
|
|
|
// tokenize the prompt
|
|
std::vector<llama_token> token_list(lparams.n_ctx);
|
|
int prompt_tokens = llama_tokenize(ctx, prompt.c_str(), token_list.data(), token_list.size(), true);
|
|
if (prompt_tokens <= 0) {
|
|
fprintf(stderr, "%s: error: unable to tokenize prompt\n", __func__);
|
|
return 1;
|
|
}
|
|
|
|
token_list.resize(prompt_tokens);
|
|
|
|
const int max_context_size = llama_n_ctx(ctx);
|
|
const int max_tokens_list_size = max_context_size - 4 ;
|
|
|
|
if ((int)token_list.size() > max_tokens_list_size) {
|
|
fprintf( stderr, "%s: error: prompt too long (%d tokens, max %d)\n" ,
|
|
__func__, (int)token_list.size(), max_tokens_list_size );
|
|
return 1;
|
|
}
|
|
|
|
fprintf(stderr, "\n\n");
|
|
|
|
// generate the sequences
|
|
for (int i = 0; i < n_gens; i++) {
|
|
printf("==== GENERATION %d ====\n", i + 1);
|
|
generate_sequence(ctx, max_context_size, token_list, temperature);
|
|
printf("\n\n");
|
|
}
|
|
|
|
llama_print_timings(ctx);
|
|
llama_free(ctx);
|
|
|
|
llama_backend_free();
|
|
|
|
return 0;
|
|
}
|