mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-09 02:01:44 +00:00
llama : add simple-chat example
This commit is contained in:
parent
85679d37f3
commit
587c114b09
5
Makefile
5
Makefile
@ -1287,6 +1287,11 @@ llama-simple: examples/simple/simple.cpp \
|
|||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
|
llama-simple-chat: examples/simple-chat/simple-chat.cpp \
|
||||||
|
$(OBJ_ALL)
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
|
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
llama-tokenize: examples/tokenize/tokenize.cpp \
|
llama-tokenize: examples/tokenize/tokenize.cpp \
|
||||||
$(OBJ_ALL)
|
$(OBJ_ALL)
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
|
||||||
|
@ -49,6 +49,7 @@ else()
|
|||||||
endif()
|
endif()
|
||||||
add_subdirectory(save-load-state)
|
add_subdirectory(save-load-state)
|
||||||
add_subdirectory(simple)
|
add_subdirectory(simple)
|
||||||
|
add_subdirectory(simple-chat)
|
||||||
add_subdirectory(speculative)
|
add_subdirectory(speculative)
|
||||||
add_subdirectory(tokenize)
|
add_subdirectory(tokenize)
|
||||||
endif()
|
endif()
|
||||||
|
5
examples/simple-chat/CMakeLists.txt
Normal file
5
examples/simple-chat/CMakeLists.txt
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
set(TARGET llama-simple-chat)
|
||||||
|
add_executable(${TARGET} simple-chat.cpp)
|
||||||
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
|
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
7
examples/simple-chat/README.md
Normal file
7
examples/simple-chat/README.md
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
# llama.cpp/example/simple-chat
|
||||||
|
|
||||||
|
The purpose of this example is to demonstrate a minimal usage of llama.cpp to create a simple chat program using the built-in chat template in GGUF files.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./llama-simple-chat -m ./models/llama-7b-v2/ggml-model-f16.gguf -c 2048
|
||||||
|
...
|
182
examples/simple-chat/simple-chat.cpp
Normal file
182
examples/simple-chat/simple-chat.cpp
Normal file
@ -0,0 +1,182 @@
|
|||||||
|
#include "llama.h"
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstring>
|
||||||
|
#include <iostream>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
static void print_usage(int, char ** argv) {
|
||||||
|
printf("\nexample usage:\n");
|
||||||
|
printf("\n %s -m model.gguf [-c context_size] [-ngl n_gpu_layers]\n", argv[0]);
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
// path to the model gguf file
|
||||||
|
std::string model_path;
|
||||||
|
// number of layers to offload to the GPU
|
||||||
|
int ngl = 99;
|
||||||
|
int n_ctx = 2048;
|
||||||
|
|
||||||
|
// parse command line arguments
|
||||||
|
for (int i = 1; i < argc; i++) {
|
||||||
|
try {
|
||||||
|
if (strcmp(argv[i], "-m") == 0) {
|
||||||
|
if (i + 1 < argc) {
|
||||||
|
model_path = argv[++i];
|
||||||
|
} else {
|
||||||
|
print_usage(argc, argv);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
} else if (strcmp(argv[i], "-c") == 0) {
|
||||||
|
if (i + 1 < argc) {
|
||||||
|
n_ctx = std::stoi(argv[++i]);
|
||||||
|
} else {
|
||||||
|
print_usage(argc, argv);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
} else if (strcmp(argv[i], "-ngl") == 0) {
|
||||||
|
if (i + 1 < argc) {
|
||||||
|
ngl = std::stoi(argv[++i]);
|
||||||
|
} else {
|
||||||
|
print_usage(argc, argv);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
print_usage(argc, argv);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
} catch (std::exception & e) {
|
||||||
|
fprintf(stderr, "error: %s\n", e.what());
|
||||||
|
print_usage(argc, argv);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (model_path.empty()) {
|
||||||
|
print_usage(argc, argv);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// only print errors
|
||||||
|
llama_log_set([](enum ggml_log_level level, const char * text, void * /* user_data */) {
|
||||||
|
if (level >= GGML_LOG_LEVEL_ERROR) {
|
||||||
|
fprintf(stderr, "%s", text);
|
||||||
|
}
|
||||||
|
}, nullptr);
|
||||||
|
|
||||||
|
// initialize the model
|
||||||
|
llama_model_params model_params = llama_model_default_params();
|
||||||
|
model_params.n_gpu_layers = ngl;
|
||||||
|
|
||||||
|
llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);
|
||||||
|
if (!model) {
|
||||||
|
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// initialize the context
|
||||||
|
llama_context_params ctx_params = llama_context_default_params();
|
||||||
|
ctx_params.n_ctx = n_ctx;
|
||||||
|
ctx_params.n_batch = n_ctx;
|
||||||
|
|
||||||
|
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||||
|
if (!ctx) {
|
||||||
|
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// initialize the sampler
|
||||||
|
llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_min_p(0.05f, 1));
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_temp(0.8f));
|
||||||
|
llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
|
||||||
|
|
||||||
|
// generation helper
|
||||||
|
auto generate = [&](const std::string & prompt) {
|
||||||
|
std::string response;
|
||||||
|
|
||||||
|
// tokenize the prompt
|
||||||
|
const int n_prompt = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
|
||||||
|
std::vector<llama_token> prompt_tokens(n_prompt);
|
||||||
|
if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
|
||||||
|
GGML_ABORT("failed to tokenize the prompt\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// prepare a batch for the prompt
|
||||||
|
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
|
||||||
|
llama_token new_token_id;
|
||||||
|
while (true) {
|
||||||
|
// check if we have enough context space to evaluate this batch
|
||||||
|
int n_ctx = llama_n_ctx(ctx);
|
||||||
|
int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
|
||||||
|
if (n_ctx_used + batch.n_tokens > n_ctx) {
|
||||||
|
printf("\033[0m\n");
|
||||||
|
fprintf(stderr, "context size exceeded\n");
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (llama_decode(ctx, batch)) {
|
||||||
|
GGML_ABORT("failed to eval\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// sample the next token
|
||||||
|
new_token_id = llama_sampler_sample(smpl, ctx, -1);
|
||||||
|
|
||||||
|
// is it an end of generation?
|
||||||
|
if (llama_token_is_eog(model, new_token_id)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// add the token to the response
|
||||||
|
char buf[128];
|
||||||
|
int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
|
||||||
|
if (n < 0) {
|
||||||
|
GGML_ABORT("failed to convert token to piece\n");
|
||||||
|
}
|
||||||
|
std::string piece(buf, n);
|
||||||
|
response += piece;
|
||||||
|
printf("%s", piece.c_str());
|
||||||
|
fflush(stdout);
|
||||||
|
|
||||||
|
// prepare the next batch with the sampled token
|
||||||
|
batch = llama_batch_get_one(&new_token_id, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
return response;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::vector<llama_chat_message> messages;
|
||||||
|
std::vector<char> formatted(2048);
|
||||||
|
int prev_len = 0;
|
||||||
|
while (true) {
|
||||||
|
std::string user;
|
||||||
|
std::getline(std::cin, user);
|
||||||
|
messages.push_back({"user", strdup(user.c_str())});
|
||||||
|
|
||||||
|
// format the messages
|
||||||
|
int new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
|
||||||
|
if (new_len > (int)formatted.size()) {
|
||||||
|
formatted.resize(new_len);
|
||||||
|
new_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), true, formatted.data(), formatted.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
// remove previous messages and obtain a prompt
|
||||||
|
std::string prompt(formatted.begin() + prev_len, formatted.begin() + new_len);
|
||||||
|
|
||||||
|
// generate a response
|
||||||
|
printf("\033[31m");
|
||||||
|
std::string response = generate(prompt);
|
||||||
|
printf("\n\033[0m");
|
||||||
|
|
||||||
|
// add the response to the messages
|
||||||
|
messages.push_back({"assistant", strdup(response.c_str())});
|
||||||
|
prev_len = llama_chat_apply_template(model, nullptr, messages.data(), messages.size(), false, formatted.data(), formatted.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
llama_sampler_free(smpl);
|
||||||
|
llama_free(ctx);
|
||||||
|
llama_free_model(model);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
@ -559,10 +559,10 @@ extern "C" {
|
|||||||
|
|
||||||
enum ggml_log_level {
|
enum ggml_log_level {
|
||||||
GGML_LOG_LEVEL_NONE = 0,
|
GGML_LOG_LEVEL_NONE = 0,
|
||||||
GGML_LOG_LEVEL_INFO = 1,
|
GGML_LOG_LEVEL_DEBUG = 1,
|
||||||
GGML_LOG_LEVEL_WARN = 2,
|
GGML_LOG_LEVEL_INFO = 2,
|
||||||
GGML_LOG_LEVEL_ERROR = 3,
|
GGML_LOG_LEVEL_WARN = 3,
|
||||||
GGML_LOG_LEVEL_DEBUG = 4,
|
GGML_LOG_LEVEL_ERROR = 4,
|
||||||
GGML_LOG_LEVEL_CONT = 5, // continue previous log
|
GGML_LOG_LEVEL_CONT = 5, // continue previous log
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user