Mirror of https://github.com/ggerganov/llama.cpp.git, synced 2024-11-14 06:49:54 +00:00
server : refactor multitask handling (#9274)
* server : remove multitask from server_task
* refactor completions handler
* fix embeddings
* use res_ok everywhere
* small change for handle_slots_action
* use unordered_set everywhere
* (try) fix test
* no more "mutable" lambda
* Apply suggestions from code review

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* use deque

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
commit 6e7d133a5f
parent b60074f1c2
File diff suppressed because it is too large
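
The commit message mentions using std::unordered_set everywhere and switching to std::deque, but the diff of the main server source is suppressed above because it is too large. As a rough sketch of that general pattern only — the names task, task_queue, push, next and mark_done below are hypothetical and not taken from the commit:

    // Illustrative sketch of FIFO task handling with a deque plus an
    // unordered_set of ids a client is still waiting on. Not the actual
    // llama.cpp server code, whose diff is suppressed above.
    #include <deque>
    #include <optional>
    #include <unordered_set>

    struct task {
        int id;
        // request payload would live here
    };

    struct task_queue {
        std::deque<task>        tasks;        // pending tasks, processed front to back
        std::unordered_set<int> waiting_ids;  // ids whose results are still awaited

        void push(task t) {
            waiting_ids.insert(t.id);
            tasks.push_back(std::move(t));
        }

        std::optional<task> next() {
            if (tasks.empty()) {
                return std::nullopt;
            }
            task t = std::move(tasks.front());
            tasks.pop_front();
            return t;
        }

        void mark_done(int id) {
            waiting_ids.erase(id);  // the result for this id has been delivered
        }
    };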
@@ -52,8 +52,8 @@ Feature: Parallel
     Then all prompts are predicted with <n_predict> tokens
     Examples:
       | streaming | n_predict |
-      | disabled  | 128       |
-      | enabled   | 64        |
+      | disabled  | 200       |
+      | enabled   | 200       |
 
   Scenario Outline: Multi users OAI completions compatibility no v1
     Given a system prompt You are a writer.
@@ -818,7 +818,7 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
     for prompt_no in range(context.n_prompts):
         shifted_args = [context.prompts.pop(), seeds[prompt_no], *args]
         context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
-    await asyncio.sleep(0.1)
+    await asyncio.sleep(0.01)
 
 
 @step('the slot {slot_id:d} is saved with filename "{filename}"')
@@ -8,9 +8,12 @@ Feature: Wrong usage of llama.cpp server
   Scenario: Infinite loop
     Given a server listening on localhost:8080
     And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And 42 as server seed
+    And 2048 KV cache size
     # Uncomment below to fix the issue
     #And 64 server max tokens to predict
     Then the server is starting
+    Then the server is healthy
     Given a prompt:
       """
       Go to: infinite loop
@@ -3,6 +3,14 @@
 #include "llama.h"
 #include "common.h"
 
+#ifndef NDEBUG
+// crash the server in debug mode, otherwise send an http 500 error
+#define CPPHTTPLIB_NO_EXCEPTIONS 1
+#endif
+// increase max payload length to allow use of larger context size
+#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
+#include "httplib.h"
+
 // Change JSON_ASSERT from assert() to GGML_ASSERT:
 #define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
@@ -279,6 +287,18 @@ static size_t find_partial_stop_string(const std::string &stop, const std::strin
     return std::string::npos;
 }
 
+static bool json_is_array_of_numbers(json data) {
+    if (data.is_array()) {
+        for (const auto & e : data) {
+            if (!e.is_number()) {
+                return false;
+            }
+        }
+        return true;
+    }
+    return false;
+}
+
 // TODO: reuse llama_detokenize
 template <class Iter>
 static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
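
One way a helper like this can be used — purely as an illustration, not something shown in this diff — is to tell a plain-text prompt apart from a prompt passed as an array of token ids. The helper is repeated below only so the sketch compiles on its own:

    // Hypothetical usage sketch for json_is_array_of_numbers; the helper is
    // copied from the hunk above so the example is self-contained.
    #include <cassert>
    #include "json.hpp"   // vendored nlohmann/json header, as in the hunk above

    using json = nlohmann::json;

    static bool json_is_array_of_numbers(json data) {
        if (data.is_array()) {
            for (const auto & e : data) {
                if (!e.is_number()) {
                    return false;
                }
            }
            return true;
        }
        return false;
    }

    int main() {
        // a prompt given as plain text is not an array of numbers
        assert(!json_is_array_of_numbers(json::parse(R"("hello world")")));
        // a prompt given as a list of token ids is
        assert(json_is_array_of_numbers(json::parse("[1, 2, 3]")));
        // mixed content does not qualify
        assert(!json_is_array_of_numbers(json::parse(R"([1, "x", 3])")));
        return 0;
    }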
@@ -343,6 +363,19 @@ static json probs_vector_to_json(const llama_context * ctx, const std::vector<co
     return out;
 }
 
+static bool server_sent_event(httplib::DataSink & sink, const char * event, json & data) {
+    const std::string str =
+        std::string(event) + ": " +
+        data.dump(-1, ' ', false, json::error_handler_t::replace) +
+        "\n\n";
+
+    LOG_VERBOSE("data stream", {
+        { "to_send", str }
+    });
+
+    return sink.write(str.c_str(), str.size());
+}
+
 //
 // OAI utils
 //
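
For illustration only (not part of this diff): a helper like server_sent_event is typically called from a cpp-httplib chunked content provider. The sketch below assumes server_sent_event from the hunk above and the LOG_VERBOSE macro it uses are already in scope; the route name "/stream-demo" and the payload are made up:

    // Illustrative sketch: streaming a single result over SSE from a
    // cpp-httplib chunked content provider. Assumes server_sent_event (from
    // the hunk above) and the LOG_VERBOSE macro it relies on are visible.
    static void register_stream_demo(httplib::Server & svr) {
        svr.Get("/stream-demo", [](const httplib::Request &, httplib::Response & res) {
            res.set_chunked_content_provider("text/event-stream",
                [](size_t /* offset */, httplib::DataSink & sink) {
                    json chunk = {
                        { "content", "hello" },
                        { "stop",    true    },
                    };
                    if (!server_sent_event(sink, "data", chunk)) {
                        return false;  // client disconnected, abort streaming
                    }
                    sink.done();       // writes the terminating chunk
                    return true;       // provider loop exits because done() was called
                });
        });
    }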