Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-25 02:44:36 +00:00)
server : add "/chat/completions" alias for "/v1/..." (#5722)
* Add "/chat/completions" as alias for "/v1/chat/completions" * merge to upstream master * minor : fix trailing whitespace --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
parent 7c4263d426
commit efc72253f7
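As a client-side illustration of what the alias provides (a minimal sketch only: the local server address is an assumption, and it presumes a llama.cpp server built from this commit is already running), the same OpenAI-style chat payload should now be accepted both with and without the "/v1" prefix:

import asyncio

import aiohttp


async def main():
    base_url = 'http://localhost:8080'  # hypothetical address of a running llama.cpp server
    payload = {
        'messages': [
            {'role': 'system', 'content': 'You are a writer.'},
            {'role': 'user', 'content': 'Write a very long book.'},
        ],
        'max_tokens': 64,
    }
    async with aiohttp.ClientSession() as session:
        # After this commit both routes are registered with the same handler.
        for path in ('/v1/chat/completions', '/chat/completions'):
            async with session.post(f'{base_url}{path}', json=payload) as response:
                print(path, response.status)

asyncio.run(main())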
@@ -3211,9 +3211,7 @@ int main(int argc, char **argv)
             res.set_content(models.dump(), "application/json; charset=utf-8");
         });

-    // TODO: add mount point without "/v1" prefix -- how?
-    svr.Post("/v1/chat/completions", [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res)
+    const auto chat_completions = [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res)
             {
                 res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
                 if (!validate_api_key(req, res)) {
@@ -3291,7 +3289,10 @@ int main(int argc, char **argv)

             res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
         }
-    });
+    };
+
+    svr.Post("/chat/completions", chat_completions);
+    svr.Post("/v1/chat/completions", chat_completions);

     svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
             {
@@ -54,6 +54,28 @@ Feature: Parallel
       | disabled  | 128       |
       | enabled   | 64        |

+  Scenario Outline: Multi users OAI completions compatibility no v1
+    Given a system prompt You are a writer.
+    And a model tinyllama-2
+    Given a prompt:
+      """
+      Write a very long book.
+      """
+    And a prompt:
+      """
+      Write another a poem.
+      """
+    And <n_predict> max tokens to predict
+    And streaming is <streaming>
+    Given concurrent OAI completions requests no v1
+    Then the server is busy
+    Then the server is idle
+    Then all prompts are predicted with <n_predict> tokens
+    Examples:
+      | streaming | n_predict |
+      | disabled  | 128       |
+      | enabled   | 64        |
+
   Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
     Given a prompt:
       """
@@ -231,6 +231,7 @@ async def step_oai_chat_completions(context, api_error):
    completion = await oai_chat_completions(context.prompts.pop(),
                                            context.system_prompt,
                                            context.base_url,
+                                           '/v1/chat',
                                            False,
                                            model=context.model if hasattr(context, 'model') else None,
@@ -288,6 +289,28 @@ async def step_oai_chat_completions(context):
                               # user_prompt is inserted automatically
                               context.system_prompt,
                               context.base_url,
+                              '/v1/chat/completions',
+                              True,  # async_client
+                              model=context.model
+                              if hasattr(context, 'model') else None,
+                              n_predict=context.n_predict
+                              if hasattr(context, 'n_predict') else None,
+                              enable_streaming=context.enable_streaming
+                              if hasattr(context, 'enable_streaming') else None,
+                              server_seed=context.server_seed
+                              if hasattr(context, 'server_seed') else None,
+                              user_api_key=context.user_api_key
+                              if hasattr(context, 'user_api_key') else None)
+
+
+@step(u'concurrent OAI completions requests no v1')
+@async_run_until_complete
+async def step_oai_chat_completions(context):
+    await concurrent_requests(context, oai_chat_completions,
+                              # user_prompt is inserted automatically
+                              context.system_prompt,
+                              context.base_url,
+                              '/chat/completions',
                               True,  # async_client
                               model=context.model
                               if hasattr(context, 'model') else None,
@@ -497,6 +520,7 @@ async def request_completion(prompt,
 async def oai_chat_completions(user_prompt,
                                system_prompt,
                                base_url,
+                               base_path,
                                async_client,
                                debug=False,
                                model=None,
@@ -537,7 +561,7 @@ async def oai_chat_completions(user_prompt,
     origin = 'llama.cpp'
     headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
     async with aiohttp.ClientSession() as session:
-        async with session.post(f'{base_url}/v1/chat/completions',
+        async with session.post(f'{base_url}{base_path}',
                                 json=payload,
                                 headers=headers) as response:
             if enable_streaming:
@@ -579,7 +603,7 @@ async def oai_chat_completions(user_prompt,
     else:
         try:
             openai.api_key = user_api_key
-            openai.api_base = f'{base_url}/v1/chat'
+            openai.api_base = f'{base_url}{base_path}'
             chat_completion = openai.Completion.create(
                 messages=payload['messages'],
                 model=model,
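For orientation, a minimal sketch of how the extended test helper might be driven once base_path exists (it assumes oai_chat_completions and a behave context from the step definitions above are in scope; the wrapper function itself is hypothetical):

async def exercise_both_routes(context):
    # '/v1/chat/completions' keeps the original behaviour; '/chat/completions'
    # exercises the alias added by this commit. async_client=True selects the
    # aiohttp code path, which appends base_path directly to base_url.
    for base_path in ('/v1/chat/completions', '/chat/completions'):
        await oai_chat_completions('Write another a poem.',
                                   context.system_prompt,
                                   context.base_url,
                                   base_path,
                                   True,  # async_client
                                   model=getattr(context, 'model', None))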