Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2024-12-25 02:44:36 +00:00)
server : add "/chat/completions" alias for "/v1/..." (#5722)
* Add "/chat/completions" as alias for "/v1/chat/completions" * merge to upstream master * minor : fix trailing whitespace --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
parent 7c4263d426
commit efc72253f7
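As a client-side illustration of what the alias provides (a minimal sketch only: the local server address is an assumption, and it presumes a llama.cpp server built from this commit is already running), the same OpenAI-style chat payload should now be accepted both with and without the "/v1" prefix:

import asyncio

import aiohttp


async def main():
    base_url = 'http://localhost:8080'  # hypothetical address of a running llama.cpp server
    payload = {
        'messages': [
            {'role': 'system', 'content': 'You are a writer.'},
            {'role': 'user', 'content': 'Write a very long book.'},
        ],
        'max_tokens': 64,
    }
    async with aiohttp.ClientSession() as session:
        # After this commit both routes are registered with the same handler.
        for path in ('/v1/chat/completions', '/chat/completions'):
            async with session.post(f'{base_url}{path}', json=payload) as response:
                print(path, response.status)

asyncio.run(main())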
@@ -3211,9 +3211,7 @@ int main(int argc, char **argv)
             res.set_content(models.dump(), "application/json; charset=utf-8");
         });

-    // TODO: add mount point without "/v1" prefix -- how?
-    svr.Post("/v1/chat/completions", [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res)
+    const auto chat_completions = [&llama, &validate_api_key, &sparams](const httplib::Request &req, httplib::Response &res)
             {
                 res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
                 if (!validate_api_key(req, res)) {
@@ -3291,7 +3289,10 @@ int main(int argc, char **argv)

             res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
         }
-    });
+    };
+
+    svr.Post("/chat/completions", chat_completions);
+    svr.Post("/v1/chat/completions", chat_completions);

     svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res)
             {
@@ -54,6 +54,28 @@ Feature: Parallel
       | disabled  | 128       |
       | enabled   | 64        |

+  Scenario Outline: Multi users OAI completions compatibility no v1
+    Given a system prompt You are a writer.
+    And a model tinyllama-2
+    Given a prompt:
+      """
+      Write a very long book.
+      """
+    And a prompt:
+      """
+      Write another a poem.
+      """
+    And <n_predict> max tokens to predict
+    And streaming is <streaming>
+    Given concurrent OAI completions requests no v1
+    Then the server is busy
+    Then the server is idle
+    Then all prompts are predicted with <n_predict> tokens
+    Examples:
+      | streaming | n_predict |
+      | disabled  | 128       |
+      | enabled   | 64        |
+
   Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969
     Given a prompt:
       """
@@ -231,6 +231,7 @@ async def step_oai_chat_completions(context, api_error):
    completion = await oai_chat_completions(context.prompts.pop(),
                                            context.system_prompt,
                                            context.base_url,
+                                           '/v1/chat',
                                            False,
                                            model=context.model if hasattr(context, 'model') else None,
@@ -288,6 +289,28 @@ async def step_oai_chat_completions(context):
                               # user_prompt is inserted automatically
                               context.system_prompt,
                               context.base_url,
+                              '/v1/chat/completions',
+                              True,  # async_client
+                              model=context.model
+                              if hasattr(context, 'model') else None,
+                              n_predict=context.n_predict
+                              if hasattr(context, 'n_predict') else None,
+                              enable_streaming=context.enable_streaming
+                              if hasattr(context, 'enable_streaming') else None,
+                              server_seed=context.server_seed
+                              if hasattr(context, 'server_seed') else None,
+                              user_api_key=context.user_api_key
+                              if hasattr(context, 'user_api_key') else None)
+
+
+@step(u'concurrent OAI completions requests no v1')
+@async_run_until_complete
+async def step_oai_chat_completions(context):
+    await concurrent_requests(context, oai_chat_completions,
+                              # user_prompt is inserted automatically
+                              context.system_prompt,
+                              context.base_url,
+                              '/chat/completions',
                               True,  # async_client
                               model=context.model
                               if hasattr(context, 'model') else None,
@@ -497,6 +520,7 @@ async def request_completion(prompt,
 async def oai_chat_completions(user_prompt,
                                system_prompt,
                                base_url,
+                               base_path,
                                async_client,
                                debug=False,
                                model=None,
@@ -537,7 +561,7 @@ async def oai_chat_completions(user_prompt,
     origin = 'llama.cpp'
     headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin}
     async with aiohttp.ClientSession() as session:
-        async with session.post(f'{base_url}/v1/chat/completions',
+        async with session.post(f'{base_url}{base_path}',
                                 json=payload,
                                 headers=headers) as response:
             if enable_streaming:
@@ -579,7 +603,7 @@ async def oai_chat_completions(user_prompt,
     else:
         try:
             openai.api_key = user_api_key
-            openai.api_base = f'{base_url}/v1/chat'
+            openai.api_base = f'{base_url}{base_path}'
             chat_completion = openai.Completion.create(
                 messages=payload['messages'],
                 model=model,
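For orientation, a minimal sketch of how the extended test helper might be driven once base_path exists (it assumes oai_chat_completions and a behave context from the step definitions above are in scope; the wrapper function itself is hypothetical):

async def exercise_both_routes(context):
    # '/v1/chat/completions' keeps the original behaviour; '/chat/completions'
    # exercises the alias added by this commit. async_client=True selects the
    # aiohttp code path, which appends base_path directly to base_url.
    for base_path in ('/v1/chat/completions', '/chat/completions'):
        await oai_chat_completions('Write another a poem.',
                                   context.system_prompt,
                                   context.base_url,
                                   base_path,
                                   True,  # async_client
                                   model=getattr(context, 'model', None))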