server : add OAI compat for /v1/completions (#10974)
Some checks failed
flake8 Lint / Lint (push) Waiting to run
Python Type-Check / pyright type-check (push) Waiting to run
Python check requirements.txt / check-requirements (push) Has been cancelled

* server : add OAI compat for /v1/completions

* add test

* add docs

* better docs
This commit is contained in:
Xuan Son Nguyen 2024-12-31 12:34:13 +01:00 committed by GitHub
parent bc7b1f8632
commit 5896c65232
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 400 additions and 146 deletions

View File

@ -345,7 +345,7 @@ node index.js
> [!IMPORTANT] > [!IMPORTANT]
> >
> This endpoint is **not** OAI-compatible > This endpoint is **not** OAI-compatible. For OAI-compatible client, use `/v1/completions` instead.
*Options:* *Options:*
@ -523,6 +523,7 @@ These words will not be included in the completion, so make sure to add them to
- `tokens_evaluated`: Number of tokens evaluated in total from the prompt - `tokens_evaluated`: Number of tokens evaluated in total from the prompt
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`) - `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
### POST `/tokenize`: Tokenize a given text ### POST `/tokenize`: Tokenize a given text
*Options:* *Options:*
@ -574,6 +575,10 @@ With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
### POST `/embedding`: Generate embedding of a given text ### POST `/embedding`: Generate embedding of a given text
> [!IMPORTANT]
>
> This endpoint is **not** OAI-compatible. For OAI-compatible client, use `/v1/embeddings` instead.
The same as [the embedding example](../embedding) does. The same as [the embedding example](../embedding) does.
*Options:* *Options:*
@ -744,96 +749,6 @@ To use this endpoint with POST method, you need to start server with `--props`
- None yet - None yet
### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
*Options:*
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": {"type: "string" }, "title": "Participants", "type": "string" } } } }`), similar to other OpenAI-inspired API providers.
*Examples:*
You can use either Python `openai` library with appropriate checkpoints:
```python
import openai
client = openai.OpenAI(
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
api_key = "sk-no-key-required"
)
completion = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
{"role": "user", "content": "Write a limerick about python exceptions"}
]
)
print(completion.choices[0].message)
```
... or raw HTTP requests:
```shell
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "system",
"content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
},
{
"role": "user",
"content": "Write a limerick about python exceptions"
}
]
}'
```
### POST `/v1/embeddings`: OpenAI-compatible embeddings API
This endpoint requires that the model uses a pooling different than type `none`. The embeddings are normalized using the Eucledian norm.
*Options:*
See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
*Examples:*
- input as string
```shell
curl http://localhost:8080/v1/embeddings \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"input": "hello",
"model":"GPT-4",
"encoding_format": "float"
}'
```
- `input` as string array
```shell
curl http://localhost:8080/v1/embeddings \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"input": ["hello", "world"],
"model":"GPT-4",
"encoding_format": "float"
}'
```
### POST `/embeddings`: non-OpenAI-compatible embeddings API ### POST `/embeddings`: non-OpenAI-compatible embeddings API
This endpoint supports all poolings, including `--pooling none`. When the pooling is `none`, the responses will contain the *unnormalized* embeddings for *all* input tokens. For all other pooling types, only the pooled embeddings are returned, normalized using Euclidian norm. This endpoint supports all poolings, including `--pooling none`. When the pooling is `none`, the responses will contain the *unnormalized* embeddings for *all* input tokens. For all other pooling types, only the pooled embeddings are returned, normalized using Euclidian norm.
@ -1064,6 +979,161 @@ To know the `id` of the adapter, use GET `/lora-adapters`
] ]
``` ```
## OpenAI-compatible API Endpoints
### GET `/v1/models`: OpenAI-compatible Model Info API
Returns information about the loaded model. See [OpenAI Models API documentation](https://platform.openai.com/docs/api-reference/models).
The returned list always has one single element.
By default, model `id` field is the path to model file, specified via `-m`. You can set a custom value for model `id` field via `--alias` argument. For example, `--alias gpt-4o-mini`.
Example:
```json
{
"object": "list",
"data": [
{
"id": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
"object": "model",
"created": 1735142223,
"owned_by": "llamacpp",
"meta": {
"vocab_type": 2,
"n_vocab": 128256,
"n_ctx_train": 131072,
"n_embd": 4096,
"n_params": 8030261312,
"size": 4912898304
}
}
]
}
```
### POST `/v1/completions`: OpenAI-compatible Completions API
Given an input `prompt`, it returns the predicted completion. Streaming mode is also supported. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps.
*Options:*
See [OpenAI Completions API documentation](https://platform.openai.com/docs/api-reference/completions).
llama.cpp `/completion`-specific features such as `mirostat` are supported.
*Examples:*
Example usage with `openai` python library:
```python
import openai
client = openai.OpenAI(
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
api_key = "sk-no-key-required"
)
completion = client.completions.create(
model="davinci-002",
prompt="I believe the meaning of life is",
max_tokens=8
)
print(completion.choices[0].text)
```
### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
*Options:*
See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": {"type: "string" }, "title": "Participants", "type": "string" } } } }`), similar to other OpenAI-inspired API providers.
*Examples:*
You can use either Python `openai` library with appropriate checkpoints:
```python
import openai
client = openai.OpenAI(
base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
api_key = "sk-no-key-required"
)
completion = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
{"role": "user", "content": "Write a limerick about python exceptions"}
]
)
print(completion.choices[0].message)
```
... or raw HTTP requests:
```shell
curl http://localhost:8080/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "system",
"content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
},
{
"role": "user",
"content": "Write a limerick about python exceptions"
}
]
}'
```
### POST `/v1/embeddings`: OpenAI-compatible embeddings API
This endpoint requires that the model uses a pooling different than type `none`. The embeddings are normalized using the Eucledian norm.
*Options:*
See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings).
*Examples:*
- input as string
```shell
curl http://localhost:8080/v1/embeddings \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"input": "hello",
"model":"GPT-4",
"encoding_format": "float"
}'
```
- `input` as string array
```shell
curl http://localhost:8080/v1/embeddings \
-H "Content-Type: application/json" \
-H "Authorization: Bearer no-key" \
-d '{
"input": ["hello", "world"],
"model":"GPT-4",
"encoding_format": "float"
}'
```
## More examples ## More examples
### Interactive mode ### Interactive mode

View File

@ -67,6 +67,13 @@ enum server_task_type {
SERVER_TASK_TYPE_SET_LORA, SERVER_TASK_TYPE_SET_LORA,
}; };
enum oaicompat_type {
OAICOMPAT_TYPE_NONE,
OAICOMPAT_TYPE_CHAT,
OAICOMPAT_TYPE_COMPLETION,
OAICOMPAT_TYPE_EMBEDDING,
};
// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
enum error_type { enum error_type {
ERROR_TYPE_INVALID_REQUEST, ERROR_TYPE_INVALID_REQUEST,
@ -101,11 +108,10 @@ struct slot_params {
struct common_params_speculative speculative; struct common_params_speculative speculative;
// OAI-compat fields // OAI-compat fields
bool verbose = false; bool verbose = false;
bool oaicompat = false; oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
bool oaicompat_chat = true; std::string oaicompat_model;
std::string oaicompat_model; std::string oaicompat_cmpl_id;
std::string oaicompat_cmpl_id;
json to_json() const { json to_json() const {
std::vector<std::string> samplers; std::vector<std::string> samplers;
@ -529,11 +535,10 @@ struct server_task_result_cmpl_final : server_task_result {
slot_params generation_params; slot_params generation_params;
// OAI-compat fields // OAI-compat fields
bool verbose = false; bool verbose = false;
bool oaicompat = false; oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
bool oaicompat_chat = true; // TODO: support oaicompat for non-chat std::string oaicompat_model;
std::string oaicompat_model; std::string oaicompat_cmpl_id;
std::string oaicompat_cmpl_id;
virtual int get_index() override { virtual int get_index() override {
return index; return index;
@ -544,9 +549,16 @@ struct server_task_result_cmpl_final : server_task_result {
} }
virtual json to_json() override { virtual json to_json() override {
return oaicompat switch (oaicompat) {
? (stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat()) case OAICOMPAT_TYPE_NONE:
: to_json_non_oaicompat(); return to_json_non_oaicompat();
case OAICOMPAT_TYPE_COMPLETION:
return to_json_oaicompat();
case OAICOMPAT_TYPE_CHAT:
return stream ? to_json_oaicompat_chat_stream() : to_json_oaicompat_chat();
default:
GGML_ASSERT(false && "Invalid oaicompat_type");
}
} }
json to_json_non_oaicompat() { json to_json_non_oaicompat() {
@ -574,6 +586,50 @@ struct server_task_result_cmpl_final : server_task_result {
return response_fields.empty() ? res : json_get_nested_values(response_fields, res); return response_fields.empty() ? res : json_get_nested_values(response_fields, res);
} }
json to_json_oaicompat() {
std::time_t t = std::time(0);
json logprobs = json(nullptr); // OAI default to null
if (!stream && probs_output.size() > 0) {
logprobs = json{
{"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)},
};
}
json finish_reason = "length";
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
finish_reason = "stop";
}
json res = json {
{"choices", json::array({
json{
{"text", stream ? "" : content}, // in stream mode, content is already in last partial chunk
{"index", index},
{"logprobs", logprobs},
{"finish_reason", finish_reason},
}
})},
{"created", t},
{"model", oaicompat_model},
{"system_fingerprint", build_info},
{"object", "text_completion"},
{"usage", json {
{"completion_tokens", n_decoded},
{"prompt_tokens", n_prompt_tokens},
{"total_tokens", n_decoded + n_prompt_tokens}
}},
{"id", oaicompat_cmpl_id}
};
// extra fields for debugging purposes
if (verbose) {
res["__verbose"] = to_json_non_oaicompat();
}
if (timings.prompt_n >= 0) {
res.push_back({"timings", timings.to_json()});
}
return res;
}
json to_json_oaicompat_chat() { json to_json_oaicompat_chat() {
std::string finish_reason = "length"; std::string finish_reason = "length";
if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) {
@ -671,11 +727,10 @@ struct server_task_result_cmpl_partial : server_task_result {
result_timings timings; result_timings timings;
// OAI-compat fields // OAI-compat fields
bool verbose = false; bool verbose = false;
bool oaicompat = false; oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
bool oaicompat_chat = true; // TODO: support oaicompat for non-chat std::string oaicompat_model;
std::string oaicompat_model; std::string oaicompat_cmpl_id;
std::string oaicompat_cmpl_id;
virtual int get_index() override { virtual int get_index() override {
return index; return index;
@ -686,7 +741,16 @@ struct server_task_result_cmpl_partial : server_task_result {
} }
virtual json to_json() override { virtual json to_json() override {
return oaicompat ? to_json_oaicompat() : to_json_non_oaicompat(); switch (oaicompat) {
case OAICOMPAT_TYPE_NONE:
return to_json_non_oaicompat();
case OAICOMPAT_TYPE_COMPLETION:
return to_json_oaicompat();
case OAICOMPAT_TYPE_CHAT:
return to_json_oaicompat_chat();
default:
GGML_ASSERT(false && "Invalid oaicompat_type");
}
} }
json to_json_non_oaicompat() { json to_json_non_oaicompat() {
@ -711,6 +775,41 @@ struct server_task_result_cmpl_partial : server_task_result {
} }
json to_json_oaicompat() { json to_json_oaicompat() {
std::time_t t = std::time(0);
json logprobs = json(nullptr); // OAI default to null
if (prob_output.probs.size() > 0) {
logprobs = json{
{"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)},
};
}
json res = json {
{"choices", json::array({
json{
{"text", content},
{"index", index},
{"logprobs", logprobs},
{"finish_reason", nullptr},
}
})},
{"created", t},
{"model", oaicompat_model},
{"system_fingerprint", build_info},
{"object", "text_completion"},
{"id", oaicompat_cmpl_id}
};
// extra fields for debugging purposes
if (verbose) {
res["__verbose"] = to_json_non_oaicompat();
}
if (timings.prompt_n >= 0) {
res.push_back({"timings", timings.to_json()});
}
return res;
}
json to_json_oaicompat_chat() {
bool first = n_decoded == 0; bool first = n_decoded == 0;
std::time_t t = std::time(0); std::time_t t = std::time(0);
json choices; json choices;
@ -789,14 +888,16 @@ struct server_task_result_embd : server_task_result {
int32_t n_tokens; int32_t n_tokens;
// OAI-compat fields // OAI-compat fields
bool oaicompat = false; oaicompat_type oaicompat = OAICOMPAT_TYPE_NONE;
virtual int get_index() override { virtual int get_index() override {
return index; return index;
} }
virtual json to_json() override { virtual json to_json() override {
return oaicompat ? to_json_oaicompat() : to_json_non_oaicompat(); return oaicompat == OAICOMPAT_TYPE_EMBEDDING
? to_json_oaicompat()
: to_json_non_oaicompat();
} }
json to_json_non_oaicompat() { json to_json_non_oaicompat() {
@ -2044,7 +2145,6 @@ struct server_context {
res->verbose = slot.params.verbose; res->verbose = slot.params.verbose;
res->oaicompat = slot.params.oaicompat; res->oaicompat = slot.params.oaicompat;
res->oaicompat_chat = slot.params.oaicompat_chat;
res->oaicompat_model = slot.params.oaicompat_model; res->oaicompat_model = slot.params.oaicompat_model;
res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
@ -2085,7 +2185,6 @@ struct server_context {
res->verbose = slot.params.verbose; res->verbose = slot.params.verbose;
res->stream = slot.params.stream; res->stream = slot.params.stream;
res->oaicompat = slot.params.oaicompat; res->oaicompat = slot.params.oaicompat;
res->oaicompat_chat = slot.params.oaicompat_chat;
res->oaicompat_model = slot.params.oaicompat_model; res->oaicompat_model = slot.params.oaicompat_model;
res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
@ -3506,12 +3605,11 @@ int main(int argc, char ** argv) {
// handle completion-like requests (completion, chat, infill) // handle completion-like requests (completion, chat, infill)
// we can optionally provide a custom format for partial results and final results // we can optionally provide a custom format for partial results and final results
const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok]( const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
server_task_type type, server_task_type type,
json & data, json & data,
httplib::Response & res, httplib::Response & res,
bool oaicompat = false, oaicompat_type oaicompat) {
bool oaicompat_chat = false) {
GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
if (ctx_server.params_base.embedding) { if (ctx_server.params_base.embedding) {
@ -3536,9 +3634,8 @@ int main(int argc, char ** argv) {
task.id_selected_slot = json_value(data, "id_slot", -1); task.id_selected_slot = json_value(data, "id_slot", -1);
// OAI-compat // OAI-compat
task.params.oaicompat = oaicompat; task.params.oaicompat = oaicompat;
task.params.oaicompat_chat = oaicompat_chat; task.params.oaicompat_cmpl_id = completion_id;
task.params.oaicompat_cmpl_id = completion_id;
// oaicompat_model is already populated by params_from_json_cmpl // oaicompat_model is already populated by params_from_json_cmpl
tasks.push_back(task); tasks.push_back(task);
@ -3589,7 +3686,7 @@ int main(int argc, char ** argv) {
}, [&](const json & error_data) { }, [&](const json & error_data) {
server_sent_event(sink, "error", error_data); server_sent_event(sink, "error", error_data);
}); });
if (oaicompat) { if (oaicompat != OAICOMPAT_TYPE_NONE) {
static const std::string ev_done = "data: [DONE]\n\n"; static const std::string ev_done = "data: [DONE]\n\n";
sink.write(ev_done.data(), ev_done.size()); sink.write(ev_done.data(), ev_done.size());
} }
@ -3605,17 +3702,25 @@ int main(int argc, char ** argv) {
} }
}; };
const auto handle_completions = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) { const auto handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
json data = json::parse(req.body); json data = json::parse(req.body);
return handle_completions_generic( return handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION, SERVER_TASK_TYPE_COMPLETION,
data, data,
res, res,
/* oaicompat */ false, OAICOMPAT_TYPE_NONE);
/* oaicompat_chat */ false);
}; };
const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
json data = oaicompat_completion_params_parse(json::parse(req.body));
return handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION,
data,
res,
OAICOMPAT_TYPE_COMPLETION);
};
const auto handle_infill = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
// check model compatibility // check model compatibility
std::string err; std::string err;
if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) { if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) {
@ -3684,22 +3789,25 @@ int main(int argc, char ** argv) {
tokenized_prompts[0] tokenized_prompts[0]
); );
return handle_completions_generic(SERVER_TASK_TYPE_INFILL, data, res); return handle_completions_impl(
SERVER_TASK_TYPE_INFILL,
data,
res,
OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
}; };
const auto handle_chat_completions = [&ctx_server, &params, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { const auto handle_chat_completions = [&ctx_server, &params, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
if (ctx_server.params_base.embedding) { if (ctx_server.params_base.embedding) {
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return; return;
} }
json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); json data = oaicompat_chat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template);
return handle_completions_generic( return handle_completions_impl(
SERVER_TASK_TYPE_COMPLETION, SERVER_TASK_TYPE_COMPLETION,
data, data,
res, res,
/* oaicompat */ true, OAICOMPAT_TYPE_CHAT);
/* oaicompat_chat */ true);
}; };
const auto handle_models = [&params, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { const auto handle_models = [&params, &ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
@ -3772,10 +3880,10 @@ int main(int argc, char ** argv) {
res_ok(res, data); res_ok(res, data);
}; };
const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, bool oaicompat) { const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, oaicompat_type oaicompat) {
const json body = json::parse(req.body); const json body = json::parse(req.body);
if (oaicompat && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST)); res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST));
return; return;
} }
@ -3785,7 +3893,7 @@ int main(int argc, char ** argv) {
if (body.count("input") != 0) { if (body.count("input") != 0) {
prompt = body.at("input"); prompt = body.at("input");
} else if (body.contains("content")) { } else if (body.contains("content")) {
oaicompat = false; oaicompat = OAICOMPAT_TYPE_NONE; // "content" field is not OAI compatible
prompt = body.at("content"); prompt = body.at("content");
} else { } else {
res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST)); res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
@ -3854,16 +3962,18 @@ int main(int argc, char ** argv) {
} }
// write JSON response // write JSON response
json root = oaicompat ? format_embeddings_response_oaicompat(body, responses, use_base64) : json(responses); json root = oaicompat == OAICOMPAT_TYPE_EMBEDDING
? format_embeddings_response_oaicompat(body, responses, use_base64)
: json(responses);
res_ok(res, root); res_ok(res, root);
}; };
const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
handle_embeddings_impl(req, res, false); handle_embeddings_impl(req, res, OAICOMPAT_TYPE_NONE);
}; };
const auto handle_embeddings_oai = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { const auto handle_embeddings_oai = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) {
handle_embeddings_impl(req, res, true); handle_embeddings_impl(req, res, OAICOMPAT_TYPE_EMBEDDING);
}; };
const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
@ -4033,7 +4143,7 @@ int main(int argc, char ** argv) {
svr->Get ("/v1/models", handle_models); // public endpoint (no API key check) svr->Get ("/v1/models", handle_models); // public endpoint (no API key check)
svr->Post("/completion", handle_completions); // legacy svr->Post("/completion", handle_completions); // legacy
svr->Post("/completions", handle_completions); svr->Post("/completions", handle_completions);
svr->Post("/v1/completions", handle_completions); svr->Post("/v1/completions", handle_completions_oai);
svr->Post("/chat/completions", handle_chat_completions); svr->Post("/chat/completions", handle_chat_completions);
svr->Post("/v1/chat/completions", handle_chat_completions); svr->Post("/v1/chat/completions", handle_chat_completions);
svr->Post("/infill", handle_infill); svr->Post("/infill", handle_infill);

View File

@ -83,7 +83,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
def test_chat_completion_with_openai_library(): def test_chat_completion_with_openai_library():
global server global server
server.start() server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
res = client.chat.completions.create( res = client.chat.completions.create(
model="gpt-3.5-turbo-instruct", model="gpt-3.5-turbo-instruct",
messages=[ messages=[
@ -170,7 +170,7 @@ def test_chat_completion_with_timings_per_token():
def test_logprobs(): def test_logprobs():
global server global server
server.start() server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
res = client.chat.completions.create( res = client.chat.completions.create(
model="gpt-3.5-turbo-instruct", model="gpt-3.5-turbo-instruct",
temperature=0.0, temperature=0.0,
@ -197,7 +197,7 @@ def test_logprobs():
def test_logprobs_stream(): def test_logprobs_stream():
global server global server
server.start() server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
res = client.chat.completions.create( res = client.chat.completions.create(
model="gpt-3.5-turbo-instruct", model="gpt-3.5-turbo-instruct",
temperature=0.0, temperature=0.0,

View File

@ -1,5 +1,6 @@
import pytest import pytest
import time import time
from openai import OpenAI
from utils import * from utils import *
server = ServerPreset.tinyllama2() server = ServerPreset.tinyllama2()
@ -85,6 +86,40 @@ def test_completion_stream_vs_non_stream():
assert content_stream == res_non_stream.body["content"] assert content_stream == res_non_stream.body["content"]
def test_completion_stream_with_openai_library():
global server
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
res = client.completions.create(
model="davinci-002",
prompt="I believe the meaning of life is",
max_tokens=8,
)
assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b")
assert res.choices[0].finish_reason == "length"
assert res.choices[0].text is not None
assert match_regex("(going|bed)+", res.choices[0].text)
def test_completion_with_openai_library():
global server
server.start()
client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
res = client.completions.create(
model="davinci-002",
prompt="I believe the meaning of life is",
max_tokens=8,
stream=True,
)
output_text = ''
for data in res:
choice = data.choices[0]
if choice.finish_reason is None:
assert choice.text is not None
output_text += choice.text
assert match_regex("(going|bed)+", output_text)
@pytest.mark.parametrize("n_slots", [1, 2]) @pytest.mark.parametrize("n_slots", [1, 2])
def test_consistent_result_same_seed(n_slots: int): def test_consistent_result_same_seed(n_slots: int):
global server global server

View File

@ -549,10 +549,49 @@ static bool server_sent_event(httplib::DataSink & sink, const char * event, cons
// OAI utils // OAI utils
// //
static json oaicompat_completion_params_parse( static json oaicompat_completion_params_parse(const json & body) {
const struct llama_model * model, json llama_params;
const json & body, /* openai api json semantics */
const std::string & chat_template) { if (!body.contains("prompt")) {
throw std::runtime_error("\"prompt\" is required");
}
// Handle "stop" field
if (body.contains("stop") && body.at("stop").is_string()) {
llama_params["stop"] = json::array({body.at("stop").get<std::string>()});
} else {
llama_params["stop"] = json_value(body, "stop", json::array());
}
// Handle "n" field
int n_choices = json_value(body, "n", 1);
if (n_choices != 1) {
throw std::runtime_error("Only one completion choice is allowed");
}
// Params supported by OAI but unsupported by llama.cpp
static const std::vector<std::string> unsupported_params { "best_of", "echo", "suffix" };
for (const auto & param : unsupported_params) {
if (body.contains(param)) {
throw std::runtime_error("Unsupported param: " + param);
}
}
// Copy remaining properties to llama_params
for (const auto & item : body.items()) {
// Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens"
if (!llama_params.contains(item.key()) || item.key() == "n_predict") {
llama_params[item.key()] = item.value();
}
}
return llama_params;
}
static json oaicompat_chat_completion_params_parse(
const struct llama_model * model,
const json & body, /* openai api json semantics */
const std::string & chat_template) {
json llama_params; json llama_params;
// Apply chat template to the list of messages // Apply chat template to the list of messages