mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-31 22:04:35 +00:00
server : add docs
This commit is contained in:
parent
00b33760aa
commit
877a04ccff
@ -1093,7 +1093,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
|||||||
}
|
}
|
||||||
).set_sparam());
|
).set_sparam());
|
||||||
add_opt(llama_arg(
|
add_opt(llama_arg(
|
||||||
{"--pooling"}, "{none,mean,cls,last, rank}",
|
{"--pooling"}, "{none,mean,cls,last,rank}",
|
||||||
"pooling type for embeddings, use model default if unspecified",
|
"pooling type for embeddings, use model default if unspecified",
|
||||||
[](gpt_params & params, const std::string & value) {
|
[](gpt_params & params, const std::string & value) {
|
||||||
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
|
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
|
||||||
|
@ -7,6 +7,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
|
|||||||
**Features:**
|
**Features:**
|
||||||
* LLM inference of F16 and quantized models on GPU and CPU
|
* LLM inference of F16 and quantized models on GPU and CPU
|
||||||
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
|
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
|
||||||
|
* Reranking endpoint (WIP: https://github.com/ggerganov/llama.cpp/pull/9510)
|
||||||
* Parallel decoding with multi-user support
|
* Parallel decoding with multi-user support
|
||||||
* Continuous batching
|
* Continuous batching
|
||||||
* Multimodal (wip)
|
* Multimodal (wip)
|
||||||
@ -130,7 +131,7 @@ The project is under active development, and we are [looking for feedback and co
|
|||||||
| `--no-context-shift` | disables context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
|
| `--no-context-shift` | disables context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
|
||||||
| `-sp, --special` | special tokens output enabled (default: false) |
|
| `-sp, --special` | special tokens output enabled (default: false) |
|
||||||
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
|
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
|
||||||
| `--pooling {none,mean,cls,last}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
|
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
|
||||||
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
||||||
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
|
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
|
||||||
| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
|
| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
|
||||||
@ -478,6 +479,39 @@ The same as [the embedding example](../embedding) does.
|
|||||||
|
|
||||||
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
|
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
|
||||||
|
|
||||||
|
### POST `/reranking`: Rerank documents according to a given query
|
||||||
|
|
||||||
|
Similar to https://jina.ai/reranker/ but might change in the future.
|
||||||
|
Requires a reranker model (such as [bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3)) and the `--embedding --pooling rank` options.
|
||||||
|
|
||||||
|
*Options:*
|
||||||
|
|
||||||
|
`query`: The query against which the documents will be ranked.
|
||||||
|
|
||||||
|
`documents`: An array of strings representing the documents to be ranked.
|
||||||
|
|
||||||
|
*Aliases:*
|
||||||
|
- `/rerank`
|
||||||
|
- `/v1/rerank`
|
||||||
|
- `/v1/reranking`
|
||||||
|
|
||||||
|
*Examples:*
|
||||||
|
|
||||||
|
```shell
|
||||||
|
curl http://127.0.0.1:8012/v1/rerank \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"model": "some-model",
|
||||||
|
"query": "What is panda?",
|
||||||
|
"top_n": 3,
|
||||||
|
"documents": [
|
||||||
|
"hi",
|
||||||
|
"it is a bear",
|
||||||
|
"The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China."
|
||||||
|
]
|
||||||
|
}' | jq
|
||||||
|
```
|
||||||
|
|
||||||
### POST `/infill`: For code infilling.
|
### POST `/infill`: For code infilling.
|
||||||
|
|
||||||
Takes a prefix and a suffix and returns the predicted completion as stream.
|
Takes a prefix and a suffix and returns the predicted completion as stream.
|
||||||
|
@ -3297,7 +3297,9 @@ int main(int argc, char ** argv) {
|
|||||||
svr->Post("/embeddings", handle_embeddings);
|
svr->Post("/embeddings", handle_embeddings);
|
||||||
svr->Post("/v1/embeddings", handle_embeddings);
|
svr->Post("/v1/embeddings", handle_embeddings);
|
||||||
svr->Post("/rerank", handle_rerank);
|
svr->Post("/rerank", handle_rerank);
|
||||||
|
svr->Post("/reranking", handle_rerank);
|
||||||
svr->Post("/v1/rerank", handle_rerank);
|
svr->Post("/v1/rerank", handle_rerank);
|
||||||
|
svr->Post("/v1/reranking", handle_rerank);
|
||||||
svr->Post("/tokenize", handle_tokenize);
|
svr->Post("/tokenize", handle_tokenize);
|
||||||
svr->Post("/detokenize", handle_detokenize);
|
svr->Post("/detokenize", handle_detokenize);
|
||||||
// LoRA adapters hotswap
|
// LoRA adapters hotswap
|
||||||
|
Loading…
Reference in New Issue
Block a user