mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-29 04:44:34 +00:00
server : add docs
This commit is contained in:
parent
00b33760aa
commit
877a04ccff
@ -1093,7 +1093,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
|
||||
}
|
||||
).set_sparam());
|
||||
add_opt(llama_arg(
|
||||
{"--pooling"}, "{none,mean,cls,last, rank}",
|
||||
{"--pooling"}, "{none,mean,cls,last,rank}",
|
||||
"pooling type for embeddings, use model default if unspecified",
|
||||
[](gpt_params & params, const std::string & value) {
|
||||
/**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
|
||||
|
@ -7,6 +7,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
|
||||
**Features:**
|
||||
* LLM inference of F16 and quantized models on GPU and CPU
|
||||
* [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
|
||||
* Reranking endpoint (WIP: https://github.com/ggerganov/llama.cpp/pull/9510)
|
||||
* Parallel decoding with multi-user support
|
||||
* Continuous batching
|
||||
* Multimodal (wip)
|
||||
@ -130,7 +131,7 @@ The project is under active development, and we are [looking for feedback and co
|
||||
| `--no-context-shift` | disables context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
|
||||
| `-sp, --special` | special tokens output enabled (default: false) |
|
||||
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
|
||||
| `--pooling {none,mean,cls,last}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
|
||||
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
|
||||
| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
|
||||
| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
|
||||
| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_ALIAS) |
|
||||
@ -478,6 +479,39 @@ The same as [the embedding example](../embedding) does.
|
||||
|
||||
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be referenced in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
|
||||
|
||||
### POST `/reranking`: Rerank documents according to a given query
|
||||
|
||||
Similar to https://jina.ai/reranker/ but might change in the future.
|
||||
Requires a reranker model (such as [bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3)) and the `--embedding --pooling rank` options.
|
||||
|
||||
*Options:*
|
||||
|
||||
`query`: The query against which the documents will be ranked.
|
||||
|
||||
`documents`: An array of strings representing the documents to be ranked.
|
||||
|
||||
*Aliases:*
|
||||
- `/rerank`
|
||||
- `/v1/rerank`
|
||||
- `/v1/reranking`
|
||||
|
||||
*Examples:*
|
||||
|
||||
```shell
|
||||
curl http://127.0.0.1:8012/v1/rerank \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "some-model",
|
||||
"query": "What is panda?",
|
||||
"top_n": 3,
|
||||
"documents": [
|
||||
"hi",
|
||||
"it is a bear",
|
||||
"The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China."
|
||||
]
|
||||
}' | jq
|
||||
```
|
||||
|
||||
### POST `/infill`: For code infilling.
|
||||
|
||||
Takes a prefix and a suffix and returns the predicted completion as a stream.
|
||||
|
@ -3297,7 +3297,9 @@ int main(int argc, char ** argv) {
|
||||
svr->Post("/embeddings", handle_embeddings);
|
||||
svr->Post("/v1/embeddings", handle_embeddings);
|
||||
svr->Post("/rerank", handle_rerank);
|
||||
svr->Post("/reranking", handle_rerank);
|
||||
svr->Post("/v1/rerank", handle_rerank);
|
||||
svr->Post("/v1/reranking", handle_rerank);
|
||||
svr->Post("/tokenize", handle_tokenize);
|
||||
svr->Post("/detokenize", handle_detokenize);
|
||||
// LoRA adapters hotswap
|
||||
|
Loading…
Reference in New Issue
Block a user