mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-09-22 21:16:20 +00:00
Merge d2c0111da1
into d09770cae7
This commit is contained in:
commit
9d82697e30
@ -249,23 +249,23 @@ mkdir llama-client
|
||||
cd llama-client
|
||||
```
|
||||
|
||||
Create a index.js file and put this inside:
|
||||
Create an index.js file and put this inside:
|
||||
|
||||
```javascript
|
||||
const prompt = `Building a website can be done in 10 simple steps:`;
|
||||
const prompt = `Building a website can be done in 10 simple steps:`
|
||||
|
||||
async function Test() {
|
||||
async function test() {
|
||||
let response = await fetch("http://127.0.0.1:8080/completion", {
|
||||
method: 'POST',
|
||||
method: "POST",
|
||||
body: JSON.stringify({
|
||||
prompt,
|
||||
n_predict: 512,
|
||||
n_predict: 64,
|
||||
})
|
||||
})
|
||||
console.log((await response.json()).content)
|
||||
}
|
||||
|
||||
Test()
|
||||
test()
|
||||
```
|
||||
|
||||
And run it:
|
||||
@ -274,6 +274,33 @@ And run it:
|
||||
node index.js
|
||||
```
|
||||
|
||||
Alternative script to test streaming mode (chunk splitting and error handling should be enhanced for production use):
|
||||
|
||||
```javascript
|
||||
(async () => {
|
||||
const response = await fetch("http://localhost:8080/completion", {
|
||||
method: "POST",
|
||||
body: JSON.stringify({
|
||||
prompt: "To write an essay quickly",
|
||||
n_predict: 256,
|
||||
stream: true
|
||||
})
|
||||
})
|
||||
for await (const chunk of response.body.pipeThrough(new TextDecoderStream("utf-8"))) {
|
||||
for (const event of chunk.split(/(?<=\n\n)/v)) {
|
||||
if (event.startsWith("error")) {
|
||||
break
|
||||
}
|
||||
const data = JSON.parse(event.substring(6))
|
||||
if (data.stop) {
|
||||
break
|
||||
}
|
||||
process.stdout.write(data.content)
|
||||
}
|
||||
}
|
||||
})()
|
||||
```
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### GET `/health`: Returns health check result
|
||||
@ -314,7 +341,7 @@ node index.js
|
||||
`n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. The number excludes the BOS token.
|
||||
By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt.
|
||||
|
||||
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
|
||||
`stream`: Allows receiving each predicted token in real-time instead of waiting for the completion to finish (uses a different response format). To enable this, set to `true`.
|
||||
|
||||
`stop`: Specify a JSON array of stopping strings.
|
||||
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
|
||||
@ -402,6 +429,16 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||
- `tokens_evaluated`: Number of tokens evaluated in total from the prompt
|
||||
- `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens_predicted`) exceeded the context size (`n_ctx`)
|
||||
|
||||
In streaming mode, response chunks currently use the following format, with chunks separated by `\n\n`:
|
||||
|
||||
```
|
||||
data: {"content":" token","stop":false,"id_slot":0,"multimodal":false,"index":0}
|
||||
|
||||
data: {"content":",","stop":false,"id_slot":0,"multimodal":false,"index":0}
|
||||
```
|
||||
|
||||
Although this resembles the [Server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events) standard, the `EventSource` interface cannot be used due to its lack of `POST` request support.
|
||||
|
||||
### POST `/tokenize`: Tokenize a given text
|
||||
|
||||
*Options:*
|
||||
|
@ -296,7 +296,7 @@ static bool server_sent_event(httplib::DataSink & sink, const char * event, cons
|
||||
const std::string str =
|
||||
std::string(event) + ": " +
|
||||
data.dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||
"\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain)
|
||||
"\n\n"; // The server-sent events standard requires each event to end with 2 newlines.
|
||||
|
||||
LOG_DBG("data stream, to_send: %s", str.c_str());
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user