mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 19:50:17 +00:00
server : clarify /slots endpoint, add is_processing (#10162)
* server : clarify /slots endpoint, add is_processing * fix tests
This commit is contained in:
parent
6a066b9978
commit
9e0ecfb697
@ -692,7 +692,10 @@ Given a ChatML-formatted json description in `messages`, it returns the predicte
|
|||||||
|
|
||||||
### GET `/slots`: Returns the current slots processing state
|
### GET `/slots`: Returns the current slots processing state
|
||||||
|
|
||||||
This endpoint can be disabled with `--no-slots`
|
> [!WARNING]
|
||||||
|
> This endpoint is intended for debugging and may be modified in future versions. For security reasons, we strongly advise against enabling it in production environments.
|
||||||
|
|
||||||
|
This endpoint is disabled by default and can be enabled with `--slots`
|
||||||
|
|
||||||
If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there are no available slots.
|
If query param `?fail_on_no_slot=1` is set, this endpoint will respond with status code 503 if there are no available slots.
|
||||||
|
|
||||||
@ -709,6 +712,7 @@ Example:
|
|||||||
"grammar": "",
|
"grammar": "",
|
||||||
"id": 0,
|
"id": 0,
|
||||||
"ignore_eos": false,
|
"ignore_eos": false,
|
||||||
|
"is_processing": false,
|
||||||
"logit_bias": [],
|
"logit_bias": [],
|
||||||
"min_p": 0.05000000074505806,
|
"min_p": 0.05000000074505806,
|
||||||
"mirostat": 0,
|
"mirostat": 0,
|
||||||
@ -741,7 +745,6 @@ Example:
|
|||||||
"temperature"
|
"temperature"
|
||||||
],
|
],
|
||||||
"seed": 42,
|
"seed": 42,
|
||||||
"state": 1,
|
|
||||||
"stop": [
|
"stop": [
|
||||||
"\n"
|
"\n"
|
||||||
],
|
],
|
||||||
@ -755,10 +758,6 @@ Example:
|
|||||||
]
|
]
|
||||||
```
|
```
|
||||||
|
|
||||||
Possible values for `slot[i].state` are:
|
|
||||||
- `0`: SLOT_STATE_IDLE
|
|
||||||
- `1`: SLOT_STATE_PROCESSING
|
|
||||||
|
|
||||||
### GET `/metrics`: Prometheus compatible metrics exporter
|
### GET `/metrics`: Prometheus compatible metrics exporter
|
||||||
|
|
||||||
This endpoint is only accessible if `--metrics` is set.
|
This endpoint is only accessible if `--metrics` is set.
|
||||||
|
@ -1568,7 +1568,7 @@ struct server_context {
|
|||||||
json slot_data = get_formated_generation(slot);
|
json slot_data = get_formated_generation(slot);
|
||||||
slot_data["id"] = slot.id;
|
slot_data["id"] = slot.id;
|
||||||
slot_data["id_task"] = slot.id_task;
|
slot_data["id_task"] = slot.id_task;
|
||||||
slot_data["state"] = slot.state;
|
slot_data["is_processing"] = slot.is_processing();
|
||||||
slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens);
|
slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens);
|
||||||
slot_data["next_token"] = {
|
slot_data["next_token"] = {
|
||||||
{"has_next_token", slot.has_next_token},
|
{"has_next_token", slot.has_next_token},
|
||||||
@ -1581,10 +1581,10 @@ struct server_context {
|
|||||||
{"stopping_word", slot.stopping_word},
|
{"stopping_word", slot.stopping_word},
|
||||||
};
|
};
|
||||||
|
|
||||||
if (slot_data["state"] == SLOT_STATE_IDLE) {
|
if (slot.is_processing()) {
|
||||||
n_idle_slots++;
|
|
||||||
} else {
|
|
||||||
n_processing_slots++;
|
n_processing_slots++;
|
||||||
|
} else {
|
||||||
|
n_idle_slots++;
|
||||||
}
|
}
|
||||||
|
|
||||||
slots_data.push_back(slot_data);
|
slots_data.push_back(slot_data);
|
||||||
|
@ -260,13 +260,13 @@ async def step_wait_for_server_status(context, expecting_status: Literal['health
|
|||||||
async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
|
async def step_all_slots_status(context, expected_slot_status_string: Literal['idle', 'busy'] | str):
|
||||||
match expected_slot_status_string:
|
match expected_slot_status_string:
|
||||||
case 'idle':
|
case 'idle':
|
||||||
expected_slot_status = 0
|
expected_slot_status = False
|
||||||
case 'busy':
|
case 'busy':
|
||||||
expected_slot_status = 1
|
expected_slot_status = True
|
||||||
case _:
|
case _:
|
||||||
assert False, "unknown status"
|
assert False, "unknown status"
|
||||||
|
|
||||||
expected_slots = [{'id': slot_id, 'state': expected_slot_status}
|
expected_slots = [{'id': slot_id, 'is_processing': expected_slot_status}
|
||||||
for slot_id in range(context.n_slots)]
|
for slot_id in range(context.n_slots)]
|
||||||
await request_slots_status(context, expected_slots)
|
await request_slots_status(context, expected_slots)
|
||||||
|
|
||||||
@ -1354,8 +1354,8 @@ async def wait_for_slots_status(context,
|
|||||||
if status_code == 503 and status_code == expected_http_status_code:
|
if status_code == 503 and status_code == expected_http_status_code:
|
||||||
return
|
return
|
||||||
if status_code == 200 and status_code == expected_http_status_code:
|
if status_code == 200 and status_code == expected_http_status_code:
|
||||||
n_slots_idle = sum(1 if slot["state"] == 0 else 0 for slot in slots)
|
n_slots_idle = sum(1 if not slot["is_processing"] else 0 for slot in slots)
|
||||||
n_slots_processing = sum(1 if slot["state"] != 0 else 0 for slot in slots)
|
n_slots_processing = sum(1 if slot["is_processing"] else 0 for slot in slots)
|
||||||
if ((slots_idle is None or slots_idle == n_slots_idle)
|
if ((slots_idle is None or slots_idle == n_slots_idle)
|
||||||
and (slots_processing is None or slots_processing == n_slots_processing)):
|
and (slots_processing is None or slots_processing == n_slots_processing)):
|
||||||
return
|
return
|
||||||
|
Loading…
Reference in New Issue
Block a user