account for both api and web browser requests
parent daf64fc4a9
commit cb13382136
@@ -2591,11 +2591,15 @@ int main(int argc, char ** argv) {
         return false;
     };
 
-    auto middleware_server_state = [&state](const httplib::Request &, httplib::Response & res) {
+    auto middleware_server_state = [&res_error, &state](const httplib::Request & req, httplib::Response & res) {
         server_state current_state = state.load();
         if (current_state == SERVER_STATE_LOADING_MODEL) {
-            res.set_content("<html><body>The model is loading. Please wait.<br/>The user interface will appear soon.</body></html>", "text/html; charset=utf-8");
-            res.status = 503;
+            if (req.path == "/") {
+                res.set_content("<html><body>The model is loading. Please wait.<br/>The user interface will appear soon.</body></html>", "text/html; charset=utf-8");
+                res.status = 503;
+            } else {
+                res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
+            }
             return false;
         }
         return true;
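
The hunk above makes the loading-state middleware distinguish a web browser opening the UI (a request to "/") from an API client: the browser gets a human-readable HTML placeholder, while API callers receive a structured 503 error they can parse. Below is a minimal, self-contained sketch of that pattern using cpp-httplib's set_pre_routing_handler; the standalone state flag, the port, and the hard-coded JSON error body are illustrative stand-ins for the server's own state machine and its res_error/format_error_response helpers, not the actual llama.cpp code.

// Sketch only: HTML "loading" page for browser requests to "/", JSON error for API clients.
#include "httplib.h"   // cpp-httplib single-header library (what server.cpp is built on)
#include <atomic>

enum server_state { SERVER_STATE_LOADING_MODEL, SERVER_STATE_READY };

int main() {
    std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
    httplib::Server svr;

    // Runs before routing; short-circuits every request while the model is loading.
    svr.set_pre_routing_handler([&state](const httplib::Request & req, httplib::Response & res) {
        if (state.load() == SERVER_STATE_LOADING_MODEL) {
            if (req.path == "/") {
                // Web browser request: show a human-readable placeholder page.
                res.set_content("<html><body>The model is loading. Please wait.</body></html>",
                                "text/html; charset=utf-8");
            } else {
                // API request: return a machine-readable error body instead of HTML
                // (stand-in for the server's format_error_response helper).
                res.set_content("{\"error\":{\"message\":\"Loading model\",\"type\":\"unavailable_error\"}}",
                                "application/json; charset=utf-8");
            }
            res.status = 503;
            return httplib::Server::HandlerResponse::Handled;   // stop here, skip normal routes
        }
        return httplib::Server::HandlerResponse::Unhandled;      // model ready: continue routing
    });

    svr.Get("/", [](const httplib::Request &, httplib::Response & res) {
        res.set_content("<html><body>ready</body></html>", "text/html; charset=utf-8");
    });

    svr.listen("127.0.0.1", 8080);
}
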
@@ -105,8 +105,16 @@ Feature: llama.cpp server
     Given first token is removed
     Then tokens can be detokenized
 
+  Scenario: Tokenize with pieces
+    When tokenizing with pieces:
+      """
+      What is the capital of Germany?
+      媽
+      """
+    Then tokens are given with pieces
+
   Scenario: Models available
     Given available models
     Then 1 models are supported
     Then model 0 is identified by tinyllama-2
     Then model 0 is trained on 128 tokens context
@@ -1208,6 +1208,7 @@ async def wait_for_slots_status(context,
     while True:
         async with await session.get(f'{base_url}/slots', params=params) as slots_response:
             status_code = slots_response.status
+            print(await slots_response.text())
             slots = await slots_response.json()
             if context.debug:
                 print(f"slots responses {slots}\n")
@@ -1372,4 +1373,4 @@ def start_server_background(context):
     thread_stderr = threading.Thread(target=server_log, args=(context.server_process.stderr, sys.stderr))
     thread_stderr.start()
 
     print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")