mirror of https://github.com/ggerganov/llama.cpp.git
account for both api and web browser requests
commit cb13382136 (parent daf64fc4a9)
@@ -2591,11 +2591,15 @@ int main(int argc, char ** argv) {
         return false;
     };

-    auto middleware_server_state = [&state](const httplib::Request &, httplib::Response & res) {
+    auto middleware_server_state = [&res_error, &state](const httplib::Request & req, httplib::Response & res) {
         server_state current_state = state.load();
         if (current_state == SERVER_STATE_LOADING_MODEL) {
-            res.set_content("<html><body>The model is loading. Please wait.<br/>The user interface will appear soon.</body></html>", "text/html; charset=utf-8");
-            res.status = 503;
+            if (req.path == "/") {
+                res.set_content("<html><body>The model is loading. Please wait.<br/>The user interface will appear soon.</body></html>", "text/html; charset=utf-8");
+                res.status = 503;
+            } else {
+                res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
+            }
             return false;
         }
         return true;
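In short, while the model is still loading the middleware now distinguishes browser page loads from API calls: a request for "/" gets the HTML placeholder page, everything else gets a structured JSON error. Below is a minimal, self-contained sketch of the same pattern using cpp-httplib's set_pre_routing_handler rather than llama.cpp's own middleware chain; the atomic `loading` flag and the inline JSON body stand in for server_state and format_error_response(), which are not reproduced here.

// Minimal sketch (not the llama.cpp implementation): browser-vs-API split
// done in a cpp-httplib pre-routing handler.
#include <atomic>
#include <httplib.h>

int main() {
    std::atomic<bool> loading{true};   // stand-in for the server_state atomic
    httplib::Server svr;

    svr.set_pre_routing_handler([&loading](const httplib::Request & req, httplib::Response & res) {
        if (!loading.load()) {
            // Model is ready: let the request fall through to the normal handlers.
            return httplib::Server::HandlerResponse::Unhandled;
        }
        if (req.path == "/") {
            // A browser asking for the UI: show a human-readable placeholder page.
            res.set_content("<html><body>The model is loading. Please wait.</body></html>",
                            "text/html; charset=utf-8");
        } else {
            // An API client: return a machine-readable error body instead of HTML.
            res.set_content(R"({"error":{"code":503,"message":"Loading model","type":"unavailable_error"}})",
                            "application/json; charset=utf-8");
        }
        res.status = 503;
        return httplib::Server::HandlerResponse::Handled;  // response is final, skip routing
    });

    svr.Get("/health", [](const httplib::Request &, httplib::Response & res) {
        res.set_content(R"({"status":"ok"})", "application/json; charset=utf-8");
    });

    svr.listen("127.0.0.1", 8080);
    return 0;
}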
@@ -105,8 +105,16 @@ Feature: llama.cpp server
     Given first token is removed
     Then tokens can be detokenized

+  Scenario: Tokenize with pieces
+    When tokenizing with pieces:
+      """
+      What is the capital of Germany?
+      媽
+      """
+    Then tokens are given with pieces
+
   Scenario: Models available
     Given available models
     Then 1 models are supported
     Then model 0 is identified by tinyllama-2
     Then model 0 is trained on 128 tokens context
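The new scenario exercises tokenization that reports the text piece alongside each token id; the multi-byte character 媽 presumably covers pieces that are not plain ASCII. A hypothetical client-side call is sketched below; the "/tokenize" path and the "with_pieces" request field are assumptions inferred from the scenario wording, not something this diff shows.

// Hypothetical client call matching the new scenario (field names are assumptions).
#include <httplib.h>
#include <iostream>

int main() {
    httplib::Client cli("127.0.0.1", 8080);
    auto res = cli.Post("/tokenize",
                        R"({"content":"What is the capital of Germany?\n媽","with_pieces":true})",
                        "application/json");
    if (res) {
        std::cout << res->status << "\n" << res->body << "\n";  // expected: token ids with their pieces
    }
    return 0;
}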
@@ -1208,6 +1208,7 @@ async def wait_for_slots_status(context,
     while True:
         async with await session.get(f'{base_url}/slots', params=params) as slots_response:
             status_code = slots_response.status
+            print(await slots_response.text())
             slots = await slots_response.json()
             if context.debug:
                 print(f"slots responses {slots}\n")
@@ -1372,4 +1373,4 @@ def start_server_background(context):
     thread_stderr = threading.Thread(target=server_log, args=(context.server_process.stderr, sys.stderr))
     thread_stderr.start()

-    print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
+    print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")