Account for both API and web browser requests while the model is loading

This commit is contained in:
VJHack 2024-09-12 21:44:52 -05:00
parent daf64fc4a9
commit cb13382136
3 changed files with 18 additions and 5 deletions

View File

@@ -2591,11 +2591,15 @@ int main(int argc, char ** argv) {
return false; return false;
}; };
auto middleware_server_state = [&state](const httplib::Request &, httplib::Response & res) { auto middleware_server_state = [&res_error, &state](const httplib::Request & req, httplib::Response & res) {
server_state current_state = state.load(); server_state current_state = state.load();
if (current_state == SERVER_STATE_LOADING_MODEL) { if (current_state == SERVER_STATE_LOADING_MODEL) {
res.set_content("<html><body>The model is loading. Please wait.<br/>The user interface will appear soon.</body></html>", "text/html; charset=utf-8"); if(req.path == "/"){
res.status = 503; res.set_content("<html><body>The model is loading. Please wait.<br/>The user interface will appear soon.</body></html>", "text/html; charset=utf-8");
res.status = 503;
} else {
res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
}
return false; return false;
} }
return true; return true;

View File

@@ -105,8 +105,16 @@ Feature: llama.cpp server
Given first token is removed Given first token is removed
Then tokens can be detokenized Then tokens can be detokenized
Scenario: Tokenize with pieces
When tokenizing with pieces:
"""
What is the capital of Germany?
"""
Then tokens are given with pieces
Scenario: Models available Scenario: Models available
Given available models Given available models
Then 1 models are supported Then 1 models are supported
Then model 0 is identified by tinyllama-2 Then model 0 is identified by tinyllama-2
Then model 0 is trained on 128 tokens context Then model 0 is trained on 128 tokens context

View File

@@ -1208,6 +1208,7 @@ async def wait_for_slots_status(context,
while True: while True:
async with await session.get(f'{base_url}/slots', params=params) as slots_response: async with await session.get(f'{base_url}/slots', params=params) as slots_response:
status_code = slots_response.status status_code = slots_response.status
print(await slots_response.text())
slots = await slots_response.json() slots = await slots_response.json()
if context.debug: if context.debug:
print(f"slots responses {slots}\n") print(f"slots responses {slots}\n")
@@ -1372,4 +1373,4 @@ def start_server_background(context):
thread_stderr = threading.Thread(target=server_log, args=(context.server_process.stderr, sys.stderr)) thread_stderr = threading.Thread(target=server_log, args=(context.server_process.stderr, sys.stderr))
thread_stderr.start() thread_stderr.start()
print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}") print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")