diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 733b89ccf..22343cea5 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2591,11 +2591,15 @@ int main(int argc, char ** argv) {
return false;
};
- auto middleware_server_state = [&state](const httplib::Request &, httplib::Response & res) {
+ auto middleware_server_state = [&res_error, &state](const httplib::Request & req, httplib::Response & res) {
server_state current_state = state.load();
if (current_state == SERVER_STATE_LOADING_MODEL) {
-            res.set_content("<html><body>The model is loading. Please wait.<br/>The user interface will appear soon.</body></html>", "text/html; charset=utf-8");
- res.status = 503;
+ if(req.path == "/"){
+                res.set_content("<html><body>The model is loading. Please wait.<br/>The user interface will appear soon.</body></html>", "text/html; charset=utf-8");
+ res.status = 503;
+ } else {
+ res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
+ }
return false;
}
return true;
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index b55971454..6a3ffe317 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -105,8 +105,16 @@ Feature: llama.cpp server
Given first token is removed
Then tokens can be detokenized
+ Scenario: Tokenize with pieces
+ When tokenizing with pieces:
+ """
+ What is the capital of Germany?
+ 媽
+ """
+ Then tokens are given with pieces
+
Scenario: Models available
Given available models
Then 1 models are supported
Then model 0 is identified by tinyllama-2
- Then model 0 is trained on 128 tokens context
+ Then model 0 is trained on 128 tokens context
\ No newline at end of file
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 65b71a8e8..c463decf0 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -1208,6 +1208,7 @@ async def wait_for_slots_status(context,
while True:
async with await session.get(f'{base_url}/slots', params=params) as slots_response:
status_code = slots_response.status
+ print(await slots_response.text())
slots = await slots_response.json()
if context.debug:
print(f"slots responses {slots}\n")
@@ -1372,4 +1373,4 @@ def start_server_background(context):
thread_stderr = threading.Thread(target=server_log, args=(context.server_process.stderr, sys.stderr))
thread_stderr.start()
- print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
+ print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
\ No newline at end of file