server : logs + minor code style

parent 5d540e80d1
commit 778c070d1b
@@ -614,13 +614,16 @@ struct llama_server_context
 // create slots
 all_slots_are_idle = true;
-if(max_ctx_per_slot == -1) {
+if (max_ctx_per_slot == -1)
+{
     max_ctx_per_slot = n_ctx / params.n_parallel; // split context
 }
-if(max_ctx_per_slot * params.n_parallel > n_ctx) {
+if (max_ctx_per_slot * params.n_parallel > n_ctx)
+{
     printf("Error: The max context per slot is more greater than model context size");
     return;
 }

 LOG_TEE("Available slots:\n");
 for (int i = 0; i < params.n_parallel; i++)
 {
@@ -628,6 +631,7 @@ struct llama_server_context
     slot.id = i;
     slot.max_context_size = max_ctx_per_slot;
     slot.reset();

     LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, max_ctx_per_slot);
     slots.push_back(slot);
 }
@@ -1280,11 +1284,12 @@ struct llama_server_context
 while (true) {
     std::this_thread::sleep_for(std::chrono::microseconds(5));
     std::lock_guard<std::mutex> lock(mutex_results);

     if (queue_results.empty()) {
         continue;
     }

-    for(int i = 0; i < queue_results.size(); i++) {
+    for (int i = 0; i < (int) queue_results.size(); i++) {
         if (queue_results[i].id == task_id) {
             task_result res = queue_results[i];
             queue_results.erase(queue_results.begin() + i);
@@ -1292,7 +1297,9 @@ struct llama_server_context
             }
         }
     }
-    return task_result{-1, false, false, {}};
+
+    // never reached
+    //return task_result{-1, false, false, {}};
 }

 // for multiple images processing
@@ -1378,7 +1385,7 @@ struct llama_server_context
 queue_tasks.erase(queue_tasks.begin());
 switch (task.type)
 {
-    case COMPLETION_TASK: { // perform completion task
+    case COMPLETION_TASK: {
         llama_client_slot* slot = get_slot(json_value(task.data, "slot_id", -1));
         if (slot == nullptr) {
             LOG_TEE("slot unavailable\n");
@@ -1426,6 +1433,7 @@ struct llama_server_context
 // update the system prompt wait until all slots are idle state
 if (need_update_system_prompt)
 {
+    LOG_TEE("updating system prompt\n");
     update_system_prompt();
 }

@@ -1435,6 +1443,7 @@ struct llama_server_context
 {
     if (system_prompt.empty() && clean_kv_cache)
     {
+        LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n");
         kv_cache_clear();
     }
     // avoid 100% usage of cpu all time
@@ -1449,6 +1458,7 @@ struct llama_server_context
 const int n_left = slot.n_past - slot.params.n_keep - 1;
 const int n_discard = n_left / 2;

+LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, slot.params.n_keep, n_left, n_discard);
 llama_kv_cache_seq_rm (ctx, slot.id, slot.params.n_keep + 1 , slot.params.n_keep + n_discard + 1);
 llama_kv_cache_seq_shift(ctx, slot.id, slot.params.n_keep + 1 + n_discard, slot.n_past, -n_discard);

@ -1463,7 +1473,7 @@ struct llama_server_context
|
|||||||
|
|
||||||
slot.truncated = true;
|
slot.truncated = true;
|
||||||
|
|
||||||
LOG_VERBOSE("input truncated", {
|
LOG_VERBOSE("context shift", {
|
||||||
{"n_ctx", n_ctx},
|
{"n_ctx", n_ctx},
|
||||||
{"n_keep", params.n_keep},
|
{"n_keep", params.n_keep},
|
||||||
{"n_left", n_left},
|
{"n_left", n_left},
|
||||||
@@ -1504,6 +1514,7 @@ struct llama_server_context
     slot.n_decoded += 1;
     slot.n_past += 1;
 }

 // process in chunks of params.n_batch
 int32_t n_batch = params.n_batch;

@@ -1586,9 +1597,10 @@ struct llama_server_context
     std::copy(prompt_tokens.begin(), prompt_tokens.end(), slot.ctx_sampling->prev.end() - ps);
     slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
     slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past;
-    LOG_TEE("slot %i - in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
+    LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed);
 }

+LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, num_tokens_system + slot.n_past);
 llama_kv_cache_seq_rm(ctx, slot.id, num_tokens_system + slot.n_past, -1);

 slot.cache_tokens = prompt_tokens;
@@ -1596,7 +1608,7 @@ struct llama_server_context
 if (slot.n_past == (int) slot.num_prompt_tokens)
 {
     // we have to evaluate at least 1 token to generate logits.
-    printf("we have to evaluate at least 1 token to generate logits\n");
+    LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id);
     slot.n_past--;
 }

@@ -1606,7 +1618,7 @@ struct llama_server_context
     {"to_eval", tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, slot.cache_tokens.cend())},
 });

-const bool has_images = process_images(slot); // has images?
+const bool has_images = process_images(slot);

 // process the prefix of first image
 std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, true) : prompt_tokens;
@@ -1664,7 +1676,7 @@ struct llama_server_context
     return false;
 }

-LOG("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
+LOG_TEE("%s : failed to find free space in the KV cache, retrying with smaller n_batch = %d\n", __func__, n_batch / 2);

 // retry with half the batch size to try to find a free slot in the KV cache
 n_batch /= 2;
@@ -1705,7 +1717,7 @@ struct llama_server_context
 const int32_t n_probs = slot.sparams.n_probs;
 if (slot.sparams.temp <= 0 && n_probs > 0)
 {
-    // For llama_sample_token_greedy we need to sort candidates
+    // for llama_sample_token_greedy we need to sort candidates
     llama_sample_softmax(ctx, &cur_p);
 }
