From 78203641fee3b1f82abaff0c7f667e1b4a286390 Mon Sep 17 00:00:00 2001 From: Mathijs Henquet Date: Thu, 12 Sep 2024 22:30:11 +0200 Subject: [PATCH] server : Add option to return token pieces in /tokenize endpoint (#9108) * server : added with_pieces functionality to /tokenize endpoint * server : Add tokenize with pieces tests to server.feature * Handle case if tokenizer splits along utf8 continuation bytes * Add example of token splitting * Remove trailing ws * Fix trailing ws * Maybe fix ci * maybe this fix windows ci? --------- Co-authored-by: Xuan Son Nguyen --- .github/workflows/server.yml | 1 + examples/server/README.md | 39 ++++++++++++++++++- examples/server/server.cpp | 33 ++++++++++++++-- examples/server/tests/features/server.feature | 8 ++++ examples/server/tests/features/steps/steps.py | 29 ++++++++++++++ examples/server/utils.hpp | 35 ++++++++++++++++- 6 files changed, 139 insertions(+), 6 deletions(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 99feb28f2..29f8fd444 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -173,6 +173,7 @@ jobs: if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} run: | cd examples/server/tests + $env:PYTHONIOENCODING = ":replace" behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp - name: Slow tests diff --git a/examples/server/README.md b/examples/server/README.md index 79196e9c1..44a73ca0a 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -407,9 +407,44 @@ Notice that each `probs` is an array of length `n_probs`. *Options:* - `content`: Set the text to tokenize. + `content`: (Required) The text to tokenize. - `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false` + `add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false` + + `with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false` + +**Response:** + +Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The piece field is a string if the piece is valid unicode or a list of bytes otherwise. + + +If `with_pieces` is `false`: +```json +{ + "tokens": [123, 456, 789] +} +``` + +If `with_pieces` is `true`: +```json +{ + "tokens": [ + {"id": 123, "piece": "Hello"}, + {"id": 456, "piece": " world"}, + {"id": 789, "piece": "!"} + ] +} +``` + +With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k +```json +{ + "tokens": [ + {"id": 198, "piece": [195]}, // hex C3 + {"id": 164, "piece": [161]} // hex A1 + ] +} +``` ### POST `/detokenize`: Convert tokens to text diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 5b263f646..5e4dffadf 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3013,12 +3013,39 @@ int main(int argc, char ** argv) { const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) { const json body = json::parse(req.body); - std::vector tokens; + json tokens_response = json::array(); if (body.count("content") != 0) { const bool add_special = json_value(body, "add_special", false); - tokens = ctx_server.tokenize(body.at("content"), add_special); + const bool with_pieces = json_value(body, "with_pieces", false); + std::vector tokens = ctx_server.tokenize(body.at("content"), add_special); + + if (with_pieces) { + for (const auto& token : tokens) { + std::string piece = llama_token_to_piece(ctx_server.ctx, token); + json piece_json; + + // Check if the piece is valid UTF-8 + if (is_valid_utf8(piece)) { + piece_json = piece; + } else { + // If not valid UTF-8, store as array of byte values + piece_json = json::array(); + for (unsigned char c : piece) { + piece_json.push_back(static_cast(c)); + } + } + + tokens_response.push_back({ + {"id", token}, + {"piece", piece_json} + }); + } + } else { + tokens_response = tokens; + } } - const json data = format_tokenizer_response(tokens); + + const json data = format_tokenizer_response(tokens_response); res_ok(res, data); }; diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index b55971454..15e24c624 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -105,6 +105,14 @@ Feature: llama.cpp server Given first token is removed Then tokens can be detokenized + Scenario: Tokenize with pieces + When tokenizing with pieces: + """ + What is the capital of Germany? + 媽 + """ + Then tokens are given with pieces + Scenario: Models available Given available models Then 1 models are supported diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 65b71a8e8..11587dd64 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -1,3 +1,6 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + import asyncio import json import os @@ -697,6 +700,32 @@ def step_tokenize_set_add_special(context): context.tokenize_add_special = True +@step("tokenizing with pieces") +@async_run_until_complete +async def step_tokenize_with_pieces(context): + context.tokenized_text = context_text(context) + async with aiohttp.ClientSession() as session: + tokenize_args = {"content": context.tokenized_text, "with_pieces": True} + if getattr(context, "tokenize_add_special", None) is not None: + tokenize_args["add_special"] = context.tokenize_add_special + + async with session.post( + f"{context.base_url}/tokenize", json=tokenize_args + ) as response: + assert response.status == 200 + tokenize_json = await response.json() + context.tokens_with_pieces = tokenize_json["tokens"] + + +@step("tokens are given with pieces") +@async_run_until_complete +async def step_tokenize_with_pieces(context): + # Verify that the response contains both token IDs and pieces + assert all( + "id" in token and "piece" in token for token in context.tokens_with_pieces + ) + + @step('tokenizing') @async_run_until_complete async def step_tokenize(context): diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index edfce65b6..adb1a1cb9 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -616,7 +616,40 @@ static json format_embeddings_response_oaicompat(const json & request, const jso return res; } -static json format_tokenizer_response(const std::vector & tokens) { +static bool is_valid_utf8(const std::string & str) { + const unsigned char* bytes = reinterpret_cast(str.data()); + const unsigned char* end = bytes + str.length(); + + while (bytes < end) { + if (*bytes <= 0x7F) { + // 1-byte sequence (0xxxxxxx) + bytes++; + } else if ((*bytes & 0xE0) == 0xC0) { + // 2-byte sequence (110xxxxx 10xxxxxx) + if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80) + return false; + bytes += 2; + } else if ((*bytes & 0xF0) == 0xE0) { + // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx) + if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80) + return false; + bytes += 3; + } else if ((*bytes & 0xF8) == 0xF0) { + // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx) + if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 || + (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80) + return false; + bytes += 4; + } else { + // Invalid UTF-8 lead byte + return false; + } + } + + return true; +} + +static json format_tokenizer_response(const json & tokens) { return json { {"tokens", tokens} };