mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-01-12 03:31:46 +00:00
server : Add option to return token pieces in /tokenize endpoint (#9108)
Some checks failed
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Python Type-Check / pyright type-check (push) Has been cancelled
Some checks failed
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full-cuda.Dockerfile platforms:linux/amd64 tag:full-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/full.Dockerfile platforms:linux/amd64,linux/arm64 tag:full]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-cuda.Dockerfile platforms:linux/amd64 tag:light-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli-intel.Dockerfile platforms:linux/amd64 tag:light-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-cli.Dockerfile platforms:linux/amd64,linux/arm64 tag:light]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-cuda.Dockerfile platforms:linux/amd64 tag:server-cuda]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server-intel.Dockerfile platforms:linux/amd64 tag:server-intel]) (push) Waiting to run
Publish Docker image / Push Docker image to Docker Hub (map[dockerfile:.devops/llama-server.Dockerfile platforms:linux/amd64,linux/arm64 tag:server]) (push) Waiting to run
Nix CI / nix-eval (macos-latest) (push) Waiting to run
Nix CI / nix-eval (ubuntu-latest) (push) Waiting to run
Nix CI / nix-build (macos-latest) (push) Waiting to run
Nix CI / nix-build (ubuntu-latest) (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Python Type-Check / pyright type-check (push) Has been cancelled
* server : added with_pieces functionality to /tokenize endpoint * server : Add tokenize with pieces tests to server.feature * Handle case if tokenizer splits along utf8 continuation bytes * Add example of token splitting * Remove trailing ws * Fix trailing ws * Maybe fix ci * maybe this fix windows ci? --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
This commit is contained in:
parent
e6b7801bd1
commit
78203641fe
1
.github/workflows/server.yml
vendored
1
.github/workflows/server.yml
vendored
@ -173,6 +173,7 @@ jobs:
|
|||||||
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
|
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
|
||||||
run: |
|
run: |
|
||||||
cd examples/server/tests
|
cd examples/server/tests
|
||||||
|
$env:PYTHONIOENCODING = ":replace"
|
||||||
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
|
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
|
||||||
|
|
||||||
- name: Slow tests
|
- name: Slow tests
|
||||||
|
@ -407,9 +407,44 @@ Notice that each `probs` is an array of length `n_probs`.
|
|||||||
|
|
||||||
*Options:*
|
*Options:*
|
||||||
|
|
||||||
`content`: Set the text to tokenize.
|
`content`: (Required) The text to tokenize.
|
||||||
|
|
||||||
`add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
|
`add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
|
||||||
|
|
||||||
|
`with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false`
|
||||||
|
|
||||||
|
**Response:**
|
||||||
|
|
||||||
|
Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The `piece` field is a string if the piece is valid UTF-8, or a list of byte values otherwise.
|
||||||
|
|
||||||
|
|
||||||
|
If `with_pieces` is `false`:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"tokens": [123, 456, 789]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
If `with_pieces` is `true`:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"tokens": [
|
||||||
|
{"id": 123, "piece": "Hello"},
|
||||||
|
{"id": 456, "piece": " world"},
|
||||||
|
{"id": 789, "piece": "!"}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
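With input 'á' (UTF-8 hex: C3 A1) on tinyllama/stories260k, the character is split into two single-byte tokens, so each piece is returned as a list of byte values: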
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"tokens": [
|
||||||
|
{"id": 198, "piece": [195]}, // hex C3
|
||||||
|
{"id": 164, "piece": [161]} // hex A1
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
### POST `/detokenize`: Convert tokens to text
|
### POST `/detokenize`: Convert tokens to text
|
||||||
|
|
||||||
|
@ -3013,12 +3013,39 @@ int main(int argc, char ** argv) {
|
|||||||
const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
|
const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
|
||||||
const json body = json::parse(req.body);
|
const json body = json::parse(req.body);
|
||||||
|
|
||||||
std::vector<llama_token> tokens;
|
json tokens_response = json::array();
|
||||||
if (body.count("content") != 0) {
|
if (body.count("content") != 0) {
|
||||||
const bool add_special = json_value(body, "add_special", false);
|
const bool add_special = json_value(body, "add_special", false);
|
||||||
tokens = ctx_server.tokenize(body.at("content"), add_special);
|
const bool with_pieces = json_value(body, "with_pieces", false);
|
||||||
|
std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
|
||||||
|
|
||||||
|
if (with_pieces) {
|
||||||
|
for (const auto& token : tokens) {
|
||||||
|
std::string piece = llama_token_to_piece(ctx_server.ctx, token);
|
||||||
|
json piece_json;
|
||||||
|
|
||||||
|
// Check if the piece is valid UTF-8
|
||||||
|
if (is_valid_utf8(piece)) {
|
||||||
|
piece_json = piece;
|
||||||
|
} else {
|
||||||
|
// If not valid UTF-8, store as array of byte values
|
||||||
|
piece_json = json::array();
|
||||||
|
for (unsigned char c : piece) {
|
||||||
|
piece_json.push_back(static_cast<int>(c));
|
||||||
}
|
}
|
||||||
const json data = format_tokenizer_response(tokens);
|
}
|
||||||
|
|
||||||
|
tokens_response.push_back({
|
||||||
|
{"id", token},
|
||||||
|
{"piece", piece_json}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
tokens_response = tokens;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const json data = format_tokenizer_response(tokens_response);
|
||||||
res_ok(res, data);
|
res_ok(res, data);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -105,6 +105,14 @@ Feature: llama.cpp server
|
|||||||
Given first token is removed
|
Given first token is removed
|
||||||
Then tokens can be detokenized
|
Then tokens can be detokenized
|
||||||
|
|
||||||
|
Scenario: Tokenize with pieces
|
||||||
|
When tokenizing with pieces:
|
||||||
|
"""
|
||||||
|
What is the capital of Germany?
|
||||||
|
媽
|
||||||
|
"""
|
||||||
|
Then tokens are given with pieces
|
||||||
|
|
||||||
Scenario: Models available
|
Scenario: Models available
|
||||||
Given available models
|
Given available models
|
||||||
Then 1 models are supported
|
Then 1 models are supported
|
||||||
|
@ -1,3 +1,6 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
@ -697,6 +700,32 @@ def step_tokenize_set_add_special(context):
|
|||||||
context.tokenize_add_special = True
|
context.tokenize_add_special = True
|
||||||
|
|
||||||
|
|
||||||
|
@step("tokenizing with pieces")
@async_run_until_complete
async def step_tokenize_with_pieces(context):
    """POST the scenario text to /tokenize with `with_pieces` enabled.

    Stores the submitted text on `context.tokenized_text` and the returned
    `tokens` array on `context.tokens_with_pieces`.
    """
    text = context_text(context)
    context.tokenized_text = text

    # Always request pieces; forward `add_special` only when an earlier step set it.
    payload = {"content": text, "with_pieces": True}
    add_special = getattr(context, "tokenize_add_special", None)
    if add_special is not None:
        payload["add_special"] = add_special

    url = f"{context.base_url}/tokenize"
    async with aiohttp.ClientSession() as session:
        async with session.post(url, json=payload) as response:
            assert response.status == 200
            body = await response.json()
            context.tokens_with_pieces = body["tokens"]
|
||||||
|
|
||||||
|
|
||||||
|
@step("tokens are given with pieces")
@async_run_until_complete
async def step_tokenize_with_pieces(context):
    """Verify that the stored /tokenize response carries an id and a piece per token.

    Reads `context.tokens_with_pieces`, which the "tokenizing with pieces"
    step populates.
    """
    # NOTE(review): this function shares its Python name with the
    # "tokenizing with pieces" step defined above; the steps are registered
    # through the @step text, but consider giving this one a distinct name.
    tokens = context.tokens_with_pieces

    # all() is vacuously true for an empty list, so require at least one token;
    # otherwise an empty response would pass the scenario.
    assert len(tokens) > 0, "no tokens returned by /tokenize"

    # Each entry must be an object exposing both fields; the isinstance guard
    # turns a bare token id into an assertion failure instead of a TypeError.
    assert all(
        isinstance(token, dict) and "id" in token and "piece" in token
        for token in tokens
    ), f"tokens are missing 'id' or 'piece': {tokens}"
|
||||||
|
|
||||||
|
|
||||||
@step('tokenizing')
|
@step('tokenizing')
|
||||||
@async_run_until_complete
|
@async_run_until_complete
|
||||||
async def step_tokenize(context):
|
async def step_tokenize(context):
|
||||||
|
@ -616,7 +616,40 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
|
// Returns true when `str` is well-formed UTF-8 as defined by RFC 3629.
//
// Besides checking the lead/continuation bit patterns, this rejects:
//   - overlong encodings (lead bytes C0/C1, E0 80..9F, F0 80..8F)
//   - UTF-16 surrogate code points U+D800..U+DFFF (ED A0..BF)
//   - code points above U+10FFFF (F4 90.., lead bytes F5..FF)
// The empty string is valid. Embedded NUL bytes are treated as ordinary
// 1-byte sequences because the scan is bounded by str.length().
static bool is_valid_utf8(const std::string & str) {
    const unsigned char * bytes = reinterpret_cast<const unsigned char *>(str.data());
    const unsigned char * end   = bytes + str.length();

    while (bytes < end) {
        const unsigned char lead = *bytes;

        if (lead <= 0x7F) {
            // 1-byte sequence (0xxxxxxx)
            bytes++;
            continue;
        }

        // Determine the sequence length from the lead byte.
        int len;
        if (lead >= 0xC2 && lead <= 0xDF) {
            // 2-byte sequence (110xxxxx 10xxxxxx); C0 and C1 are always overlong
            len = 2;
        } else if ((lead & 0xF0) == 0xE0) {
            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
            len = 3;
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx); F5+ exceeds U+10FFFF
            len = 4;
        } else {
            // Stray continuation byte or invalid lead byte
            return false;
        }

        // Truncated sequence at the end of the input
        if (end - bytes < len) {
            return false;
        }

        // Every trailing byte must be a continuation byte (10xxxxxx)
        for (int i = 1; i < len; i++) {
            if ((bytes[i] & 0xC0) != 0x80) {
                return false;
            }
        }

        // The second byte is further restricted for four specific lead bytes.
        const unsigned char second = bytes[1];
        if (lead == 0xE0 && second < 0xA0) {
            return false; // overlong 3-byte encoding
        }
        if (lead == 0xED && second > 0x9F) {
            return false; // UTF-16 surrogate range
        }
        if (lead == 0xF0 && second < 0x90) {
            return false; // overlong 4-byte encoding
        }
        if (lead == 0xF4 && second > 0x8F) {
            return false; // above U+10FFFF
        }

        bytes += len;
    }

    return true;
}
|
||||||
|
|
||||||
|
static json format_tokenizer_response(const json & tokens) {
|
||||||
return json {
|
return json {
|
||||||
{"tokens", tokens}
|
{"tokens", tokens}
|
||||||
};
|
};
|
||||||
|
Loading…
Reference in New Issue
Block a user