mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2024-12-25 02:44:36 +00:00
server : add_special option for tokenize endpoint (#7059)
This commit is contained in:
parent
ad211edef5
commit
911b3900dd
@ -331,7 +331,7 @@ Notice that each `probs` is an array of length `n_probs`.
|
|||||||
|
|
||||||
`content`: Set the text to tokenize.
|
`content`: Set the text to tokenize.
|
||||||
|
|
||||||
Note that a special `BOS` token is never inserted.
|
`add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
|
||||||
|
|
||||||
- **POST** `/detokenize`: Convert tokens to text.
|
- **POST** `/detokenize`: Convert tokens to text.
|
||||||
|
|
||||||
|
@ -3647,7 +3647,8 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
std::vector<llama_token> tokens;
|
std::vector<llama_token> tokens;
|
||||||
if (body.count("content") != 0) {
|
if (body.count("content") != 0) {
|
||||||
tokens = ctx_server.tokenize(body["content"], false);
|
const bool add_special = json_value(body, "add_special", false);
|
||||||
|
tokens = ctx_server.tokenize(body["content"], add_special);
|
||||||
}
|
}
|
||||||
const json data = format_tokenizer_response(tokens);
|
const json data = format_tokenizer_response(tokens);
|
||||||
return res.set_content(data.dump(), "application/json; charset=utf-8");
|
return res.set_content(data.dump(), "application/json; charset=utf-8");
|
||||||
|
@ -7,6 +7,7 @@ Feature: llama.cpp server
|
|||||||
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
|
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
|
||||||
And a model file test-model.gguf
|
And a model file test-model.gguf
|
||||||
And a model alias tinyllama-2
|
And a model alias tinyllama-2
|
||||||
|
And BOS token is 1
|
||||||
And 42 as server seed
|
And 42 as server seed
|
||||||
# KV Cache corresponds to the total amount of tokens
|
# KV Cache corresponds to the total amount of tokens
|
||||||
# that can be stored across all independent sequences: #4130
|
# that can be stored across all independent sequences: #4130
|
||||||
@ -91,7 +92,18 @@ Feature: llama.cpp server
|
|||||||
"""
|
"""
|
||||||
What is the capital of France ?
|
What is the capital of France ?
|
||||||
"""
|
"""
|
||||||
Then tokens can be detokenize
|
Then tokens can be detokenized
|
||||||
|
And tokens do not begin with BOS
|
||||||
|
|
||||||
|
Scenario: Tokenize w/ BOS
|
||||||
|
Given adding special tokens
|
||||||
|
When tokenizing:
|
||||||
|
"""
|
||||||
|
What is the capital of Germany?
|
||||||
|
"""
|
||||||
|
Then tokens begin with BOS
|
||||||
|
Given first token is removed
|
||||||
|
Then tokens can be detokenized
|
||||||
|
|
||||||
Scenario: Models available
|
Scenario: Models available
|
||||||
Given available models
|
Given available models
|
||||||
|
@ -376,6 +376,11 @@ def step_seed(context, seed):
|
|||||||
context.seed.append(seed)
|
context.seed.append(seed)
|
||||||
|
|
||||||
|
|
||||||
|
@step('BOS token is {bos:d}')
|
||||||
|
def step_bos_token(context, bos):
|
||||||
|
context.bos = bos
|
||||||
|
|
||||||
|
|
||||||
@step('a prefix prompt')
|
@step('a prefix prompt')
|
||||||
def step_prompt_prefix(context):
|
def step_prompt_prefix(context):
|
||||||
context.prompt_prefix = context_text(context)
|
context.prompt_prefix = context_text(context)
|
||||||
@ -656,21 +661,29 @@ async def all_embeddings_are_generated(context):
|
|||||||
assert_embeddings(context.tasks_result.pop().pop())
|
assert_embeddings(context.tasks_result.pop().pop())
|
||||||
|
|
||||||
|
|
||||||
|
@step('adding special tokens')
|
||||||
|
def step_tokenize_set_add_special(context):
|
||||||
|
context.tokenize_add_special = True
|
||||||
|
|
||||||
|
|
||||||
@step('tokenizing')
|
@step('tokenizing')
|
||||||
@async_run_until_complete
|
@async_run_until_complete
|
||||||
async def step_tokenize(context):
|
async def step_tokenize(context):
|
||||||
context.tokenized_text = context_text(context)
|
context.tokenized_text = context_text(context)
|
||||||
async with aiohttp.ClientSession() as session:
|
async with aiohttp.ClientSession() as session:
|
||||||
|
tokenize_args = {
|
||||||
|
"content": context.tokenized_text,
|
||||||
|
}
|
||||||
|
if getattr(context, 'tokenize_add_special', None) is not None:
|
||||||
|
tokenize_args['add_special'] = context.tokenize_add_special
|
||||||
async with session.post(f'{context.base_url}/tokenize',
|
async with session.post(f'{context.base_url}/tokenize',
|
||||||
json={
|
json=tokenize_args) as response:
|
||||||
"content": context.tokenized_text,
|
|
||||||
}) as response:
|
|
||||||
assert response.status == 200
|
assert response.status == 200
|
||||||
tokenize_json = await response.json()
|
tokenize_json = await response.json()
|
||||||
context.tokens = tokenize_json['tokens']
|
context.tokens = tokenize_json['tokens']
|
||||||
|
|
||||||
|
|
||||||
@step('tokens can be detokenize')
|
@step('tokens can be detokenized')
|
||||||
@async_run_until_complete
|
@async_run_until_complete
|
||||||
async def step_detokenize(context):
|
async def step_detokenize(context):
|
||||||
assert len(context.tokens) > 0
|
assert len(context.tokens) > 0
|
||||||
@ -685,6 +698,21 @@ async def step_detokenize(context):
|
|||||||
assert context.tokenized_text == detokenize_json['content'].strip()
|
assert context.tokenized_text == detokenize_json['content'].strip()
|
||||||
|
|
||||||
|
|
||||||
|
@step('tokens begin with BOS')
|
||||||
|
def step_strings_for_tokenization(context):
|
||||||
|
assert context.tokens[0] == context.bos
|
||||||
|
|
||||||
|
|
||||||
|
@step('tokens do not begin with BOS')
|
||||||
|
def step_strings_for_tokenization(context):
|
||||||
|
assert context.tokens[0] != context.bos
|
||||||
|
|
||||||
|
|
||||||
|
@step('first token is removed')
|
||||||
|
def step_strings_for_tokenization(context):
|
||||||
|
context.tokens = context.tokens[1:]
|
||||||
|
|
||||||
|
|
||||||
@step('an OPTIONS request is sent from {origin}')
|
@step('an OPTIONS request is sent from {origin}')
|
||||||
@async_run_until_complete
|
@async_run_until_complete
|
||||||
async def step_options_request(context, origin):
|
async def step_options_request(context, origin):
|
||||||
|
Loading…
Reference in New Issue
Block a user