From bc8648fbbeb8821b23bc06d9e09c3d269482b021 Mon Sep 17 00:00:00 2001 From: Robert Date: Sat, 16 Nov 2024 21:01:38 -0800 Subject: [PATCH] Update test-tokenizer-random.py Updated `find_first_mismatch` following a suggestion by jaime-m-p: the zip()/next() version returned -1 when the sequences differed only in length, so it now returns the length of the shorter sequence in that case. --- tests/test-tokenizer-random.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py index 2bc14e23f..93da1d21f 100644 --- a/tests/test-tokenizer-random.py +++ b/tests/test-tokenizer-random.py @@ -423,8 +423,11 @@ def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLl # return -1 # return min(len(ids1), len(ids2)) # Rewritten to use zip() and next() instead of for loop - def find_first_mismatch(ids1: Sequence[Any], ids2: Sequence[Any]) -> int: - return next((i for i, (a, b) in enumerate(zip(ids1, ids2)) if a != b), -1) + def find_first_mismatch(ids1, ids2) -> int: + index = next((i for i, (a, b) in enumerate(zip(ids1, ids2)) if a != b), -1) + if index < 0 and len(ids1) != len(ids2): + index = min(len(ids1), len(ids2)) + return index def check_detokenizer(text: str, text1: str, text2: str) -> bool: if text1 == text2: # equal to TokenizerGroundtruth?