Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
277 commits
Select commit Hold shift + click to select a range
5fe5666
fixes missed
itazap Oct 10, 2025
51e62e1
gemma test fix
itazap Oct 10, 2025
0e5dbdf
refactor
itazap Oct 14, 2025
9136d3c
rm legacy from llama
itazap Oct 14, 2025
ab77f57
added renaming
itazap Oct 14, 2025
36bc3ef
add _model
itazap Oct 14, 2025
c4f045c
update legacy
itazap Oct 14, 2025
c80dd1d
update legacy
itazap Oct 14, 2025
790c092
fix docstring
itazap Oct 14, 2025
f4d956a
always load blank, then set _tokenizer if we have it
itazap Oct 14, 2025
b2c320c
new toks
itazap Oct 15, 2025
0c3caff
update all berttokenizer based models
itazap Oct 15, 2025
d43412a
apply feedback - delete bert duplicates
itazap Oct 16, 2025
48eeb50
more models --> fast only
itazap Oct 17, 2025
d3a3cbd
more convert_slow models
itazap Oct 20, 2025
493f9e0
fix common test refs
itazap Oct 20, 2025
a51cea0
updating fast only tokenizers
itazap Oct 20, 2025
d9c1ec3
openai and pegasus
itazap Oct 21, 2025
d879bc3
enable sentencepiecebackend
itazap Oct 22, 2025
ca51029
more models
itazap Oct 22, 2025
132c617
code gen
itazap Oct 22, 2025
ed5bf86
t5
itazap Oct 22, 2025
158b444
code gen tests
itazap Oct 22, 2025
64eaf88
speecht5
itazap Oct 22, 2025
95f48d3
mbart
itazap Oct 22, 2025
f3248d2
mbart50
itazap Oct 22, 2025
f3dd103
more models
itazap Oct 22, 2025
c66037d
more models
itazap Oct 23, 2025
cb5e08b
layouglmv2
itazap Oct 23, 2025
3159033
update tests
itazap Oct 24, 2025
a14a45d
update tests
itazap Oct 24, 2025
7ca10f8
update tests
itazap Oct 25, 2025
f5cbc49
pretrainedtokenizer
itazap Oct 27, 2025
72e8043
whisper
itazap Oct 28, 2025
3cd8e5b
whisper
itazap Oct 28, 2025
4bf2b85
layoutxlm and storing backends
itazap Oct 28, 2025
2ef0fd3
refactor sentencepiecebackend and additional_special_tokens
itazap Oct 29, 2025
5c7d347
renaming tokenization_utils --> tokenization_python
itazap Oct 29, 2025
fcf67ff
udpate tests
itazap Oct 30, 2025
a8ccf16
bert test
itazap Oct 30, 2025
ccca98e
blenderbot
itazap Oct 30, 2025
c118c10
clip
itazap Oct 30, 2025
0f74081
codegen
itazap Oct 30, 2025
a11dba7
code_llama
itazap Oct 30, 2025
b678cde
cohere
itazap Oct 30, 2025
ea9a546
deberata, deberat v2, funnel
itazap Oct 30, 2025
ffbdecf
gpt2
itazap Oct 30, 2025
9f08ade
batch update tests
itazap Oct 30, 2025
a7cd5c0
pegasus qwen2 roberta
itazap Oct 30, 2025
b5b3cd9
more models
itazap Oct 31, 2025
1250bcc
layout tests
itazap Oct 31, 2025
cf72cae
some renaming
itazap Oct 31, 2025
4fafdcc
fix references to utils_fast
itazap Oct 31, 2025
236f9f1
fix refs
itazap Oct 31, 2025
cd743bf
fix refs
itazap Oct 31, 2025
0e7e593
fix refs
itazap Oct 31, 2025
2af6d2c
fix refs
itazap Oct 31, 2025
b58b7b1
fix refs
itazap Oct 31, 2025
518dcaf
fix refs
itazap Oct 31, 2025
0f2f4b6
fix refs
itazap Oct 31, 2025
c849148
fix some tests
itazap Nov 2, 2025
0d54bbd
regression
itazap Nov 2, 2025
81a140a
fix refs
itazap Nov 2, 2025
61366d6
fix refs
itazap Nov 3, 2025
4374a66
missed the most crucial file in my last commit
itazap Nov 3, 2025
df383d7
fix refs
itazap Nov 4, 2025
b8035ec
fix refs
itazap Nov 4, 2025
37e1b92
fix refs
itazap Nov 4, 2025
9b45774
batch encode fix
itazap Nov 4, 2025
a24856d
fix some tests
itazap Nov 6, 2025
1868870
BC for batch_decode bc too many refs
itazap Nov 6, 2025
35dd250
more tests
itazap Nov 6, 2025
b0428f3
fix more tests
itazap Nov 6, 2025
8fe6873
fix for processors
itazap Nov 6, 2025
c1e0e46
fixing more models
itazap Nov 10, 2025
79568cd
deleted mbart50 by accident
itazap Nov 10, 2025
cfa159a
seamless m4t
itazap Nov 10, 2025
5854f4c
albert fix
itazap Nov 10, 2025
714a856
whisper
itazap Nov 11, 2025
c016f11
layout3
itazap Nov 11, 2025
2e3e178
attempt to fix cached tokenizers on CI
itazap Nov 11, 2025
03e3ab9
trying another fix on CI
itazap Nov 11, 2025
2c30d79
again try to work around CI
itazap Nov 11, 2025
98f51d5
bertweet
itazap Nov 11, 2025
96f0517
tapas
itazap Nov 11, 2025
c26f54b
mbart50
itazap Nov 12, 2025
da0bbf0
luke
itazap Nov 12, 2025
494ef3e
mluke
itazap Nov 13, 2025
39bb884
markuplm
itazap Nov 13, 2025
960dfcf
markuplm
itazap Nov 13, 2025
54992a0
fix some more auto tests
itazap Nov 13, 2025
d0383bd
some random model failures
itazap Nov 14, 2025
a969c6b
mistralcommontestser
itazap Nov 14, 2025
2bf4a13
more fixes
itazap Nov 14, 2025
e88322f
ref fix
itazap Nov 14, 2025
cfb0100
siglip
itazap Nov 14, 2025
0fd1066
marian
itazap Nov 14, 2025
02c524c
plbart
itazap Nov 14, 2025
820191e
update utils toks
itazap Nov 14, 2025
0cd714d
seamless m4t
itazap Nov 16, 2025
8a412bc
roc bert
itazap Nov 17, 2025
e8c3258
udpate byt5 test
itazap Nov 17, 2025
85a3b1f
xlm
itazap Nov 17, 2025
45e718f
esm
itazap Nov 17, 2025
96fc467
roformer
itazap Nov 17, 2025
7727e3b
code llama
itazap Nov 17, 2025
6795515
biogpt
itazap Nov 17, 2025
2f49a39
m2m100
itazap Nov 17, 2025
a42e7a8
dpr and flaubert
itazap Nov 18, 2025
33634be
xlm and speech to text
itazap Nov 18, 2025
ca5e389
tok backend pass object
itazap Nov 18, 2025
25021d4
tokenizer object pass
itazap Nov 18, 2025
69610fe
wav2vec2
itazap Nov 18, 2025
51799ca
wav2vec2
itazap Nov 18, 2025
f23abc3
cpmant
itazap Nov 18, 2025
88f0db5
update utils tokenizers
itazap Nov 18, 2025
077e6f8
cpmant
itazap Nov 18, 2025
e004b56
bartpho
itazap Nov 18, 2025
e069763
test apply chat template assistant mask
itazap Nov 18, 2025
9df9cfc
apply chat template video
itazap Nov 18, 2025
dc9b1ae
apply chat template assistant mask
itazap Nov 18, 2025
4c05e9d
test torch
itazap Nov 18, 2025
5c209a4
update from slow in base and fix donut processor errors
itazap Nov 19, 2025
d8a8db8
auto to point to tokenizers backend, fix kosmos2
itazap Nov 19, 2025
6b40d91
some non model fixes for old slow models that no longer have their ow…
itazap Nov 19, 2025
976265b
missed file from last commit
itazap Nov 19, 2025
b6ca8b2
idefics2
itazap Nov 19, 2025
5c72105
fixup
ArthurZucker Nov 19, 2025
964b461
fixup
ArthurZucker Nov 19, 2025
0381407
pretrained tokenizer fast test update
itazap Nov 19, 2025
887b477
Merge branch 'main' of github.com:huggingface/transformers into one_t…
ArthurZucker Nov 19, 2025
f4c46ab
stash
ArthurZucker Nov 19, 2025
efbbb04
Merge branch 'one_tokenizer' of github.com:huggingface/transformers i…
ArthurZucker Nov 19, 2025
71ef282
bad merged
ArthurZucker Nov 19, 2025
a5b018c
cherry pick more stuff that did not merge well
ArthurZucker Nov 19, 2025
8ea91f6
fix gptsw3
ArthurZucker Nov 19, 2025
1947894
nit warn for now
ArthurZucker Nov 19, 2025
20a06ff
update error raising
ArthurZucker Nov 19, 2025
aa197a0
just ran fixup
ArthurZucker Nov 19, 2025
63c7c1c
bring back bert legacy
ArthurZucker Nov 19, 2025
5895bab
fix
ArthurZucker Nov 19, 2025
6b8217b
nit
ArthurZucker Nov 19, 2025
184ed58
fix 56 errors on blenderbotsmall?
ArthurZucker Nov 19, 2025
09e4021
18 for blenderbotsmall
ArthurZucker Nov 19, 2025
a8c299e
tok auto
itazap Nov 19, 2025
1259052
missed clip
itazap Nov 19, 2025
06e3485
fix tests
itazap Nov 20, 2025
3a95bf1
something missed
itazap Nov 20, 2025
05d5c08
token healing
itazap Nov 20, 2025
78f4e58
tok common tests update - nonmodel
itazap Nov 20, 2025
8fbaf83
try to fix non-model test in test_tokenization_utils
itazap Nov 20, 2025
fd40b1b
fix hub tests
itazap Nov 20, 2025
70330b8
try to fix hub tests
itazap Nov 20, 2025
7c78007
custom vocab related fixed
itazap Nov 20, 2025
ca1f6b0
bert jap
itazap Nov 20, 2025
dd3ae59
BERT JAP
itazap Nov 20, 2025
2e1893f
rename bert legacy to bert legacy
itazap Nov 20, 2025
f4be6a9
Wav2vec2
itazap Nov 20, 2025
919103a
fix in tok python to update total vocab size - fixes speech t5
itazap Nov 20, 2025
c452f92
blender bot small
itazap Nov 20, 2025
6d167eb
forgot test file
itazap Nov 20, 2025
025722b
test failures
itazap Nov 21, 2025
7d1d0d3
marian
itazap Nov 21, 2025
dfb67a4
gpt2 tiktoken
itazap Nov 21, 2025
51da6b2
big bird / marian
itazap Nov 21, 2025
c611058
udop
itazap Nov 21, 2025
cc4a972
forgot couple changes
itazap Nov 21, 2025
51202da
test_serve fix
itazap Nov 21, 2025
ca988b9
missing import
itazap Nov 21, 2025
f5bc69e
a couple processors fixes
itazap Nov 21, 2025
c67de10
Merge branch 'main' of github.com:huggingface/transformers into one_t…
ArthurZucker Nov 24, 2025
045bbff
style partly
ArthurZucker Nov 24, 2025
75662fd
fix to fetch tests ci
itazap Nov 24, 2025
8d248a3
Revert branch back to commit f5bc69ef state
itazap Nov 24, 2025
4c29924
revert branch to styling
itazap Nov 24, 2025
189cabd
update mistral after merge
itazap Nov 24, 2025
e02741c
fixes for non model tests
itazap Nov 25, 2025
b828ae1
some processor test fixes
itazap Nov 26, 2025
83b579c
more processor test fixes
itazap Nov 26, 2025
2ce27bc
more processor fixes
itazap Nov 26, 2025
881b97c
hub tests
itazap Nov 26, 2025
2e28b3d
python tok utils
itazap Nov 26, 2025
925d187
fix hub test
itazap Nov 26, 2025
6624231
Merge branch 'main' of github.com:huggingface/transformers into one_t…
ArthurZucker Nov 26, 2025
437321b
make style for now
ArthurZucker Nov 26, 2025
cd4d3ac
remove problemattic fic copies
ArthurZucker Nov 26, 2025
5c5864f
python utils/check_copies.py --fix_and_overwrite
ArthurZucker Nov 26, 2025
2f13c13
more styling
ArthurZucker Nov 26, 2025
1e1aa11
fixup
ArthurZucker Nov 26, 2025
5eeb1fe
silence docstirng
ArthurZucker Nov 26, 2025
dea8e1e
fix import?
ArthurZucker Nov 26, 2025
452d6d8
fix imports
ArthurZucker Nov 26, 2025
e650205
add the local test as well
ArthurZucker Nov 26, 2025
3dd1716
throw spm error
itazap Nov 26, 2025
e700dfa
llamas
itazap Nov 26, 2025
ce23d67
fix a couple tests
itazap Nov 26, 2025
ff1bf36
broke ci
itazap Nov 26, 2025
0bdfeae
broke ci
itazap Nov 26, 2025
a137649
broke ci
itazap Nov 26, 2025
366597c
broke ci
itazap Nov 26, 2025
22887b1
add logs to debug gemma on ci
itazap Nov 26, 2025
73819f4
gemma and llama
itazap Nov 26, 2025
c24c997
gemma
itazap Nov 26, 2025
551a959
revert las commit
itazap Nov 26, 2025
a18e84d
gemma debug
itazap Nov 26, 2025
c23ee13
gemma debug
itazap Nov 26, 2025
93187b3
gemma
itazap Nov 26, 2025
81428ef
safely import spiece backend
itazap Nov 27, 2025
eb95c2e
tok tests
itazap Nov 27, 2025
24d89c4
check none
itazap Nov 27, 2025
e2c4434
setup and qual
itazap Nov 27, 2025
7a737b7
ruff
itazap Nov 27, 2025
a19c90c
del dev files
itazap Nov 27, 2025
18e7484
tok auto
itazap Nov 27, 2025
3cdd8ee
fill docstrings
itazap Nov 27, 2025
50756c4
update auto
itazap Nov 27, 2025
6bccb46
blenderbot small nit
itazap Nov 27, 2025
a76015a
Merge branch 'main' of github.com:huggingface/transformers into one_t…
ArthurZucker Nov 27, 2025
4afb570
add migration guide
ArthurZucker Nov 27, 2025
be1d95a
move mixtral patch to `TokenizersBackend`, move `TokenizerExtractor`
ArthurZucker Nov 27, 2025
fad31d7
rename MistralCommonTokenizer to MistralCommonBackend
ArthurZucker Nov 27, 2025
d4aff20
Merge branch 'one_tokenizer' of github.com:huggingface/transformers i…
ArthurZucker Nov 27, 2025
3ab4bec
nit
ArthurZucker Nov 27, 2025
0c1a40a
Merge branch 'main' of github.com:huggingface/transformers into one_t…
ArthurZucker Nov 27, 2025
30f1640
fix failures
ArthurZucker Nov 27, 2025
f2a1482
fixup
ArthurZucker Nov 27, 2025
d8010f8
remoove one old test
ArthurZucker Nov 27, 2025
82e5675
mark the slow one as slow
ArthurZucker Nov 27, 2025
088fc39
very small fixes
ArthurZucker Nov 27, 2025
f677ddf
update auto mapping for missing ones
ArthurZucker Nov 27, 2025
d30e46b
fixup lorsd
ArthurZucker Nov 27, 2025
ad24f43
fixup doc and stuff
ArthurZucker Nov 27, 2025
ebfe7f1
should be the final fixe
ArthurZucker Nov 27, 2025
c4a743d
processing update
ArthurZucker Nov 27, 2025
f81a966
Merge branch 'main' of github.com:huggingface/transformers into one_t…
ArthurZucker Nov 27, 2025
9a5638d
update
ArthurZucker Nov 27, 2025
7c32dfb
FIX or brute AI fix the llava test
ArthurZucker Nov 27, 2025
c520a66
style
ArthurZucker Nov 27, 2025
718b2f0
slow?
ArthurZucker Nov 27, 2025
20d9036
Merge branch 'main' of github.com:huggingface/transformers into one_t…
ArthurZucker Nov 27, 2025
8f536c2
fix is offline mode?
ArthurZucker Nov 27, 2025
e96c18b
fix mt5
ArthurZucker Nov 27, 2025
5ce65b8
One tok utils (#42462)
itazap Nov 27, 2025
4418e8a
Merge branch 'main' of github.com:huggingface/transformers into one_t…
ArthurZucker Nov 27, 2025
7f9954a
fix cohere
ArthurZucker Nov 27, 2025
bfa5fd0
Merge branch 'one_tokenizer' of github.com:huggingface/transformers i…
ArthurZucker Nov 27, 2025
4dce834
?
ArthurZucker Nov 27, 2025
fcdc9bb
up
ArthurZucker Nov 27, 2025
a5a3a7c
am I dumbb?
ArthurZucker Nov 27, 2025
0244be9
grumble
ArthurZucker Nov 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
350 changes: 350 additions & 0 deletions tests/test_sentencepiece_backend_mixin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,350 @@
# Sentencepiece backend layer tests

import pickle
import shutil
import tempfile
import unittest
from typing import TYPE_CHECKING

from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from transformers.testing_utils import require_tokenizers
from transformers.tokenization_utils import AddedToken

if TYPE_CHECKING:
pass


class SentencePieceBackendTesterMixin:
    """
    Mixin of tests that specifically exercise the SentencePiece tokenizer backend.

    Concrete test classes are expected to mix this in alongside ``unittest.TestCase``
    and set ``tokenizer_class`` / ``rust_tokenizer_class`` and ``from_pretrained_id``
    (a Hub repo id used by the ``get_*tokenizer`` helpers below).
    """

    # Slow (Python / SentencePiece-backed) tokenizer class under test.
    tokenizer_class = None
    # Fast (Rust `tokenizers`-backed) tokenizer class under test.
    rust_tokenizer_class = None
    # Gate for the SentencePiece-specific tests in this mixin.
    test_sentencepiece = True
    # Set by subclasses whose model lowercases input; round-trip comparisons are case-folded.
    test_sentencepiece_ignore_case = False
    # Which of the two tokenizer flavors are available for this model.
    test_slow_tokenizer = True
    test_rust_tokenizer = True
    # Hub checkpoint id passed to ``from_pretrained`` by the getters below.
    from_pretrained_id = None
    from_pretrained_kwargs = None

    @classmethod
    def setUpClass(cls) -> None:
        # Shared scratch directory for the whole test class.
        cls.tmpdirname = tempfile.mkdtemp()

    @classmethod
    def tearDownClass(cls):
        shutil.rmtree(cls.tmpdirname, ignore_errors=True)

    @classmethod
    def get_tokenizer(cls, **kwargs) -> PreTrainedTokenizer:
        """Instantiate the slow tokenizer from ``from_pretrained_id``."""
        return cls.tokenizer_class.from_pretrained(cls.from_pretrained_id, **kwargs)

    @classmethod
    def get_rust_tokenizer(cls, **kwargs) -> PreTrainedTokenizerFast:
        """Instantiate the fast (Rust-backed) tokenizer from ``from_pretrained_id``."""
        return cls.rust_tokenizer_class.from_pretrained(cls.from_pretrained_id, **kwargs)

    def get_tokenizers(self, fast=True, **kwargs):
        """
        Return every tokenizer variant enabled for this test class.

        Args:
            fast: when True, include the Rust tokenizer if it is enabled.

        Raises:
            ValueError: if neither a slow nor a fast tokenizer is enabled.
        """
        if fast and self.test_rust_tokenizer and self.test_slow_tokenizer:
            return [self.get_tokenizer(**kwargs), self.get_rust_tokenizer(**kwargs)]
        elif fast and self.test_rust_tokenizer:
            return [self.get_rust_tokenizer(**kwargs)]
        elif self.test_slow_tokenizer:
            return [self.get_tokenizer(**kwargs)]
        else:
            raise ValueError("This tokenizer class has no tokenizer to be tested.")

    def test_sentencepiece_tokenize_and_convert_tokens_to_string(self):
        """Test ``_tokenize`` and ``convert_tokens_to_string``."""
        if not self.test_sentencepiece:
            self.skipTest(reason="test_sentencepiece is set to False")

        tokenizer = self.get_tokenizer()
        text = "This is text to test the tokenizer."

        if self.test_sentencepiece_ignore_case:
            text = text.lower()

        tokens = tokenizer.tokenize(text)

        self.assertTrue(len(tokens) > 0)

        # check if converting back to original text works
        reverse_text = tokenizer.convert_tokens_to_string(tokens)

        if self.test_sentencepiece_ignore_case:
            reverse_text = reverse_text.lower()

        self.assertEqual(reverse_text, text)

        # Special tokens must survive the tokens -> string conversion verbatim.
        special_tokens = tokenizer.all_special_tokens
        special_tokens_string = tokenizer.convert_tokens_to_string(special_tokens)
        for special_token in special_tokens:
            self.assertIn(special_token, special_tokens_string)

        # The fast tokenizer must agree with the slow one on this conversion.
        if self.test_rust_tokenizer:
            rust_tokenizer = self.get_rust_tokenizer()
            special_tokens_string_rust = rust_tokenizer.convert_tokens_to_string(special_tokens)
            self.assertEqual(special_tokens_string, special_tokens_string_rust)

    def test_sentencepiece_tokenize_and_decode(self):
        """Slow and fast tokenizers must produce identical ids and decodes."""
        if not self.test_sentencepiece:
            self.skipTest(reason="test_sentencepiece is set to False")

        text = "This is text to test the tokenizer."
        if self.test_rust_tokenizer:
            tokenizer = self.get_tokenizer()
            rust_tokenizer = self.get_rust_tokenizer()

            slow_ids = tokenizer(text).input_ids
            fast_ids = rust_tokenizer(text).input_ids
            self.assertEqual(slow_ids, fast_ids)

            slow_decoded = tokenizer.decode(slow_ids)
            fast_decoded = rust_tokenizer.decode(slow_ids)
            self.assertEqual(slow_decoded, fast_decoded)

    def test_save_sentencepiece_tokenizer(self) -> None:
        if not self.test_sentencepiece or not self.test_slow_tokenizer:
            self.skipTest(reason="test_sentencepiece or test_slow_tokenizer is set to False")
        # We want to verify that we will be able to save the tokenizer even if the original files that were used to
        # build the tokenizer have been deleted in the meantime.
        text = "This is text to test the tokenizer."

        tokenizer_slow_1 = self.get_tokenizer()
        encoding_tokenizer_slow_1 = tokenizer_slow_1(text)

        tmpdirname_1 = tempfile.mkdtemp()
        tmpdirname_2 = tempfile.mkdtemp()

        try:
            tokenizer_slow_1.save_pretrained(tmpdirname_1)
            tokenizer_slow_2 = self.tokenizer_class.from_pretrained(tmpdirname_1)
            encoding_tokenizer_slow_2 = tokenizer_slow_2(text)

            # Deliberately delete the first save before re-saving: the reloaded
            # tokenizer must not depend on the files it was loaded from.
            shutil.rmtree(tmpdirname_1)
            tokenizer_slow_2.save_pretrained(tmpdirname_2)

            tokenizer_slow_3 = self.tokenizer_class.from_pretrained(tmpdirname_2)
            encoding_tokenizer_slow_3 = tokenizer_slow_3(text)
        finally:
            # Fix: previously these directories leaked whenever a load or encode
            # raised mid-test; clean both up unconditionally (rmtree above may
            # already have removed tmpdirname_1, hence ignore_errors).
            shutil.rmtree(tmpdirname_1, ignore_errors=True)
            shutil.rmtree(tmpdirname_2, ignore_errors=True)

        self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_2)
        self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_3)

    def test_added_token_are_matched_longest_first(self):
        """Added tokens are matched longest-first regardless of insertion order."""
        tokenizers = self.get_tokenizers(fast=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                try:
                    tokenizer.add_tokens([AddedToken("extra_id_1")])
                    tokenizer.add_tokens([AddedToken("extra_id_100")])
                except Exception:
                    # Canine cannot add tokens which are not codepoints
                    self.skipTest(reason="Cannot add those Added tokens")

                # XXX: This used to split on `extra_id_1` first we're matching
                # longest first now.
                tokens = tokenizer.tokenize("This is some extra_id_100")
                self.assertIn("extra_id_100", tokens)

        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                tokenizer.add_tokens([AddedToken("extra_id_100")])
                tokenizer.add_tokens([AddedToken("extra_id_1")])

                tokens = tokenizer.tokenize("This is some extra_id_100")
                self.assertIn("extra_id_100", tokens)

    @require_tokenizers
    def test_pickle_added_tokens(self):
        """An ``AddedToken`` must round-trip through pickle with its state intact."""
        tok1 = AddedToken("<s>", rstrip=True, lstrip=True, normalized=False, single_word=True)
        tok2 = pickle.loads(pickle.dumps(tok1))

        self.assertEqual(tok1.__getstate__(), tok2.__getstate__())

    def test_added_tokens_do_lower_case(self):
        """Added tokens are lowercased by ``do_lower_case`` tokenizers, special tokens are not."""
        tokenizer = self.get_tokenizer(do_lower_case=True)
        if not hasattr(tokenizer, "do_lower_case") or not tokenizer.do_lower_case:
            self.skipTest(reason="Tokenizer does not support do_lower_case")

        special_token = tokenizer.all_special_tokens[0]

        text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
        text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token

        toks_before_adding = tokenizer.tokenize(text)  # toks before adding new_toks

        new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
        added = tokenizer.add_tokens([AddedToken(tok, lstrip=True, rstrip=True) for tok in new_toks])

        toks_after_adding = tokenizer.tokenize(text)
        toks_after_adding2 = tokenizer.tokenize(text2)

        # Rust tokenizers don't lowercase added tokens at the time calling `tokenizer.add_tokens`,
        # while python tokenizers do, so new_toks 0 and 2 would be treated as the same, so do new_toks 1 and 3.
        self.assertIn(added, [2, 4])

        self.assertListEqual(toks_after_adding, toks_after_adding2)
        self.assertTrue(
            len(toks_before_adding) > len(toks_after_adding),  # toks_before_adding should be longer
        )

        # Check that none of the special tokens are lowercased
        sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B"
        # Convert the tokenized list to str as some special tokens are tokenized like normal tokens
        # which have a prefix spacee e.g. the mask token of Albert, and cannot match the original
        # special tokens exactly.
        tokenized_sequence = "".join(tokenizer.tokenize(sequence_with_special_tokens))

        for special_token in tokenizer.all_special_tokens:
            self.assertTrue(special_token in tokenized_sequence or special_token.lower() in tokenized_sequence)

    def test_add_tokens_tokenizer(self):
        """Adding regular and special tokens grows ``len(tokenizer)`` but not ``vocab_size``."""
        tokenizer = self.get_tokenizer(do_lower_case=False)
        vocab_size = tokenizer.vocab_size
        all_size = len(tokenizer)

        self.assertNotEqual(vocab_size, 0)

        new_toks = [
            AddedToken("aaaaa bbbbbb", rstrip=True, lstrip=True),
            AddedToken("cccccccccdddddddd", rstrip=True, lstrip=True),
        ]
        added_toks = tokenizer.add_tokens(new_toks)
        vocab_size_2 = tokenizer.vocab_size
        all_size_2 = len(tokenizer)

        # The base vocab is untouched; only the added-token table grows.
        self.assertNotEqual(vocab_size_2, 0)
        self.assertEqual(vocab_size, vocab_size_2)
        self.assertEqual(added_toks, len(new_toks))
        self.assertEqual(all_size_2, all_size + len(new_toks))

        tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)

        # Added tokens get ids strictly above the base vocab.
        self.assertGreaterEqual(len(tokens), 4)
        self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
        self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

        new_toks_2 = {
            "eos_token": AddedToken(">>>>|||<||<<|<<", rstrip=True, lstrip=True),
            "pad_token": AddedToken("<<<<<|||>|>>>>|>", rstrip=True, lstrip=True),
        }
        added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
        vocab_size_3 = tokenizer.vocab_size
        all_size_3 = len(tokenizer)

        self.assertNotEqual(vocab_size_3, 0)
        self.assertEqual(vocab_size, vocab_size_3)
        self.assertEqual(added_toks_2, len(new_toks_2))
        self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

        tokens = tokenizer.encode(
            ">>>>|||<||<<|<< aaaaa bbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l", add_special_tokens=False
        )

        self.assertGreaterEqual(len(tokens), 6)
        self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
        self.assertGreater(tokens[0], tokens[1])

        self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
        self.assertGreater(tokens[-2], tokens[-3])
        self.assertEqual(tokens[0], tokenizer.eos_token_id)
        self.assertEqual(tokens[-2], tokenizer.pad_token_id)

    def test_add_special_tokens(self):
        self.skipTest(reason="Redundant with test_add_tokens_tokenizer")

    def test_add_tokens(self):
        """Fast tokenizer: ``add_tokens`` / ``add_special_tokens`` return counts and grow the vocab."""
        if not self.test_rust_tokenizer:
            self.skipTest(reason="test_rust_tokenizer is set to False")

        tokenizer_r = self.get_rust_tokenizer()

        vocab_size = len(tokenizer_r)
        self.assertEqual(tokenizer_r.add_tokens(""), 0)
        self.assertEqual(tokenizer_r.add_tokens("testoken"), 1)
        self.assertEqual(tokenizer_r.add_tokens(["testoken1", "testtoken2"]), 2)
        self.assertEqual(len(tokenizer_r), vocab_size + 3)

        self.assertEqual(tokenizer_r.add_special_tokens({}), 0)
        self.assertEqual(tokenizer_r.add_special_tokens({"bos_token": "[BOS]", "eos_token": "[EOS]"}), 2)
        # additional_special_tokens must be a list, not a bare string.
        self.assertRaises(
            AssertionError, tokenizer_r.add_special_tokens, {"additional_special_tokens": "<testtoken1>"}
        )
        self.assertEqual(tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken2>"]}), 1)
        self.assertEqual(
            tokenizer_r.add_special_tokens({"additional_special_tokens": ["<testtoken3>", "<testtoken4>"]}), 2
        )
        self.assertIn("<testtoken3>", tokenizer_r.special_tokens_map["additional_special_tokens"])
        self.assertIsInstance(tokenizer_r.special_tokens_map["additional_special_tokens"], list)
        self.assertGreaterEqual(len(tokenizer_r.special_tokens_map["additional_special_tokens"]), 2)

        self.assertEqual(len(tokenizer_r), vocab_size + 8)

    def test_compare_add_special_tokens(self):
        """``add_special_tokens=True`` adds exactly ``num_special_tokens_to_add`` tokens."""
        if not self.test_rust_tokenizer:
            self.skipTest(reason="test_rust_tokenizer is set to False")

        tokenizer_r = self.get_rust_tokenizer()

        simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False)

        for text in ["", " "]:
            # tokenize()
            no_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=False)
            with_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=True)
            self.assertEqual(
                len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add
            )

            # Single input
            no_special_tokens = tokenizer_r(text, add_special_tokens=False)
            with_special_tokens = tokenizer_r(text, add_special_tokens=True)
            for key in no_special_tokens:
                self.assertEqual(
                    len(no_special_tokens[key]),
                    len(with_special_tokens[key]) - simple_num_special_tokens_to_add,
                )

            # Batched input
            no_special_tokens = tokenizer_r([text, text], add_special_tokens=False)
            with_special_tokens = tokenizer_r([text, text], add_special_tokens=True)
            for key in no_special_tokens:
                for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]):
                    self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add)

    def test_special_tokens_initialization(self):
        """Special tokens passed at init are encoded as single ids."""
        if not self.test_rust_tokenizer:
            self.skipTest(reason="test_rust_tokenizer is set to False")

        added_tokens = [AddedToken("<special>", lstrip=True)]
        tokenizer_r = self.get_rust_tokenizer(additional_special_tokens=added_tokens)
        r_output = tokenizer_r.encode("Hey this is a <special> token")

        special_token_id = tokenizer_r.encode("<special>", add_special_tokens=False)[0]

        self.assertTrue(special_token_id in r_output)

    def test_special_token_addition(self):
        """``additional_special_tokens`` replace by default and append with ``replace_additional_special_tokens=False``."""
        tokenizer = self.get_tokenizer()
        # Create tokenizer and add an additional special token
        tokenizer.add_special_tokens({"additional_special_tokens": ["<tok>"]})
        self.assertEqual(tokenizer.additional_special_tokens, ["<tok>"])
        with tempfile.TemporaryDirectory() as tmp_dir:
            tokenizer.save_pretrained(tmp_dir)
            # Load the above tokenizer and add the same special token a second time
            tokenizer_2 = self.tokenizer_class.from_pretrained(tmp_dir)
            tokenizer_2.add_special_tokens({"additional_special_tokens": ["<tok>"]})
            self.assertEqual(tokenizer_2.additional_special_tokens, ["<tok>"])

            tokenizer_2.add_special_tokens({"additional_special_tokens": ["<tok>", "<other>"]})
            self.assertEqual(tokenizer_2.additional_special_tokens, ["<tok>", "<other>"])

            tokenizer_2.add_special_tokens({"additional_special_tokens": ["<other>", "<another>"]})
            self.assertEqual(tokenizer_2.additional_special_tokens, ["<other>", "<another>"])

            tokenizer_2.add_special_tokens(
                {"additional_special_tokens": ["<tok>"]},
                replace_additional_special_tokens=False,
            )
            self.assertEqual(tokenizer_2.additional_special_tokens, ["<other>", "<another>", "<tok>"])
Loading
Loading