
Commit 9511458

create_fast_tokenizer file
1 parent f16b430 commit 9511458

File tree

3 files changed: +138 −108 lines changed


src/transformers/convert_slow_tokenizer.py

Lines changed: 5 additions & 107 deletions
@@ -1326,108 +1326,6 @@ def decoder(self, replacement, add_prefix_space):
         )
 
 
-class SpmTokenizer:
-    """
-    Base SentencePiece tokenizer that can be instantiated with model-specific arguments.
-    This replaces the converter pattern with direct instantiation.
-    """
-
-    def __init__(
-        self,
-        handle_byte_fallback: bool = True,
-        legacy: bool = False,
-        add_prefix_space: bool = True,
-        special_tokens: Optional[dict] = None,
-        vocab: Optional[callable] = None,
-        unk_id: Optional[callable] = None,
-        normalizer: Optional[callable] = None,
-        pre_tokenizer: Optional[callable] = None,
-        decoder: Optional[callable] = None,
-        post_processor: Optional[callable] = None,
-    ):
-        self.handle_byte_fallback = handle_byte_fallback
-        self.legacy = legacy
-        self.add_prefix_space = add_prefix_space
-        self.special_tokens = special_tokens or {}
-        # Store user-provided callables under private names to avoid clashing with methods
-        self._vocab_fn = vocab
-        self._unk_id_fn = unk_id
-        self._normalizer_fn = normalizer
-        self._pre_tokenizer_fn = pre_tokenizer
-        self._decoder_fn = decoder
-        self._post_processor_fn = post_processor
-
-    def vocab(self):
-        if self._vocab_fn is not None:
-            return self._vocab_fn()
-        # Return empty vocab for training
-        return []
-
-    def unk_id(self):
-        if self._unk_id_fn is not None:
-            return self._unk_id_fn()
-        return 0  # Default unk_id
-
-    def tokenizer(self):
-        # Always create empty trainable tokenizer
-        minimal_vocab = [("<unk>", 0.0)]
-        return Tokenizer(Unigram(minimal_vocab, unk_id=self.unk_id(), byte_fallback=self.handle_byte_fallback))
-
-    def normalizer(self):
-        if self._normalizer_fn is not None:
-            return self._normalizer_fn()
-        _normalizers = [
-            normalizers.Strip(left=False, right=True),
-            normalizers.Replace(Regex(" {2,}"), "▁"),
-        ]
-        return normalizers.Sequence(_normalizers)
-
-    def pre_tokenizer(self, replacement, add_prefix_space):
-        if self._pre_tokenizer_fn is not None:
-            return self._pre_tokenizer_fn(replacement, add_prefix_space)
-
-        prepend_scheme = _get_prepend_scheme(add_prefix_space, self)
-        return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
-
-    def decoder(self, replacement, add_prefix_space):
-        if self._decoder_fn is not None:
-            return self._decoder_fn(replacement, add_prefix_space)
-
-        prepend_scheme = _get_prepend_scheme(add_prefix_space, self)
-        return decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
-
-    def post_processor(self):
-        if self._post_processor_fn is not None:
-            return self._post_processor_fn()
-        return None
-
-    def create_tokenizer(self) -> Tokenizer:
-        """Create and return the configured empty trainable tokenizer."""
-        tokenizer = self.tokenizer()
-
-        # Tokenizer assemble
-        normalizer = self.normalizer()
-        if normalizer is not None:
-            tokenizer.normalizer = normalizer
-
-        replacement = "▁"
-        add_prefix_space = self.add_prefix_space
-
-        pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
-        if pre_tokenizer is not None:
-            tokenizer.pre_tokenizer = pre_tokenizer
-
-        tokenizer.decoder = self.decoder(replacement, add_prefix_space)
-        post_processor = self.post_processor()
-        if post_processor:
-            tokenizer.post_processor = post_processor
-
-        return tokenizer
-
-
-## NOTE: LLaMA-specific converter moved to `models/llama/tokenization_llama_fast.py`.
-## The slow->fast conversion for LLaMA is now handled directly in the fast file.
-
 class LlamaConverter(SpmConverter):
     handle_byte_fallback = True
 
@@ -1473,6 +1371,7 @@ def post_processor(self):
         # the processor is defined in the LlamaTokenizerFast class.
         return None
 
+
 class MarkupLMConverter(Converter):
     def converted(self) -> Tokenizer:
         ot = self.original_tokenizer
@@ -1801,11 +1700,10 @@ def converted(self) -> Tokenizer:
     "XLNetTokenizer": XLNetConverter,
     "SplinterTokenizer": SplinterConverter,
    "XGLMTokenizer": XGLMConverter,
-    # LLaMA converters moved into fast file; slow->fast conversion is handled there.
-    # "LlamaTokenizer": LlamaConverter,
-    # "CodeLlamaTokenizer": LlamaConverter,
+    "LlamaTokenizer": LlamaConverter,
+    "CodeLlamaTokenizer": LlamaConverter,
     "GemmaTokenizer": GemmaConverter,
-    # "Phi3Tokenizer": LlamaConverter,
+    "Phi3Tokenizer": LlamaConverter,
 }
 
 
@@ -1842,4 +1740,4 @@ def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokenizer:
         f"Converting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path "
         f"with a SentencePiece tokenizer.model file."
         f"Currently available slow->fast converters: {list(SLOW_TO_FAST_CONVERTERS.keys())}"
-    )
+    )
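
For orientation, the mapping restored above is the lookup table that convert_slow_tokenizer uses to pick a converter class for a given slow tokenizer. A minimal sketch of that dispatch pattern, not the file's exact code (the function name and error text are illustrative):

from tokenizers import Tokenizer

def convert_slow_tokenizer_sketch(slow_tokenizer) -> Tokenizer:
    # Look up the converter class by the slow tokenizer's class name,
    # e.g. "LlamaTokenizer" -> LlamaConverter, then build the fast backend.
    name = slow_tokenizer.__class__.__name__
    converter_cls = SLOW_TO_FAST_CONVERTERS.get(name)
    if converter_cls is None:
        raise ValueError(f"No slow->fast converter registered for {name}.")
    return converter_cls(slow_tokenizer).converted()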
src/transformers/create_fast_tokenizer.py

Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utilities for creating fast tokenizers from scratch.
+"""
+
+from typing import Optional
+
+from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers
+from tokenizers.models import BPE, Unigram
+
+
+def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str:
+    if add_prefix_space:
+        prepend_scheme = "always"
+        if not getattr(original_tokenizer, "legacy", True):
+            prepend_scheme = "first"
+    else:
+        prepend_scheme = "never"
+    return prepend_scheme
+
+
+class SpmTokenizer:
+    """
+    Base SentencePiece tokenizer that can be instantiated with model-specific arguments.
+    """
+
+    def __init__(
+        self,
+        handle_byte_fallback: bool = True,
+        legacy: bool = False,
+        add_prefix_space: bool = True,
+        special_tokens: Optional[dict] = None,
+        vocab: Optional[callable] = None,
+        unk_id: Optional[callable] = None,
+        normalizer: Optional[callable] = None,
+        pre_tokenizer: Optional[callable] = None,
+        decoder: Optional[callable] = None,
+        post_processor: Optional[callable] = None,
+    ):
+        self.handle_byte_fallback = handle_byte_fallback
+        self.legacy = legacy
+        self.add_prefix_space = add_prefix_space
+        self.special_tokens = special_tokens or {}
+        # Store user-provided callables under private names to avoid clashing with methods
+        self._vocab_fn = vocab
+        self._unk_id_fn = unk_id
+        self._normalizer_fn = normalizer
+        self._pre_tokenizer_fn = pre_tokenizer
+        self._decoder_fn = decoder
+        self._post_processor_fn = post_processor
+
+    def vocab(self):
+        if self._vocab_fn is not None:
+            return self._vocab_fn()
+        # Return empty vocab for training
+        return []
+
+    def unk_id(self):
+        if self._unk_id_fn is not None:
+            return self._unk_id_fn()
+        return 0  # Default unk_id
+
+    def tokenizer(self):
+        # Always create empty trainable tokenizer
+        minimal_vocab = [("<unk>", 0.0)]
+        return Tokenizer(Unigram(minimal_vocab, unk_id=self.unk_id(), byte_fallback=self.handle_byte_fallback))
+
+    def normalizer(self):
+        if self._normalizer_fn is not None:
+            return self._normalizer_fn()
+        _normalizers = [
+            normalizers.Strip(left=False, right=True),
+            normalizers.Replace(Regex(" {2,}"), "▁"),
+        ]
+        return normalizers.Sequence(_normalizers)
+
+    def pre_tokenizer(self, replacement, add_prefix_space):
+        if self._pre_tokenizer_fn is not None:
+            return self._pre_tokenizer_fn(replacement, add_prefix_space)
+
+        prepend_scheme = _get_prepend_scheme(add_prefix_space, self)
+        return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
+
+    def decoder(self, replacement, add_prefix_space):
+        if self._decoder_fn is not None:
+            return self._decoder_fn(replacement, add_prefix_space)
+
+        prepend_scheme = _get_prepend_scheme(add_prefix_space, self)
+        return decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
+
+    def post_processor(self):
+        if self._post_processor_fn is not None:
+            return self._post_processor_fn()
+        return None
+
+    def create_tokenizer(self) -> Tokenizer:
+        """Create and return the configured empty trainable tokenizer."""
+        tokenizer = self.tokenizer()
+
+        # Tokenizer assemble
+        normalizer = self.normalizer()
+        if normalizer is not None:
+            tokenizer.normalizer = normalizer
+
+        replacement = "▁"
+        add_prefix_space = self.add_prefix_space
+
+        pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
+        if pre_tokenizer is not None:
+            tokenizer.pre_tokenizer = pre_tokenizer
+
+        tokenizer.decoder = self.decoder(replacement, add_prefix_space)
+        post_processor = self.post_processor()
+        if post_processor:
+            tokenizer.post_processor = post_processor
+
+        return tokenizer
+
+
+__all__ = ["SpmTokenizer", "_get_prepend_scheme"]
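
As a usage illustration of the new module, the sketch below instantiates SpmTokenizer, builds the empty trainable backend, and trains it with the tokenizers library's UnigramTrainer. The module path is inferred from the relative import shown in the LLaMA fast tokenizer below, and the corpus, vocabulary size, and special tokens are illustrative assumptions, not part of this commit.

from tokenizers.trainers import UnigramTrainer

from transformers.create_fast_tokenizer import SpmTokenizer  # path assumed from the import below

# Build the empty Unigram backend: its vocab is just [("<unk>", 0.0)] until trained.
spm = SpmTokenizer(handle_byte_fallback=True, add_prefix_space=True)
backend = spm.create_tokenizer()

# Train the Unigram model on a toy in-memory corpus (all values are illustrative).
trainer = UnigramTrainer(vocab_size=100, special_tokens=["<unk>", "<s>", "</s>"], unk_token="<unk>")
backend.train_from_iterator(["Hello world", "SentencePiece-style tokenization"], trainer=trainer)

print(backend.encode("Hello world").tokens)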

src/transformers/models/llama/tokenization_llama_fast.py

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@
 
 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import is_sentencepiece_available, logging, requires_backends
-from ...convert_slow_tokenizer import _get_prepend_scheme, SpmTokenizer
+from ...create_fast_tokenizer import SpmTokenizer, _get_prepend_scheme
 
 
 logger = logging.get_logger(__name__)
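
The hunk above shows only the import swap; how the fast LLaMA tokenizer actually wires these helpers together is not visible in this commit. A purely hypothetical sketch of the hook pattern SpmTokenizer exposes, where a model file overrides a single pipeline piece through the constructor callables:

from tokenizers import normalizers

# Hypothetical override (not from this commit): swap in a custom normalizer while
# keeping SpmTokenizer's default pre-tokenizer, decoder, and post-processor.
spm = SpmTokenizer(
    legacy=False,
    add_prefix_space=True,
    normalizer=lambda: normalizers.Replace(" ", "▁"),
)
backend = spm.create_tokenizer()
# backend.normalizer is now the custom Replace normalizer; the Metaspace
# pre-tokenizer still uses _get_prepend_scheme(add_prefix_space, spm) == "first",
# because legacy=False and add_prefix_space=True.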
