@@ -1326,108 +1326,6 @@ def decoder(self, replacement, add_prefix_space):
         )
 
 
-class SpmTokenizer:
-    """
-    Base SentencePiece tokenizer that can be instantiated with model-specific arguments.
-    This replaces the converter pattern with direct instantiation.
-    """
-
-    def __init__(
-        self,
-        handle_byte_fallback: bool = True,
-        legacy: bool = False,
-        add_prefix_space: bool = True,
-        special_tokens: Optional[dict] = None,
-        vocab: Optional[callable] = None,
-        unk_id: Optional[callable] = None,
-        normalizer: Optional[callable] = None,
-        pre_tokenizer: Optional[callable] = None,
-        decoder: Optional[callable] = None,
-        post_processor: Optional[callable] = None,
-    ):
-        self.handle_byte_fallback = handle_byte_fallback
-        self.legacy = legacy
-        self.add_prefix_space = add_prefix_space
-        self.special_tokens = special_tokens or {}
-        # Store user-provided callables under private names to avoid clashing with methods
-        self._vocab_fn = vocab
-        self._unk_id_fn = unk_id
-        self._normalizer_fn = normalizer
-        self._pre_tokenizer_fn = pre_tokenizer
-        self._decoder_fn = decoder
-        self._post_processor_fn = post_processor
-
-    def vocab(self):
-        if self._vocab_fn is not None:
-            return self._vocab_fn()
-        # Return empty vocab for training
-        return []
-
-    def unk_id(self):
-        if self._unk_id_fn is not None:
-            return self._unk_id_fn()
-        return 0  # Default unk_id
-
-    def tokenizer(self):
-        # Always create empty trainable tokenizer
-        minimal_vocab = [("<unk>", 0.0)]
-        return Tokenizer(Unigram(minimal_vocab, unk_id=self.unk_id(), byte_fallback=self.handle_byte_fallback))
-
-    def normalizer(self):
-        if self._normalizer_fn is not None:
-            return self._normalizer_fn()
-        _normalizers = [
-            normalizers.Strip(left=False, right=True),
-            normalizers.Replace(Regex(" {2,}"), "▁"),
-        ]
-        return normalizers.Sequence(_normalizers)
-
-    def pre_tokenizer(self, replacement, add_prefix_space):
-        if self._pre_tokenizer_fn is not None:
-            return self._pre_tokenizer_fn(replacement, add_prefix_space)
-
-        prepend_scheme = _get_prepend_scheme(add_prefix_space, self)
-        return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
-
-    def decoder(self, replacement, add_prefix_space):
-        if self._decoder_fn is not None:
-            return self._decoder_fn(replacement, add_prefix_space)
-
-        prepend_scheme = _get_prepend_scheme(add_prefix_space, self)
-        return decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
-
-    def post_processor(self):
-        if self._post_processor_fn is not None:
-            return self._post_processor_fn()
-        return None
-
-    def create_tokenizer(self) -> Tokenizer:
-        """Create and return the configured empty trainable tokenizer."""
-        tokenizer = self.tokenizer()
-
-        # Tokenizer assemble
-        normalizer = self.normalizer()
-        if normalizer is not None:
-            tokenizer.normalizer = normalizer
-
-        replacement = "▁"
-        add_prefix_space = self.add_prefix_space
-
-        pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
-        if pre_tokenizer is not None:
-            tokenizer.pre_tokenizer = pre_tokenizer
-
-        tokenizer.decoder = self.decoder(replacement, add_prefix_space)
-        post_processor = self.post_processor()
-        if post_processor:
-            tokenizer.post_processor = post_processor
-
-        return tokenizer
-
-
-## NOTE: LLaMA-specific converter moved to `models/llama/tokenization_llama_fast.py`.
-## The slow->fast conversion for LLaMA is now handled directly in the fast file.
-
 class LlamaConverter(SpmConverter):
     handle_byte_fallback = True
 
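The deleted docstring describes `SpmTokenizer` as replacing the converter pattern with direct instantiation. A minimal sketch of that usage, reconstructed only from the removed code above; the class no longer exists after this change, and the module-level `tokenizers` imports it relied on are assumed to be in place:

```python
# Sketch reconstructed from the removed SpmTokenizer above; not part of the codebase
# after this commit. Model-specific behaviour is supplied as callables, everything
# else falls back to the Unigram/Metaspace defaults defined in the class.
spm = SpmTokenizer(handle_byte_fallback=True, add_prefix_space=True)

# create_tokenizer() assembles an empty, trainable tokenizer around a minimal
# [("<unk>", 0.0)] Unigram model, with the default normalizer, Metaspace
# pre-tokenizer and Metaspace decoder attached.
empty_trainable = spm.create_tokenizer()
```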
@@ -1473,6 +1371,7 @@ def post_processor(self):
         # the processor is defined in the LlamaTokenizerFast class.
         return None
 
+
 class MarkupLMConverter(Converter):
     def converted(self) -> Tokenizer:
         ot = self.original_tokenizer
@@ -1801,11 +1700,10 @@ def converted(self) -> Tokenizer:
     "XLNetTokenizer": XLNetConverter,
     "SplinterTokenizer": SplinterConverter,
     "XGLMTokenizer": XGLMConverter,
-    # LLaMA converters moved into fast file; slow->fast conversion is handled there.
-    # "LlamaTokenizer": LlamaConverter,
-    # "CodeLlamaTokenizer": LlamaConverter,
+    "LlamaTokenizer": LlamaConverter,
+    "CodeLlamaTokenizer": LlamaConverter,
     "GemmaTokenizer": GemmaConverter,
-    # "Phi3Tokenizer": LlamaConverter,
+    "Phi3Tokenizer": LlamaConverter,
 }
 
 
@@ -1842,4 +1740,4 @@ def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokenizer:
             f"Converting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path "
             f"with a SentencePiece tokenizer.model file."
             f"Currently available slow->fast converters: {list(SLOW_TO_FAST_CONVERTERS.keys())} "
-        )
+        )
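With the `LlamaTokenizer`, `CodeLlamaTokenizer`, and `Phi3Tokenizer` entries restored, `convert_slow_tokenizer` again resolves `LlamaConverter` from the slow tokenizer's class name instead of falling through to the Tiktoken path. A minimal sketch of that dispatch, assuming `transformers` (with `sentencepiece`) is installed; the checkpoint path below is hypothetical:

```python
# Hypothetical checkpoint path; assumes it contains a SentencePiece tokenizer.model
# so the slow LlamaTokenizer can be loaded.
from transformers import LlamaTokenizer
from transformers.convert_slow_tokenizer import (
    SLOW_TO_FAST_CONVERTERS,
    LlamaConverter,
    convert_slow_tokenizer,
)

slow = LlamaTokenizer.from_pretrained("path/to/llama-checkpoint")

# Dispatch is keyed on the slow tokenizer's class name, so the restored
# "LlamaTokenizer" entry selects LlamaConverter.
assert SLOW_TO_FAST_CONVERTERS[type(slow).__name__] is LlamaConverter

fast_backend = convert_slow_tokenizer(slow)  # returns a tokenizers.Tokenizer
```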