
Commit a94afd9

Unified tokenizer type onboarding

This commit moves the TokenizerType enum out of torchchat/cli/builder.py into its own module, tokenizer/tokenizer_type.py, and attaches the is_tiktoken / is_sentencepiece / is_hf_tokenizer / is_none predicates to the enum itself, so call sites query the tokenizer type directly instead of going through wrapper methods on TokenizerArgs.

1 parent 0299a37

3 files changed: 23 additions, 20 deletions

tokenizer/tokenizer_type.py (16 additions, 0 deletions)

```diff
@@ -0,0 +1,16 @@
+from enum import Enum
+
+class TokenizerType(Enum):
+    NONE = 0
+    TIKTOKEN = 1
+    SENTENCEPIECE = 2
+    HF_TOKENIZER = 3
+
+    def is_tiktoken(self):
+        return self == TokenizerType.TIKTOKEN
+    def is_sentencepiece(self):
+        return self == TokenizerType.SENTENCEPIECE
+    def is_hf_tokenizer(self):
+        return self == TokenizerType.HF_TOKENIZER
+    def is_none(self):
+        return self == TokenizerType.NONE
```
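For reference, a minimal usage sketch of the new enum; the chosen member and the print messages are illustrative, not part of this commit:

```python
from tokenizer.tokenizer_type import TokenizerType

# Branch on the predicate helpers instead of comparing enum members directly.
tokenizer_type = TokenizerType.TIKTOKEN  # hypothetical detected type

if tokenizer_type.is_none():
    raise RuntimeError("no tokenizer was found")
elif tokenizer_type.is_tiktoken():
    print("tiktoken tokenizer")  # llama3-family models take this path
elif tokenizer_type.is_sentencepiece():
    print("sentencepiece tokenizer")
elif tokenizer_type.is_hf_tokenizer():
    print("Hugging Face tokenizer")
```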

torchchat/cli/builder.py (5 additions, 18 deletions)
```diff
@@ -238,11 +238,7 @@ def from_speculative_args(cls, args: argparse.Namespace) -> "BuilderArgs":
         speculative_builder_args.pte_path = None
         return speculative_builder_args

-class TokenizerType(Enum):
-    NONE = 0
-    TIKTOKEN = 1
-    SENTENCEPIECE = 2
-    HF_TOKENIZER = 3
+from tokenizer.tokenizer_type import TokenizerType

 @dataclass
 class TokenizerArgs:
@@ -278,15 +274,6 @@ def __post_init__(self):
         except:
             pass

-    def is_tiktoken(self) -> bool:
-        return self.tokenizer_type == TokenizerType.TIKTOKEN
-
-    def is_sentencepiece(self) -> bool:
-        return self.tokenizer_type == TokenizerType.SENTENCEPIECE
-
-    def is_hf_tokenizer(self) -> bool:
-        return self.tokenizer_type == TokenizerType.HF_TOKENIZER
-
     def validate_model(
         self,
         model: Optional[Model],
@@ -295,12 +282,12 @@ def validate_model(
         if model is None:
             return

-        if self.tokenizer_type == TokenizerType.NONE:
+        if self.tokenizer_type.is_none():
             raise RuntimeError(f"no tokenizer was found at {self.tokenizer_path}")

-        is_tiktoken = self.is_tiktoken()
-        is_sentencepiece = self.is_sentencepiece()
-        is_hf_tokenizer = self.is_hf_tokenizer()
+        is_tiktoken = self.tokenizer_type.is_tiktoken()
+        is_sentencepiece = self.tokenizer_type.is_sentencepiece()
+        is_hf_tokenizer = self.tokenizer_type.is_hf_tokenizer()

         use_tiktoken = model.config.use_tiktoken
         use_hf_tokenizer = model.config.use_hf_tokenizer
```
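The hunk ends just before the cross-check itself; a plausible sketch of what follows, assuming validate_model simply compares the detected tokenizer against the model config flags (the checks and error messages below are illustrative, not from this commit):

```python
# Hypothetical continuation: fail fast when the tokenizer found on disk
# does not match the tokenizer family the model config declares.
if is_tiktoken and not use_tiktoken:
    raise RuntimeError("model expects a non-tiktoken tokenizer, but tiktoken was found")
if is_hf_tokenizer and not use_hf_tokenizer:
    raise RuntimeError("model expects a non-HF tokenizer, but an HF tokenizer was found")
if is_sentencepiece and (use_tiktoken or use_hf_tokenizer):
    raise RuntimeError("model does not expect a sentencepiece tokenizer")
```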

torchchat/generate.py (2 additions, 2 deletions)
```diff
@@ -365,14 +365,14 @@ def __init__(
         # must use tiktokenizer.
         # Piggy backing off of this flag then for now to identify llama3
         # without prompting user.
-        self.is_llama3_model = self.tokenizer_args.is_tiktoken()
+        self.is_llama3_model = self.tokenizer_args.tokenizer_type.is_tiktoken()
         if self.is_llama3_model:
             self.chat_formatter = Llama3ChatFormatter(self.tokenizer)
             if generator_args.chat_mode:
                 logger.debug(
                     "Llama3 model detected in chat mode. Using updated sentence schemas"
                 )
-        elif self.tokenizer_args.is_hf_tokenizer():
+        elif self.tokenizer_args.tokenizer_type.is_hf_tokenizer():
             if not self.tokenizer.has_chat_template():
                 raise ValueError("Tokenizer must have a chat template")
             self.chat_formatter = HFTokenizerChatFormatter(self.tokenizer)
```
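Condensed, the dispatch in Generator.__init__ now reads off the enum; a self-contained sketch of that branch, returning formatter names as strings so the example runs without torchchat installed:

```python
from tokenizer.tokenizer_type import TokenizerType

def pick_chat_formatter(tokenizer_type: TokenizerType) -> str:
    # Mirrors the branch in Generator.__init__: tiktoken stands in for
    # llama3-style models; HF tokenizers must carry a chat template.
    if tokenizer_type.is_tiktoken():
        return "Llama3ChatFormatter"
    if tokenizer_type.is_hf_tokenizer():
        return "HFTokenizerChatFormatter"
    return "no chat formatter"

print(pick_chat_formatter(TokenizerType.TIKTOKEN))  # Llama3ChatFormatter
```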
