|
26 | 26 | from ...configuration_utils import PretrainedConfig |
27 | 27 | from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code |
28 | 28 | from ...modeling_gguf_pytorch_utils import load_gguf_checkpoint |
| 29 | +from ...tokenization_sentencepiece import PreTrainedSentencePieceTokenizer |
29 | 30 | from ...tokenization_utils import PreTrainedTokenizer |
30 | 31 | from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE |
31 | 32 | from ...utils import ( |
32 | 33 | cached_file, |
33 | 34 | extract_commit_hash, |
| 35 | + has_file, |
34 | 36 | is_g2p_en_available, |
35 | 37 | is_sentencepiece_available, |
36 | 38 | is_tokenizers_available, |
|
356 | 358 | ( |
357 | 359 | "llama", |
358 | 360 | ( |
359 | | - "LlamaTokenizer" if is_sentencepiece_available() else None, |
| 361 | + None, |
360 | 362 | "LlamaTokenizerFast" if is_tokenizers_available() else None, |
361 | 363 | ), |
362 | 364 | ), |
@@ -1133,6 +1135,27 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): |
1133 | 1135 | tokenizer_class_candidate = config_tokenizer_class |
1134 | 1136 | tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate) |
1135 | 1137 | if tokenizer_class is None: |
 | 1138 | + try: |
 | 1139 | + vocab_file_exists = has_file( |
 | 1140 | + pretrained_model_name_or_path, |
 | 1141 | + "tokenizer.model", |
 | 1142 | + revision=kwargs.get("revision", None), |
 | 1143 | + token=kwargs.get("token", None), |
 | 1144 | + cache_dir=kwargs.get("cache_dir", None), |
 | 1145 | + local_files_only=kwargs.get("local_files_only", False), |
 | 1146 | + ) |
 | 1147 | + except Exception: |
 | 1148 | + vocab_file_exists = False |
 | 1149 | + |
 | 1150 | + if vocab_file_exists: |
 | 1151 | + logger.info( |
 | 1152 | + "Falling back to PreTrainedSentencePieceTokenizer since tokenizer.model file was found " |
 | 1153 | + "but no config or tokenizer class could be determined." |
 | 1154 | + ) |
 | 1155 | + return PreTrainedSentencePieceTokenizer.from_pretrained( |
 | 1156 | + pretrained_model_name_or_path, *inputs, **kwargs |
 | 1157 | + ) |
 | 1158 | + |
1136 | 1159 | raise ValueError( |
1137 | 1160 | f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported." |
1138 | 1161 | ) |
|
0 commit comments