Skip to content

Commit 0e0a75f

Browse files
committed
Load PreTrainedSentencePieceTokenizer as a fallback when no tokenizer class can be determined
1 parent 6c25f26 commit 0e0a75f

File tree

2 files changed

+30
-1
lines changed

2 files changed

+30
-1
lines changed

src/transformers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@
175175
"processing_utils": ["ProcessorMixin"],
176176
"quantizers": [],
177177
"testing_utils": [],
178+
"tokenization_sentencepiece": ["PreTrainedSentencePieceTokenizer"],
178179
"tokenization_utils": ["PreTrainedTokenizer"],
179180
"tokenization_utils_base": [
180181
"AddedToken",
@@ -710,6 +711,7 @@
710711
from .pytorch_utils import prune_layer as prune_layer
711712

712713
# Tokenization
714+
from .tokenization_sentencepiece import PreTrainedSentencePieceTokenizer as PreTrainedSentencePieceTokenizer
713715
from .tokenization_utils import PreTrainedTokenizer as PreTrainedTokenizer
714716
from .tokenization_utils_base import AddedToken as AddedToken
715717
from .tokenization_utils_base import BatchEncoding as BatchEncoding

src/transformers/models/auto/tokenization_auto.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,13 @@
2626
from ...configuration_utils import PretrainedConfig
2727
from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
2828
from ...modeling_gguf_pytorch_utils import load_gguf_checkpoint
29+
from ...tokenization_sentencepiece import PreTrainedSentencePieceTokenizer
2930
from ...tokenization_utils import PreTrainedTokenizer
3031
from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
3132
from ...utils import (
3233
cached_file,
3334
extract_commit_hash,
35+
has_file,
3436
is_g2p_en_available,
3537
is_sentencepiece_available,
3638
is_tokenizers_available,
@@ -356,7 +358,7 @@
356358
(
357359
"llama",
358360
(
359-
"LlamaTokenizer" if is_sentencepiece_available() else None,
361+
None,
360362
"LlamaTokenizerFast" if is_tokenizers_available() else None,
361363
),
362364
),
@@ -1133,6 +1135,31 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
11331135
tokenizer_class_candidate = config_tokenizer_class
11341136
tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
11351137
if tokenizer_class is None:
1138+
try:
1139+
vocab_file_exists = has_file(
1140+
pretrained_model_name_or_path,
1141+
"tokenizer.model",
1142+
revision=kwargs.get("revision", None),
1143+
token=kwargs.get("token", None),
1144+
cache_dir=kwargs.get("cache_dir", None),
1145+
local_files_only=kwargs.get("local_files_only", False),
1146+
)
1147+
except Exception:
1148+
vocab_file_exists = False
1149+
1150+
if vocab_file_exists:
1151+
logger.info(
1152+
"Falling back to PreTrainedSentencePieceTokenizer since tokenizer.model file was found "
1153+
"but no config or tokenizer class could be determined."
1154+
)
1155+
return PreTrainedSentencePieceTokenizer.from_pretrained(
1156+
pretrained_model_name_or_path, *inputs, **kwargs
1157+
)
1158+
1159+
raise ValueError(
1160+
f"Could not load tokenizer from {pretrained_model_name_or_path}. "
1161+
"No tokenizer configuration or model config could be found."
1162+
)
11361163
raise ValueError(
11371164
f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
11381165
)

0 commit comments

Comments (0)