
Commit e51e75e

remove tokenizer warning (#42483)
* fix warning
* fix
* remove
1 parent 0c05c3b commit e51e75e
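For context, the block removed below only warned when the tokenizer class recorded in a checkpoint differed from the class from_pretrained was called on; on the common AutoTokenizer path the class is resolved from the checkpoint itself, so the mismatch cannot occur. A minimal sketch of that path, assuming an arbitrary public checkpoint ("bert-base-uncased" here is illustrative, not part of this commit):

from transformers import AutoTokenizer

# AutoTokenizer resolves the tokenizer class from the checkpoint's own
# tokenizer_config.json / config.json, so the loaded class matches what the
# repo declares and the removed mismatch warning has nothing left to catch.
tok = AutoTokenizer.from_pretrained("bert-base-uncased")
print(type(tok).__name__)  # e.g. BertTokenizerFast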

File tree: 1 file changed (+1, -51)

src/transformers/tokenization_utils_base.py

Lines changed: 1 addition & 51 deletions
@@ -1827,16 +1827,14 @@ def _from_pretrained(
         if tokenizer_config_file is not None:
             with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
                 init_kwargs = json.load(tokenizer_config_handle)
-            # First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers.
-            config_tokenizer_class = init_kwargs.get("tokenizer_class")
+            # used in the past to check if the tokenizer class matches the class in the repo
             init_kwargs.pop("tokenizer_class", None)
             if not has_tokenizer_file:
                 init_kwargs.get("tokenizer_file", None)
             saved_init_inputs = init_kwargs.pop("init_inputs", ())
             if not init_inputs:
                 init_inputs = saved_init_inputs
         else:
-            config_tokenizer_class = None
             init_kwargs = init_configuration
 
         # If independent chat template file(s) exist, they take priority over template entries in the tokenizer config
@@ -1864,54 +1862,6 @@ def _from_pretrained(
             if isinstance(init_kwargs["auto_map"], (tuple, list)):
                 init_kwargs["auto_map"] = {"AutoTokenizer": init_kwargs["auto_map"]}
 
-        if config_tokenizer_class is None:
-            # Matt: This entire block is only used to decide if the tokenizer class matches the class in the repo.
-            # If not, it raises a warning, but otherwise continues. Since we mostly load tokenizers with
-            # AutoTokenizer these days, it seems like a lot of work (and a source of bugs) for little gain.
-            # Maybe we can just remove this entirely?
-            from .models.auto.configuration_auto import AutoConfig  # tests_ignore
-
-            # Second attempt. If we have not yet found tokenizer_class, let's try to use the config.
-            try:
-                config = AutoConfig.from_pretrained(
-                    pretrained_model_name_or_path,
-                    token=token,
-                    cache_dir=cache_dir,
-                    local_files_only=local_files_only,
-                    trust_remote_code=trust_remote_code,
-                    _commit_hash=_commit_hash,
-                )
-                config_tokenizer_class = config.tokenizer_class
-            except (OSError, ValueError, KeyError):
-                # skip if an error occurred.
-                config = None
-            if config_tokenizer_class is None:
-                # Third attempt. If we have not yet found the original type of the tokenizer,
-                # we are loading we see if we can infer it from the type of the configuration file
-                from .models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES  # tests_ignore
-
-                if hasattr(config, "model_type"):
-                    model_type = config.model_type
-                else:
-                    # Fallback: use pattern matching on the string.
-                    model_type = None
-                    for pattern in TOKENIZER_MAPPING_NAMES:
-                        if pattern in str(pretrained_model_name_or_path):
-                            model_type = pattern
-                            break
-
-                if model_type is not None:
-                    config_tokenizer_class = TOKENIZER_MAPPING_NAMES.get(model_type)
-
-            if config_tokenizer_class is not None:
-                if cls.__name__.replace("Fast", "") != config_tokenizer_class.replace("Fast", ""):
-                    logger.warning(
-                        "The tokenizer class you load from this checkpoint is not the same type as the class this"
-                        " function is called from. It may result in unexpected tokenization. \nThe tokenizer class you"
-                        f" load from this checkpoint is '{config_tokenizer_class}'. \nThe class this function is called"
-                        f" from is '{cls.__name__}'."
-                    )
-
         # Preserve extra_special_tokens from tokenizer_config.json before updating with kwargs
         # extra_special_tokens should be a list (user-defined extra tokens)
         extra_special_tokens_from_config = init_kwargs.get("extra_special_tokens")
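If the old mismatch warning is still wanted after this removal, the same check can be reproduced outside the library with public APIs. A rough sketch under that assumption; the helper name and message wording are illustrative, not part of transformers:

import logging

from transformers import AutoConfig

logger = logging.getLogger(__name__)


def warn_on_tokenizer_class_mismatch(tokenizer_cls, checkpoint):
    # Hypothetical helper mirroring the removed block, but using only AutoConfig.
    try:
        expected = AutoConfig.from_pretrained(checkpoint).tokenizer_class
    except (OSError, ValueError, KeyError):
        return  # the removed code also skipped the check on these errors
    if expected and tokenizer_cls.__name__.replace("Fast", "") != expected.replace("Fast", ""):
        logger.warning(
            "Checkpoint %s declares tokenizer class %s, but it is being loaded with %s.",
            checkpoint,
            expected,
            tokenizer_cls.__name__,
        )

Calling it right before SomeTokenizer.from_pretrained(checkpoint) restores roughly the old behaviour without relying on library internals.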

0 commit comments