@@ -1827,16 +1827,14 @@ def _from_pretrained(
         if tokenizer_config_file is not None:
             with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
                 init_kwargs = json.load(tokenizer_config_handle)
-            # First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers.
-            config_tokenizer_class = init_kwargs.get("tokenizer_class")
+            # used in the past to check if the tokenizer class matches the class in the repo
             init_kwargs.pop("tokenizer_class", None)
             if not has_tokenizer_file:
                 init_kwargs.get("tokenizer_file", None)
             saved_init_inputs = init_kwargs.pop("init_inputs", ())
             if not init_inputs:
                 init_inputs = saved_init_inputs
         else:
-            config_tokenizer_class = None
             init_kwargs = init_configuration
 
         # If independent chat template file(s) exist, they take priority over template entries in the tokenizer config
@@ -1864,54 +1862,6 @@ def _from_pretrained(
             if isinstance(init_kwargs["auto_map"], (tuple, list)):
                 init_kwargs["auto_map"] = {"AutoTokenizer": init_kwargs["auto_map"]}
 
-        if config_tokenizer_class is None:
-            # Matt: This entire block is only used to decide if the tokenizer class matches the class in the repo.
-            # If not, it raises a warning, but otherwise continues. Since we mostly load tokenizers with
-            # AutoTokenizer these days, it seems like a lot of work (and a source of bugs) for little gain.
-            # Maybe we can just remove this entirely?
-            from .models.auto.configuration_auto import AutoConfig  # tests_ignore
-
-            # Second attempt. If we have not yet found tokenizer_class, let's try to use the config.
-            try:
-                config = AutoConfig.from_pretrained(
-                    pretrained_model_name_or_path,
-                    token=token,
-                    cache_dir=cache_dir,
-                    local_files_only=local_files_only,
-                    trust_remote_code=trust_remote_code,
-                    _commit_hash=_commit_hash,
-                )
-                config_tokenizer_class = config.tokenizer_class
-            except (OSError, ValueError, KeyError):
-                # skip if an error occurred.
-                config = None
-            if config_tokenizer_class is None:
-                # Third attempt. If we have not yet found the original type of the tokenizer,
-                # we are loading we see if we can infer it from the type of the configuration file
-                from .models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES  # tests_ignore
-
-                if hasattr(config, "model_type"):
-                    model_type = config.model_type
-                else:
-                    # Fallback: use pattern matching on the string.
-                    model_type = None
-                    for pattern in TOKENIZER_MAPPING_NAMES:
-                        if pattern in str(pretrained_model_name_or_path):
-                            model_type = pattern
-                            break
-
-                if model_type is not None:
-                    config_tokenizer_class = TOKENIZER_MAPPING_NAMES.get(model_type)
-
-        if config_tokenizer_class is not None:
-            if cls.__name__.replace("Fast", "") != config_tokenizer_class.replace("Fast", ""):
-                logger.warning(
-                    "The tokenizer class you load from this checkpoint is not the same type as the class this"
-                    " function is called from. It may result in unexpected tokenization. \nThe tokenizer class you"
-                    f" load from this checkpoint is '{config_tokenizer_class}'. \nThe class this function is called"
-                    f" from is '{cls.__name__}'."
-                )
-
         # Preserve extra_special_tokens from tokenizer_config.json before updating with kwargs
         # extra_special_tokens should be a list (user-defined extra tokens)
         extra_special_tokens_from_config = init_kwargs.get("extra_special_tokens")
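
Note (not part of the diff): the removed block only compared the calling class against the tokenizer class recorded for the checkpoint and, on mismatch, logged a warning before continuing. A minimal sketch of the loading path the removed comment recommends instead, assuming the transformers package is installed and using the public "gpt2" checkpoint purely as an illustrative example:

from transformers import AutoTokenizer

# AutoTokenizer resolves the concrete tokenizer class from the checkpoint's own
# config, so the caller never has to pick a class by hand and the removed
# mismatch warning has nothing to flag on this path.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
print(type(tokenizer).__name__)  # GPT2TokenizerFast when the tokenizers backend is available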