rm slow tokenizers #40936
@@ -0,0 +1,185 @@
# Copyright 2025 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utilities for creating fast tokenizers from scratch.
"""
from typing import Optional

from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers
from tokenizers.models import BPE, Unigram

from .utils import is_protobuf_available, is_sentencepiece_available, logging, requires_backends


def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str:
    if add_prefix_space:
        prepend_scheme = "always"
        if not getattr(original_tokenizer, "legacy", True):
            prepend_scheme = "first"
    else:
        prepend_scheme = "never"
    return prepend_scheme
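For reference, the returned `prepend_scheme` is passed straight to the `tokenizers` Metaspace components used further down. A minimal sketch of its effect, assuming a recent `tokenizers` release where `Metaspace` accepts `prepend_scheme` (this example is not part of the diff):

# "always" prepends the replacement even when the text has no leading space;
# "first" does so only for the very first segment; "never" leaves the text untouched.
from tokenizers import pre_tokenizers

pt = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always")
print(pt.pre_tokenize_str("Hello world"))  # [("▁Hello", ...), ("▁world", ...)]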
def generate_merges(vocab, vocab_scores: Optional[dict[str, float]] = None):
    reverse = vocab_scores is not None
    vocab_scores = dict(vocab_scores) if reverse else vocab

    merges = []
    for merge, piece_score in vocab_scores.items():
        local = []
        for index in range(1, len(merge)):
            piece_l, piece_r = merge[:index], merge[index:]
            if piece_l in vocab and piece_r in vocab:
                local.append((piece_l, piece_r, piece_score))
        local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]]))
        merges.extend(local)

    merges = sorted(merges, key=lambda val: (val[2], len(val[0]), len(val[1])), reverse=reverse)
    merges = [(val[0], val[1]) for val in merges]
    return merges
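A toy illustration of how merges fall out of the scores, using a made-up three-piece vocabulary (not part of the PR):

vocab = {"a": 0, "b": 1, "ab": 2}
scores = {"a": -1.0, "b": -2.0, "ab": -3.0}
# "ab" is the only piece that splits into two in-vocab halves ("a", "b"),
# so exactly one merge is produced, ordered by piece score.
assert generate_merges(vocab, scores) == [("a", "b")]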
class SentencePieceExtractor:
    """
    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
    """

    def __init__(self, model: str):
        requires_backends(self, "sentencepiece")
        from sentencepiece import SentencePieceProcessor

        self.sp = SentencePieceProcessor()
        self.sp.Load(model)

    def extract(self, vocab_scores=None) -> tuple[dict[str, int], list[tuple]]:
        """
        By default, returns the vocab and merges in vocab order; when `vocab_scores` is passed, the merges are
        ordered by piece score instead.
        """
        sp = self.sp
        vocab = {sp.id_to_piece(index): index for index in range(sp.GetPieceSize())}

        # let's get the vocab_scores
        vocab_scores = {sp.id_to_piece(i): sp.get_score(i) for i in range(sp.GetPieceSize())}

        merges = generate_merges(vocab, vocab_scores)

        return vocab, merges
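Typical use, assuming the `sentencepiece` backend is installed and a trained model file exists at a hypothetical path (not part of the diff):

extractor = SentencePieceExtractor("path/to/tokenizer.model")
vocab, merges = extractor.extract()  # dict[str, int], list[tuple[str, str]]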
class SpmTokenizer:
    """
    Base SentencePiece tokenizer that can be instantiated with model-specific arguments.
    """

    def __init__(
        self,
        handle_byte_fallback: bool = True,
        legacy: bool = False,
        add_prefix_space: bool = True,
        special_tokens: Optional[dict] = None,
        vocab: Optional[callable] = None,
        unk_id: Optional[callable] = None,
        normalizer: Optional[callable] = None,
        pre_tokenizer: Optional[callable] = None,
        decoder: Optional[callable] = None,
        post_processor: Optional[callable] = None,
        tokenizer: Optional[callable] = None,
    ):
        self.handle_byte_fallback = handle_byte_fallback
        self.legacy = legacy
        self.add_prefix_space = add_prefix_space
        self.special_tokens = special_tokens or {}
        # Store user-provided callables under private names to avoid clashing with methods
        self._vocab_fn = vocab
        self._unk_id_fn = unk_id
        self._normalizer_fn = normalizer
        self._pre_tokenizer_fn = pre_tokenizer
        self._decoder_fn = decoder
        self._post_processor_fn = post_processor
        self._tokenizer_fn = tokenizer

    def vocab(self):
        if self._vocab_fn is not None:
            return self._vocab_fn()
        # Return empty vocab for training
        return []

    def unk_id(self):
        if self._unk_id_fn is not None:
            return self._unk_id_fn()
        return 0  # Default unk_id

    def tokenizer(self):
        # Always create empty trainable tokenizer
        minimal_vocab = [("<unk>", 0.0)]
        return Tokenizer(Unigram(minimal_vocab, unk_id=self.unk_id(), byte_fallback=self.handle_byte_fallback))

    def normalizer(self):
        if self._normalizer_fn is not None:
            return self._normalizer_fn()
        _normalizers = [
            normalizers.Strip(left=False, right=True),
            normalizers.Replace(Regex(" {2,}"), "▁"),
        ]
        return normalizers.Sequence(_normalizers)
    def pre_tokenizer(self, replacement, add_prefix_space):
        if self._pre_tokenizer_fn is not None:
            return self._pre_tokenizer_fn(replacement, add_prefix_space)

        prepend_scheme = _get_prepend_scheme(add_prefix_space, self)
        return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)

    def decoder(self, replacement, add_prefix_space):
        if self._decoder_fn is not None:
            return self._decoder_fn(replacement, add_prefix_space)

        prepend_scheme = _get_prepend_scheme(add_prefix_space, self)
        return decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)

    def post_processor(self):
        if self._post_processor_fn is not None:
            return self._post_processor_fn()
        return None

    def create_tokenizer(self) -> Tokenizer:
        """Create and return the configured empty trainable tokenizer."""
        if self._tokenizer_fn is not None:
            tokenizer = self._tokenizer_fn()
        else:
            tokenizer = self.tokenizer()

        # Tokenizer assembly
        normalizer = self.normalizer()
        if normalizer is not None:
            tokenizer.normalizer = normalizer

        replacement = "▁"
        add_prefix_space = self.add_prefix_space

        pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
        if pre_tokenizer is not None:
            tokenizer.pre_tokenizer = pre_tokenizer

        tokenizer.decoder = self.decoder(replacement, add_prefix_space)
        post_processor = self.post_processor()
        if post_processor:
            tokenizer.post_processor = post_processor

        return tokenizer


__all__ = ["SpmTokenizer", "_get_prepend_scheme"]
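End to end, the class yields an empty, trainable tokenizer. A minimal sketch of building and training one, assuming the `tokenizers` library's `UnigramTrainer` (the corpus and sizes here are made up, not from the PR):

from tokenizers.trainers import UnigramTrainer

# All defaults; a custom normalizer could instead be injected via
# SpmTokenizer(normalizer=lambda: normalizers.Lowercase()).
tokenizer = SpmTokenizer().create_tokenizer()
trainer = UnigramTrainer(vocab_size=100, special_tokens=["<unk>"], unk_token="<unk>")
tokenizer.train_from_iterator(["hello world", "hello tokenizers"], trainer=trainer)
print(tokenizer.encode("hello world").tokens)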
src/transformers/models/auto/tokenization_auto.py

@@ -26,11 +26,13 @@
 from ...configuration_utils import PretrainedConfig
 from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
 from ...modeling_gguf_pytorch_utils import load_gguf_checkpoint
+from ...tokenization_sentencepiece import PreTrainedSentencePieceTokenizer
 from ...tokenization_utils import PreTrainedTokenizer
 from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
 from ...utils import (
     cached_file,
     extract_commit_hash,
+    has_file,
     is_g2p_en_available,
     is_sentencepiece_available,
     is_tokenizers_available,
@@ -46,13 +48,11 @@
     replace_list_option_in_docstrings,
 )


 if is_tokenizers_available():
     from ...tokenization_utils_fast import PreTrainedTokenizerFast
 else:
     PreTrainedTokenizerFast = None


 logger = logging.get_logger(__name__)

 # Explicit rather than inferred generics to significantly improves completion suggestion performance for language servers.
@@ -356,7 +356,7 @@
     (
         "llama",
         (
-            "LlamaTokenizer" if is_sentencepiece_available() else None,
+            None,
             "LlamaTokenizerFast" if is_tokenizers_available() else None,
         ),
     ),
@@ -818,16 +818,16 @@ def tokenizer_class_from_name(class_name: str) -> Union[type[Any], None]:
 def get_tokenizer_config(
-        pretrained_model_name_or_path: Union[str, os.PathLike[str]],
-        cache_dir: Optional[Union[str, os.PathLike[str]]] = None,
-        force_download: bool = False,
-        resume_download: Optional[bool] = None,
-        proxies: Optional[dict[str, str]] = None,
-        token: Optional[Union[bool, str]] = None,
-        revision: Optional[str] = None,
-        local_files_only: bool = False,
-        subfolder: str = "",
-        **kwargs,
+    pretrained_model_name_or_path: Union[str, os.PathLike[str]],
+    cache_dir: Optional[Union[str, os.PathLike[str]]] = None,
+    force_download: bool = False,
+    resume_download: Optional[bool] = None,
+    proxies: Optional[dict[str, str]] = None,
+    token: Optional[Union[bool, str]] = None,
+    revision: Optional[str] = None,
+    local_files_only: bool = False,
+    subfolder: str = "",
+    **kwargs,
 ) -> dict[str, Any]:
     """
     Loads the tokenizer configuration from a pretrained model tokenizer configuration.
@@ -1098,11 +1098,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
         has_remote_code = tokenizer_auto_map is not None
         has_local_code = type(config) in TOKENIZER_MAPPING or (
-                config_tokenizer_class is not None
-                and (
-                    tokenizer_class_from_name(config_tokenizer_class) is not None
-                    or tokenizer_class_from_name(config_tokenizer_class + "Fast") is not None
-                )
+            config_tokenizer_class is not None
+            and (
+                tokenizer_class_from_name(config_tokenizer_class) is not None
+                or tokenizer_class_from_name(config_tokenizer_class + "Fast") is not None
+            )
         )
         if has_remote_code:
             if use_fast and tokenizer_auto_map[1] is not None:
@@ -1133,6 +1133,31 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
                 tokenizer_class_candidate = config_tokenizer_class
                 tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
             if tokenizer_class is None:
+                try:
+                    vocab_file_exists = has_file(
+                        pretrained_model_name_or_path,
+                        "tokenizer.model",
+                        revision=kwargs.get("revision", None),
+                        token=kwargs.get("token", None),
+                        cache_dir=kwargs.get("cache_dir", None),
+                        local_files_only=kwargs.get("local_files_only", False),
+                    )
+                except Exception:
+                    vocab_file_exists = False
+
+                if vocab_file_exists:
+                    logger.info(
+                        "Falling back to PreTrainedSentencePieceTokenizer since tokenizer.model file was found "
+                        "but no config or tokenizer class could be determined."
+                    )
+                    return PreTrainedSentencePieceTokenizer.from_pretrained(
+                        pretrained_model_name_or_path, *inputs, **kwargs
+                    )
+
+                raise ValueError(
+                    f"Could not load tokenizer from {pretrained_model_name_or_path}. "
+                    "No tokenizer configuration or model config could be found."
+                )
                 raise ValueError(
                     f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
                 )
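In practice, this fallback means a checkpoint that ships only a SentencePiece `tokenizer.model` (no `tokenizer_config.json` and no recognizable tokenizer class) can still be loaded through AutoTokenizer. A sketch with a hypothetical repo id (not from the PR):

from transformers import AutoTokenizer

# "some-org/spm-only-checkpoint" is a made-up repo containing only tokenizer.model;
# AutoTokenizer would now fall back to PreTrainedSentencePieceTokenizer for it.
tokenizer = AutoTokenizer.from_pretrained("some-org/spm-only-checkpoint")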