diff --git a/MIGRATION_GUIDE_V5.md b/MIGRATION_GUIDE_V5.md index 9c73c84f6414..474aa478e2f3 100644 --- a/MIGRATION_GUIDE_V5.md +++ b/MIGRATION_GUIDE_V5.md @@ -74,6 +74,207 @@ While this is being implemented, expect varying levels of support across differe Linked PR: https://github.com/huggingface/transformers/pull/41580 + + + +## Tokenization + +Just as we moved towards a single backend library for model definition, we want `Tokenizer` to be a lot more intuitive. +With v5, you can now initialize an empty `LlamaTokenizer` and train it directly on your new task! + +Defining a new tokenizer object should be as simple as this: +```python +from transformers import TokenizersBackend, generate_merges +from tokenizers import pre_tokenizers, Tokenizer +from tokenizers.model import BPE + +class Llama5Tokenizer(TokenizersBackend): + def __init__(self, unk_token="",bos_token="", eos_token="", vocab=None, merges=None ): + if vocab is None: + self._vocab = { + str(unk_token): 0, + str(bos_token): 1, + str(eos_token): 2, + } + + else: + self._vocab = vocab + + if merges is not None: + self._merges = merges + else: + self._merges = generate_merges(filtered_vocab) + + self._tokenizer = Tokenizer( + BPE(vocab=self._vocab, merges=self._merges, fuse_unk=True) + ) + self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace( + replacement="▁", prepend_scheme=_get_prepend_scheme(self.add_prefix_space, self), split=False + ) + super().__init__( + tokenizer_object=self._tokenizer, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + ) +``` + +And now if you call `Llama5Tokenizer()` you just get an empty, trainable tokenizer that follows the definition of the authors of `Llama5` (it does not exist yet :wink:). + +The above is the main motivation towards refactoring tokenization: we want people to just instantiate a tokenizer like they would a model, empty or not and with exactly what they defined. + +### Non-tokenizers +If you tokenizers is not common, or you just don't want to rely on `sentencepiece` nor `tokenizers` you can just import the `PythonBackend` (previousl `PreTrainedTokenzier`) which has all the API and logic for added tokens, encoding and decoding wieht them etc. + +If you want to have en less features, you can use the common `PreTrainedTokenizerBase` mixin, which mostly defines `transformers` tokenizer API: `encode`, `decode`, `vocab_size`, `get_vocab`, `convert_tokens_to_ids`, `convert_ids_to_tokens`, `from_pretrained`, `save_pretrained`, etc. + +### Backend Architecture Changes + +**Moving away from "slow" vs "fast" tokenizers:** + +Previously, transformers maintained two parallel implementations for many tokenizers: +- "Slow" tokenizers (`tokenization_.py`) - Python-based implementations, often using [SentencePiece](https://github.com/google/sentencepiece) as the backend. +- "Fast" tokenizers (`tokenization__fast.py`) - Rust-based implementations using the 🤗 [tokenizers](https://github.com/huggingface/tokenizers) library. + +In v5, we consolidate to a single tokenizer file per model: `tokenization_.py`. This file will use the most appropriate backend available: + +1. **TokenizersBackend** (preferred): Rust-based tokenizers from the 🤗 [tokenizers](https://github.com/huggingface/tokenizers) library. In general its performances are better, but it also offers a lot more features that are comonly adopted across the ecosystem, like handling additional tokens, easily update the state of the tokenizer, automatic parallelisation etc. +2. **SentencePieceBackend**: For models requiring SentencePiece +3. **PythonBackend**: Pure Python implementations +4. **MistralCommonBackend**: Relies on `MistralCommon`'s toknenization library. (Previously `MistralCommonTokenizer`) + +The `AutoTokenizer` automatically selects the appropriate backend based on available files and dependencies. This is transparent, you continue to use `AutoTokenizer.from_pretrained()` as before. This allows transformers to be future-proof and modular to easily support future backends. + + +### API Changes + +**1. Direct tokenizer initialization with vocab and merges:** + +In v5, you can now initialize tokenizers directly with vocabulary and merges, enabling training custom tokenizers from scratch: + +```python +# v5: Initialize a blank tokenizer for training +from transformers import LlamaTokenizer + +# Create a tokenizer with custom vocabulary and merges +vocab = {"": 0, "": 1, "": 2, "hello": 3, "world": 4} +merges = [("h", "e"), ("l", "l"), ("o", " ")] + +tokenizer = LlamaTokenizer(vocab=vocab, merges=merges) + +# Or initialize a blank tokenizer to train on your own dataset +tokenizer = LlamaTokenizer() # Creates a blank Llama-like tokenizer +``` +But you can no longer pass a vocab file. As this accounts for `from_pretrained` use-case. + +**2. Simplified decoding API:** + +The `batch_decode` method has been unified with `decode`. Both single and batch decoding now use the same method: +```python +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("t5-small") +inputs = ["hey how are you?", "fine"] +tokenizer.decode(tokenizer.encode(inputs)) +``` +Gives: +```diff +- 'hey how are you? fine' ++ ['hey how are you?', 'fine'] +``` + +This is mostly because people get `list[list[int]]` out of `generate`, but then they would use `decode` because they use `encode` and would get: +```python + ...: tokenizer.decode([[1,2], [1,4]]) +--------------------------------------------------------------------------- +TypeError Traceback (most recent call last) +Cell In[2], line 4 + 2 tokenizer = AutoTokenizer.from_pretrained("t5-small") + 3 inputs = ["hey how are you?", "fine"] +----> 4 tokenizer.decode([[1,2], [1,4]]) + +File /raid/arthur/transformers/src/transformers/tokenization_utils_base.py:3948, in PreTrainedTokenizerBase.decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs) + 3945 # Convert inputs to python lists + 3946 token_ids = to_py_obj(token_ids) +-> 3948 return self._decode( + 3949 token_ids=token_ids, + 3950 skip_special_tokens=skip_special_tokens, + 3951 clean_up_tokenization_spaces=clean_up_tokenization_spaces, + 3952 **kwargs, + 3953 ) + +File /raid/arthur/transformers/src/transformers/tokenization_utils_fast.py:682, in PreTrainedTokenizerFast._decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs) + 680 if isinstance(token_ids, int): + 681 token_ids = [token_ids] +--> 682 text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) + 684 clean_up_tokenization_spaces = ( + 685 clean_up_tokenization_spaces + 686 if clean_up_tokenization_spaces is not None + 687 else self.clean_up_tokenization_spaces + 688 ) + 689 if clean_up_tokenization_spaces: + +TypeError: argument 'ids': 'list' object cannot be interpreted as an integer +``` + +**3. Unified encoding API:** + +The `encode_plus` is deprecated → call directly with `__call__` + +**3. `apply_chat_template` returns `BatchEncoding`:** + +Previously, `apply_chat_template` returned `input_ids` for backward compatibility. In v5, it now consistently returns a `BatchEncoding` dict like other tokenizer methods: + +```python +# v5 +messages = [ + {"role": "user", "content": "Hello!"}, + {"role": "assistant", "content": "Hi there!"} +] + +# Now returns BatchEncoding with input_ids, attention_mask, etc. +outputs = tokenizer.apply_chat_template(messages, return_tensors="pt") +print(outputs.keys()) # dict_keys(['input_ids', 'attention_mask']) +``` + +#### Removed legacy configuration file saving: + +- `special_tokens_map.json` - special tokens are now stored in `tokenizer_config.json`. +- `added_tokens.json` - added tokens are now stored in `tokenizer.json`. +- `added_tokens_decoder` is only stored when there is no `tokenizer.json`. + +When loading older tokenizers, these files are still read for backward compatibility, but new saves use the consolidated format. + +### Model-Specific Changes + +Several models that had identical tokenizers now import from their base implementation: + +- **LayoutLM** → uses BertTokenizer +- **LED** → uses BartTokenizer +- **Longformer** → uses RobertaTokenizer +- **LXMert** → uses BertTokenizer +- **MT5** → uses T5Tokenizer +- **MVP** → uses BartTokenizer + +We're just gonna remove these files at term. + +**Removed T5-specific workarounds:** + +The internal `_eventually_correct_t5_max_length` method has been removed. T5 tokenizers now handle max length consistently with other models. + +### Testing Changes + +Model-specific tokenization test files now focus on integration tests. +Common tokenization API tests (e.g., `add_tokens`, `encode`, `decode`) are now centralized and automatically applied across all tokenizers. This reduces test duplication and ensures consistent behavior + + +For legacy implementations, the original BERT Python tokenizer code (including `WhitespaceTokenizer`, `BasicTokenizer`, etc.) is preserved in `bert_legacy.py` for reference purposes. + +**Linked PRs:** +- https://github.com/huggingface/transformers/issues/40938 +- https://github.com/huggingface/transformers/pull/40936 +- https://github.com/huggingface/transformers/pull/41626 + + ## Library-wide changes with lesser impact ### `use_auth_token` diff --git a/docs/source/en/internal/tokenization_utils.md b/docs/source/en/internal/tokenization_utils.md index 5aa650991760..ba2a69552a22 100644 --- a/docs/source/en/internal/tokenization_utils.md +++ b/docs/source/en/internal/tokenization_utils.md @@ -18,8 +18,7 @@ rendered properly in your Markdown viewer. This page lists all the utility functions used by the tokenizers, mainly the class [`~tokenization_utils_base.PreTrainedTokenizerBase`] that implements the common methods between -[`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] and the mixin -[`~tokenization_utils_base.SpecialTokensMixin`]. +[`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`]. Most of those are only useful if you are studying the code of the tokenizers in the library. @@ -29,9 +28,6 @@ Most of those are only useful if you are studying the code of the tokenizers in - __call__ - all -## SpecialTokensMixin - -[[autodoc]] tokenization_utils_base.SpecialTokensMixin ## Enums and namedtuples diff --git a/docs/source/en/main_classes/tokenizer.md b/docs/source/en/main_classes/tokenizer.md index 3f16bfbfeda5..76492abe464b 100644 --- a/docs/source/en/main_classes/tokenizer.md +++ b/docs/source/en/main_classes/tokenizer.md @@ -28,8 +28,7 @@ The base classes [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] implement the common methods for encoding string inputs in model inputs (see below) and instantiating/saving python and "Fast" tokenizers either from a local file or directory or from a pretrained tokenizer provided by the library (downloaded from HuggingFace's AWS S3 repository). They both rely on -[`~tokenization_utils_base.PreTrainedTokenizerBase`] that contains the common methods, and -[`~tokenization_utils_base.SpecialTokensMixin`]. +[`~tokenization_utils_base.PreTrainedTokenizerBase`] that contains the common methods. [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] thus implement the main methods for using all the tokenizers: @@ -98,6 +97,18 @@ loaded very simply into 🤗 transformers. Take a look at the [Using tokenizers - push_to_hub - all +## PythonBackend + +[[autodoc]] PythonBackend + +## TokenizersBackend + +[[autodoc]] TokenizersBackend + +## SentencePieceBackend + +[[autodoc]] SentencePieceBackend + ## BatchEncoding [[autodoc]] BatchEncoding diff --git a/docs/source/en/model_doc/bert.md b/docs/source/en/model_doc/bert.md index 97637e98e1f3..fdda4ef243a4 100644 --- a/docs/source/en/model_doc/bert.md +++ b/docs/source/en/model_doc/bert.md @@ -100,11 +100,13 @@ echo -e "Plants create [MASK] through a process known as photosynthesis." | tran ## BertTokenizer [[autodoc]] BertTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary +## BertTokenizerLegacy + +[[autodoc]] BertTokenizerLegacy + ## BertTokenizerFast [[autodoc]] BertTokenizerFast diff --git a/docs/source/en/model_doc/big_bird.md b/docs/source/en/model_doc/big_bird.md index c3137725814a..1c308e8887be 100644 --- a/docs/source/en/model_doc/big_bird.md +++ b/docs/source/en/model_doc/big_bird.md @@ -104,9 +104,7 @@ print(f"The predicted token is: {predicted_token}") ## BigBirdTokenizer [[autodoc]] BigBirdTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## BigBirdTokenizerFast diff --git a/docs/source/en/model_doc/blenderbot-small.md b/docs/source/en/model_doc/blenderbot-small.md index 830db710e039..fd44fd2a176c 100644 --- a/docs/source/en/model_doc/blenderbot-small.md +++ b/docs/source/en/model_doc/blenderbot-small.md @@ -68,9 +68,7 @@ the left. ## BlenderbotSmallTokenizer [[autodoc]] BlenderbotSmallTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## BlenderbotSmallTokenizerFast diff --git a/docs/source/en/model_doc/blenderbot.md b/docs/source/en/model_doc/blenderbot.md index 168c744235d8..c7ed00797803 100644 --- a/docs/source/en/model_doc/blenderbot.md +++ b/docs/source/en/model_doc/blenderbot.md @@ -84,12 +84,10 @@ An example: ## BlenderbotTokenizer [[autodoc]] BlenderbotTokenizer - - build_inputs_with_special_tokens ## BlenderbotTokenizerFast [[autodoc]] BlenderbotTokenizerFast - - build_inputs_with_special_tokens ## BlenderbotModel diff --git a/docs/source/en/model_doc/bloom.md b/docs/source/en/model_doc/bloom.md index 51e2970c25f6..a234211d095d 100644 --- a/docs/source/en/model_doc/bloom.md +++ b/docs/source/en/model_doc/bloom.md @@ -63,10 +63,6 @@ See also: [[autodoc]] BloomConfig - all -## BloomTokenizerFast - -[[autodoc]] BloomTokenizerFast - - all ## BloomModel diff --git a/docs/source/en/model_doc/camembert.md b/docs/source/en/model_doc/camembert.md index 8affbd73a570..832ed26e66d1 100644 --- a/docs/source/en/model_doc/camembert.md +++ b/docs/source/en/model_doc/camembert.md @@ -122,9 +122,7 @@ print(f"The predicted token is: {predicted_token}") ## CamembertTokenizer [[autodoc]] CamembertTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## CamembertTokenizerFast diff --git a/docs/source/en/model_doc/clip.md b/docs/source/en/model_doc/clip.md index 529194d32a37..b58502a2b453 100644 --- a/docs/source/en/model_doc/clip.md +++ b/docs/source/en/model_doc/clip.md @@ -99,9 +99,7 @@ print(f"Most likely label: {most_likely_label} with probability: {probs[0][most_ ## CLIPTokenizer [[autodoc]] CLIPTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## CLIPTokenizerFast diff --git a/docs/source/en/model_doc/code_llama.md b/docs/source/en/model_doc/code_llama.md index a46e1f05b32a..21773c59ae47 100644 --- a/docs/source/en/model_doc/code_llama.md +++ b/docs/source/en/model_doc/code_llama.md @@ -167,16 +167,12 @@ visualizer("""def func(a, b): ## CodeLlamaTokenizer [[autodoc]] CodeLlamaTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## CodeLlamaTokenizerFast [[autodoc]] CodeLlamaTokenizerFast - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - update_post_processor - save_vocabulary diff --git a/docs/source/en/model_doc/codegen.md b/docs/source/en/model_doc/codegen.md index c341154921e3..0f09933f6351 100644 --- a/docs/source/en/model_doc/codegen.md +++ b/docs/source/en/model_doc/codegen.md @@ -77,7 +77,6 @@ hello_world() ## CodeGenTokenizer [[autodoc]] CodeGenTokenizer - - create_token_type_ids_from_sequences - save_vocabulary ## CodeGenTokenizerFast diff --git a/docs/source/en/model_doc/cohere.md b/docs/source/en/model_doc/cohere.md index 022a178b5cfa..05285413cc87 100644 --- a/docs/source/en/model_doc/cohere.md +++ b/docs/source/en/model_doc/cohere.md @@ -129,14 +129,10 @@ visualizer("Plants create energy through a process known as") [[autodoc]] CohereConfig -## CohereTokenizerFast - -[[autodoc]] CohereTokenizerFast - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - update_post_processor - - save_vocabulary +## CohereTokenizer + +[[autodoc]] CohereTokenizer + ## CohereModel diff --git a/docs/source/en/model_doc/convbert.md b/docs/source/en/model_doc/convbert.md index cabeb7140f0b..9efe1b9af958 100644 --- a/docs/source/en/model_doc/convbert.md +++ b/docs/source/en/model_doc/convbert.md @@ -62,9 +62,7 @@ ConvBERT training tips are similar to those of BERT. For usage tips refer to [BE ## ConvBertTokenizer [[autodoc]] ConvBertTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## ConvBertTokenizerFast diff --git a/docs/source/en/model_doc/deberta-v2.md b/docs/source/en/model_doc/deberta-v2.md index 6ec0c0e51176..d7a6fb45c6e2 100644 --- a/docs/source/en/model_doc/deberta-v2.md +++ b/docs/source/en/model_doc/deberta-v2.md @@ -125,16 +125,12 @@ print(f"Predicted label: {predicted_label}") ## DebertaV2Tokenizer [[autodoc]] DebertaV2Tokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## DebertaV2TokenizerFast [[autodoc]] DebertaV2TokenizerFast - - build_inputs_with_special_tokens - - create_token_type_ids_from_sequences ## DebertaV2Model diff --git a/docs/source/en/model_doc/deberta.md b/docs/source/en/model_doc/deberta.md index 08be80c19ff0..d9432fae366b 100644 --- a/docs/source/en/model_doc/deberta.md +++ b/docs/source/en/model_doc/deberta.md @@ -104,16 +104,12 @@ echo -e '{"text": "A soccer game with multiple people playing.", "text_pair": "S ## DebertaTokenizer [[autodoc]] DebertaTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## DebertaTokenizerFast [[autodoc]] DebertaTokenizerFast - - build_inputs_with_special_tokens - - create_token_type_ids_from_sequences ## DebertaModel diff --git a/docs/source/en/model_doc/fnet.md b/docs/source/en/model_doc/fnet.md index e89a410b105b..973a4e8488ae 100644 --- a/docs/source/en/model_doc/fnet.md +++ b/docs/source/en/model_doc/fnet.md @@ -65,9 +65,7 @@ sequence length for fine-tuning and inference. ## FNetTokenizer [[autodoc]] FNetTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## FNetTokenizerFast diff --git a/docs/source/en/model_doc/funnel.md b/docs/source/en/model_doc/funnel.md index 57b011b9400c..ff731b7589ba 100644 --- a/docs/source/en/model_doc/funnel.md +++ b/docs/source/en/model_doc/funnel.md @@ -74,9 +74,7 @@ This model was contributed by [sgugger](https://huggingface.co/sgugger). The ori ## FunnelTokenizer [[autodoc]] FunnelTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## FunnelTokenizerFast diff --git a/docs/source/en/model_doc/gpt_neox.md b/docs/source/en/model_doc/gpt_neox.md index b777fdd32014..53934a045fad 100644 --- a/docs/source/en/model_doc/gpt_neox.md +++ b/docs/source/en/model_doc/gpt_neox.md @@ -171,6 +171,10 @@ following speedups during training and inference. [[autodoc]] GPTNeoXConfig +## GPTNeoXTokenizer + +[[autodoc]] GPTNeoXTokenizer + ## GPTNeoXTokenizerFast [[autodoc]] GPTNeoXTokenizerFast diff --git a/docs/source/en/model_doc/led.md b/docs/source/en/model_doc/led.md index ce1baa619a88..9090b1f9dfef 100644 --- a/docs/source/en/model_doc/led.md +++ b/docs/source/en/model_doc/led.md @@ -152,9 +152,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True)) ## LEDTokenizer [[autodoc]] LEDTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## LEDTokenizerFast diff --git a/docs/source/en/model_doc/llama.md b/docs/source/en/model_doc/llama.md index 20f483f64123..f753445572ef 100644 --- a/docs/source/en/model_doc/llama.md +++ b/docs/source/en/model_doc/llama.md @@ -131,17 +131,13 @@ visualizer("Plants create energy through a process known as") ## LlamaTokenizer [[autodoc]] LlamaTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## LlamaTokenizerFast [[autodoc]] LlamaTokenizerFast - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - update_post_processor - save_vocabulary diff --git a/docs/source/en/model_doc/llama2.md b/docs/source/en/model_doc/llama2.md index c66667f235f6..db0349885848 100644 --- a/docs/source/en/model_doc/llama2.md +++ b/docs/source/en/model_doc/llama2.md @@ -147,17 +147,13 @@ visualizer("Plants create energy through a process known as") ## LlamaTokenizer [[autodoc]] LlamaTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## LlamaTokenizerFast [[autodoc]] LlamaTokenizerFast - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - update_post_processor - save_vocabulary diff --git a/docs/source/en/model_doc/mbart.md b/docs/source/en/model_doc/mbart.md index 93b74d7b31b8..6a0b72e5de53 100644 --- a/docs/source/en/model_doc/mbart.md +++ b/docs/source/en/model_doc/mbart.md @@ -120,7 +120,6 @@ print(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)) ## MBartTokenizer [[autodoc]] MBartTokenizer - - build_inputs_with_special_tokens ## MBartTokenizerFast diff --git a/docs/source/en/model_doc/mistral.md b/docs/source/en/model_doc/mistral.md index 4c598fc79a71..3534ff26c427 100644 --- a/docs/source/en/model_doc/mistral.md +++ b/docs/source/en/model_doc/mistral.md @@ -136,9 +136,9 @@ Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/bl [[autodoc]] MistralConfig -## MistralCommonTokenizer +## MistralCommonBackend -[[autodoc]] MistralCommonTokenizer +[[autodoc]] MistralCommonBackend ## MistralModel diff --git a/docs/source/en/model_doc/mistral3.md b/docs/source/en/model_doc/mistral3.md index 774fda68852e..7911200ad744 100644 --- a/docs/source/en/model_doc/mistral3.md +++ b/docs/source/en/model_doc/mistral3.md @@ -242,9 +242,9 @@ messages = [ [[autodoc]] Mistral3Config -## MistralCommonTokenizer +## MistralCommonBackend -[[autodoc]] MistralCommonTokenizer +[[autodoc]] MistralCommonBackend ## Mistral3Model diff --git a/docs/source/en/model_doc/mixtral.md b/docs/source/en/model_doc/mixtral.md index f247a20f656f..4167a68063f7 100644 --- a/docs/source/en/model_doc/mixtral.md +++ b/docs/source/en/model_doc/mixtral.md @@ -198,9 +198,9 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] MixtralConfig -## MistralCommonTokenizer +## MistralCommonBackend -[[autodoc]] MistralCommonTokenizer +[[autodoc]] MistralCommonBackend ## MixtralModel diff --git a/docs/source/en/model_doc/mpnet.md b/docs/source/en/model_doc/mpnet.md index 08a150146518..938ce6219af0 100644 --- a/docs/source/en/model_doc/mpnet.md +++ b/docs/source/en/model_doc/mpnet.md @@ -64,9 +64,7 @@ separate your segments with the separation token `tokenizer.sep_token` (or `[sep ## MPNetTokenizer [[autodoc]] MPNetTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## MPNetTokenizerFast diff --git a/docs/source/en/model_doc/mt5.md b/docs/source/en/model_doc/mt5.md index 4e652458e1b3..eda5fd867dd6 100644 --- a/docs/source/en/model_doc/mt5.md +++ b/docs/source/en/model_doc/mt5.md @@ -127,18 +127,6 @@ print(tokenizer.decode(output[0], skip_special_tokens=True)) [[autodoc]] MT5Config -## MT5Tokenizer - -[[autodoc]] MT5Tokenizer - -See [`T5Tokenizer`] for all details. - -## MT5TokenizerFast - -[[autodoc]] MT5TokenizerFast - -See [`T5TokenizerFast`] for all details. - ## MT5Model [[autodoc]] MT5Model diff --git a/docs/source/en/model_doc/nllb.md b/docs/source/en/model_doc/nllb.md index 29f9cfe873aa..dcb6e5d57031 100644 --- a/docs/source/en/model_doc/nllb.md +++ b/docs/source/en/model_doc/nllb.md @@ -151,7 +151,6 @@ visualizer("UN Chief says there is no military solution in Syria") ## NllbTokenizer [[autodoc]] NllbTokenizer - - build_inputs_with_special_tokens ## NllbTokenizerFast diff --git a/docs/source/en/model_doc/nougat.md b/docs/source/en/model_doc/nougat.md index 4025fac002c3..41ee4757b34a 100644 --- a/docs/source/en/model_doc/nougat.md +++ b/docs/source/en/model_doc/nougat.md @@ -111,6 +111,10 @@ The model is identical to [Donut](donut) in terms of architecture. [[autodoc]] NougatImageProcessorFast - preprocess +## NougatTokenizer + +[[autodoc]] NougatTokenizer + ## NougatTokenizerFast [[autodoc]] NougatTokenizerFast diff --git a/docs/source/en/model_doc/pixtral.md b/docs/source/en/model_doc/pixtral.md index bb175973bd23..548058c3ec18 100644 --- a/docs/source/en/model_doc/pixtral.md +++ b/docs/source/en/model_doc/pixtral.md @@ -138,9 +138,9 @@ print(output) [[autodoc]] PixtralVisionConfig -## MistralCommonTokenizer +## MistralCommonBackend -[[autodoc]] MistralCommonTokenizer +[[autodoc]] MistralCommonBackend ## PixtralVisionModel diff --git a/docs/source/en/model_doc/rembert.md b/docs/source/en/model_doc/rembert.md index 7a1f9930d1f1..237316dc4a31 100644 --- a/docs/source/en/model_doc/rembert.md +++ b/docs/source/en/model_doc/rembert.md @@ -62,17 +62,13 @@ also similar to the Albert one rather than the BERT one. ## RemBertTokenizer [[autodoc]] RemBertTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## RemBertTokenizerFast [[autodoc]] RemBertTokenizerFast - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## RemBertModel diff --git a/docs/source/en/model_doc/roberta.md b/docs/source/en/model_doc/roberta.md index 896156520c5d..0a285eba9abb 100644 --- a/docs/source/en/model_doc/roberta.md +++ b/docs/source/en/model_doc/roberta.md @@ -99,15 +99,12 @@ echo -e "Plants create through a process known as photosynthesis." | tran ## RobertaTokenizer [[autodoc]] RobertaTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## RobertaTokenizerFast [[autodoc]] RobertaTokenizerFast - - build_inputs_with_special_tokens ## RobertaModel diff --git a/docs/source/en/model_doc/seamless_m4t.md b/docs/source/en/model_doc/seamless_m4t.md index 8415c94f8501..86b0293c4b9e 100644 --- a/docs/source/en/model_doc/seamless_m4t.md +++ b/docs/source/en/model_doc/seamless_m4t.md @@ -176,9 +176,7 @@ This model was contributed by [ylacombe](https://huggingface.co/ylacombe). The o [[autodoc]] SeamlessM4TTokenizer - __call__ - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## SeamlessM4TTokenizerFast diff --git a/docs/source/en/model_doc/splinter.md b/docs/source/en/model_doc/splinter.md index c3ef982da915..83e500cd79e2 100644 --- a/docs/source/en/model_doc/splinter.md +++ b/docs/source/en/model_doc/splinter.md @@ -67,9 +67,7 @@ This model was contributed by [yuvalkirstain](https://huggingface.co/yuvalkirsta ## SplinterTokenizer [[autodoc]] SplinterTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## SplinterTokenizerFast diff --git a/docs/source/en/model_doc/squeezebert.md b/docs/source/en/model_doc/squeezebert.md index 70e409daf717..251874ae9a12 100644 --- a/docs/source/en/model_doc/squeezebert.md +++ b/docs/source/en/model_doc/squeezebert.md @@ -70,9 +70,7 @@ This model was contributed by [forresti](https://huggingface.co/forresti). ## SqueezeBertTokenizer [[autodoc]] SqueezeBertTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## SqueezeBertTokenizerFast diff --git a/docs/source/en/model_doc/t5.md b/docs/source/en/model_doc/t5.md index 00c8c418527d..ef62fff9d21b 100644 --- a/docs/source/en/model_doc/t5.md +++ b/docs/source/en/model_doc/t5.md @@ -118,9 +118,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True)) ## T5Tokenizer [[autodoc]] T5Tokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## T5TokenizerFast diff --git a/docs/source/en/model_doc/whisper.md b/docs/source/en/model_doc/whisper.md index 0b63660ddaa6..088c5e1b2462 100644 --- a/docs/source/en/model_doc/whisper.md +++ b/docs/source/en/model_doc/whisper.md @@ -100,9 +100,7 @@ transcription[0] [[autodoc]] WhisperTokenizer - set_prefix_tokens - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary - batch_decode - decode @@ -113,9 +111,7 @@ transcription[0] [[autodoc]] WhisperTokenizerFast - set_prefix_tokens - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary - batch_decode - decode diff --git a/docs/source/en/model_doc/xglm.md b/docs/source/en/model_doc/xglm.md index 370055c90ea0..6e5c1648cd46 100644 --- a/docs/source/en/model_doc/xglm.md +++ b/docs/source/en/model_doc/xglm.md @@ -57,9 +57,7 @@ This model was contributed by [Suraj](https://huggingface.co/valhalla). The orig ## XGLMTokenizer [[autodoc]] XGLMTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## XGLMTokenizerFast diff --git a/docs/source/en/model_doc/xlm-roberta.md b/docs/source/en/model_doc/xlm-roberta.md index 0e9867636892..a57210cd8351 100644 --- a/docs/source/en/model_doc/xlm-roberta.md +++ b/docs/source/en/model_doc/xlm-roberta.md @@ -179,9 +179,7 @@ This implementation is the same as RoBERTa. Refer to the [documentation of RoBER ## XLMRobertaTokenizer [[autodoc]] XLMRobertaTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## XLMRobertaTokenizerFast diff --git a/docs/source/en/model_doc/xlnet.md b/docs/source/en/model_doc/xlnet.md index b0e5ef1ed08f..c9a06977b985 100644 --- a/docs/source/en/model_doc/xlnet.md +++ b/docs/source/en/model_doc/xlnet.md @@ -69,9 +69,7 @@ This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The o ## XLNetTokenizer [[autodoc]] XLNetTokenizer - - build_inputs_with_special_tokens - get_special_tokens_mask - - create_token_type_ids_from_sequences - save_vocabulary ## XLNetTokenizerFast diff --git a/docs/source/ja/internal/tokenization_utils.md b/docs/source/ja/internal/tokenization_utils.md index 8e36e4149e27..e8f44090074b 100644 --- a/docs/source/ja/internal/tokenization_utils.md +++ b/docs/source/ja/internal/tokenization_utils.md @@ -18,8 +18,7 @@ rendered properly in your Markdown viewer. このページには、トークナイザーによって使用されるすべてのユーティリティ関数 (主にクラス) がリストされます。 [`~tokenization_utils_base.PreTrainedTokenizerBase`] 間の共通メソッドを実装します。 -[`PreTrainedTokenizer`] と [`PreTrainedTokenizerFast`] およびミックスイン -[`~tokenization_utils_base.SpecialTokensMixin`]。 +[`PreTrainedTokenizer`] と [`PreTrainedTokenizerFast`] およびミックスイン。 これらのほとんどは、ライブラリ内のトークナイザーのコードを学習する場合にのみ役に立ちます。 @@ -29,10 +28,6 @@ rendered properly in your Markdown viewer. - __call__ - all -## SpecialTokensMixin - -[[autodoc]] tokenization_utils_base.SpecialTokensMixin - ## Enums and namedtuples [[autodoc]] tokenization_utils_base.TruncationStrategy diff --git a/docs/source/ja/main_classes/tokenizer.md b/docs/source/ja/main_classes/tokenizer.md index 1cf5885bc812..99eebc4a96e6 100644 --- a/docs/source/ja/main_classes/tokenizer.md +++ b/docs/source/ja/main_classes/tokenizer.md @@ -28,8 +28,7 @@ Rust ライブラリ [🤗 Tokenizers](https://github.com/huggingface/tokenizers モデル入力の文字列入力をエンコードし (以下を参照)、Python をインスタンス化/保存するための一般的なメソッドを実装します。 ローカル ファイルまたはディレクトリ、またはライブラリによって提供される事前トレーニング済みトークナイザーからの「高速」トークナイザー (HuggingFace の AWS S3 リポジトリからダウンロード)。二人とも頼りにしているのは、 -共通メソッドを含む [`~tokenization_utils_base.PreTrainedTokenizerBase`] -[`~tokenization_utils_base.SpecialTokensMixin`]。 +共通メソッドを含む [`~tokenization_utils_base.PreTrainedTokenizerBase`]。 したがって、[`PreTrainedTokenizer`] と [`PreTrainedTokenizerFast`] はメインを実装します。 すべてのトークナイザーを使用するためのメソッド: diff --git a/docs/source/ja/model_doc/bloom.md b/docs/source/ja/model_doc/bloom.md index 26d60ae7e5bb..d9f0e28eaf4e 100644 --- a/docs/source/ja/model_doc/bloom.md +++ b/docs/source/ja/model_doc/bloom.md @@ -56,12 +56,6 @@ BLOOM を使い始めるのに役立つ公式 Hugging Face およびコミュニ [[autodoc]] BloomConfig - all -## BloomTokenizerFast - -[[autodoc]] BloomTokenizerFast - - all - - ## BloomModel diff --git a/docs/source/ko/internal/tokenization_utils.md b/docs/source/ko/internal/tokenization_utils.md index b5b69910479a..561048127d1a 100644 --- a/docs/source/ko/internal/tokenization_utils.md +++ b/docs/source/ko/internal/tokenization_utils.md @@ -16,7 +16,7 @@ rendered properly in your Markdown viewer. # 토크나이저를 위한 유틸리티 [[utilities-for-tokenizers]] -이 페이지는 토크나이저에서 사용되는 모든 유틸리티 함수들을 나열하며, 주로 [`PreTrainedTokenizer`]와 [`PreTrainedTokenizerFast`] 사이의 공통 메소드를 구현하는 [`~tokenization_utils_base.PreTrainedTokenizerBase`] 클래스와 [`~tokenization_utils_base.SpecialTokensMixin`]을 다룹니다. +이 페이지는 토크나이저에서 사용되는 모든 유틸리티 함수들을 나열하며, 주로 [`PreTrainedTokenizer`]와 [`PreTrainedTokenizerFast`] 사이의 공통 메소드를 구현하는 [`~tokenization_utils_base.PreTrainedTokenizerBase`] 클래스 을 다룹니다. 이 함수들 대부분은 라이브러리의 토크나이저 코드를 연구할 때만 유용합니다. @@ -26,9 +26,6 @@ rendered properly in your Markdown viewer. - __call__ - all -## SpecialTokensMixin [[transformers.SpecialTokensMixin]] - -[[autodoc]] tokenization_utils_base.SpecialTokensMixin ## Enums 및 namedtuples [[transformers.tokenization_utils_base.TruncationStrategy]] diff --git a/docs/source/ko/main_classes/tokenizer.md b/docs/source/ko/main_classes/tokenizer.md index f87edbebeb00..307e34c83a11 100644 --- a/docs/source/ko/main_classes/tokenizer.md +++ b/docs/source/ko/main_classes/tokenizer.md @@ -21,7 +21,7 @@ rendered properly in your Markdown viewer. 1. 특히 배치 토큰화를 수행할 때 속도가 크게 향상됩니다. 2. 원본 문자열(문자 및 단어)과 토큰 공간 사이를 매핑하는 추가적인 메소드를 제공합니다. (예: 특정 문자를 포함하는 토큰의 인덱스를 얻거나, 특정 토큰에 해당하는 문자 범위를 가져오는 등). -기본 클래스인 [`PreTrainedTokenizer`]와 [`PreTrainedTokenizerFast`]는 문자열 입력을 인코딩하는 메소드를 구현하며(아래 참조), 로컬 파일이나 디렉토리, 또는 라이브러리에서 제공하는 사전 훈련된 토크나이저(HuggingFace의 AWS S3 저장소에서 다운로드된)로부터 파이썬 및 "Fast" 토크나이저를 인스턴스화하거나 저장하는 기능을 제공합니다. 이 두 클래스는 공통 메소드를 포함하는 [`~tokenization_utils_base.PreTrainedTokenizerBase`]와 [`~tokenization_utils_base.SpecialTokensMixin`]에 의존합니다. +기본 클래스인 [`PreTrainedTokenizer`]와 [`PreTrainedTokenizerFast`]는 문자열 입력을 인코딩하는 메소드를 구현하며(아래 참조), 로컬 파일이나 디렉토리, 또는 라이브러리에서 제공하는 사전 훈련된 토크나이저(HuggingFace의 AWS S3 저장소에서 다운로드된)로부터 파이썬 및 "Fast" 토크나이저를 인스턴스화하거나 저장하는 기능을 제공합니다. 이 두 클래스는 공통 메소드를 포함하는 [`~tokenization_utils_base.PreTrainedTokenizerBase`]에 의존합니다. [`PreTrainedTokenizer`]와 [`PreTrainedTokenizerFast`]는 모든 토크나이저에서 사용되는 주요 메소드들을 구현합니다: diff --git a/docs/source/ko/model_doc/cohere.md b/docs/source/ko/model_doc/cohere.md index b53738ded860..3c98a028064b 100644 --- a/docs/source/ko/model_doc/cohere.md +++ b/docs/source/ko/model_doc/cohere.md @@ -115,15 +115,7 @@ print(gen_text) ## CohereConfig[[transformers.CohereConfig]] [[autodoc]] CohereConfig - -## CohereTokenizerFast[[transformers.CohereTokenizerFast]] - -[[autodoc]] CohereTokenizerFast - - build_inputs_with_special_tokens - - get_special_tokens_mask - - create_token_type_ids_from_sequences - - update_post_processor - - save_vocabulary +ave_vocabulary ## CohereModel[[transformers.CohereModel]] diff --git a/docs/source/zh/internal/tokenization_utils.md b/docs/source/zh/internal/tokenization_utils.md index 9f216131c122..9bfb3ee5a98a 100644 --- a/docs/source/zh/internal/tokenization_utils.md +++ b/docs/source/zh/internal/tokenization_utils.md @@ -30,9 +30,6 @@ rendered properly in your Markdown viewer. - __call__ - all -## SpecialTokensMixin - -[[autodoc]] tokenization_utils_base.SpecialTokensMixin ## Enums和namedtuples(命名元组) diff --git a/docs/source/zh/main_classes/tokenizer.md b/docs/source/zh/main_classes/tokenizer.md index f89fc20b53d1..6b016fbece72 100644 --- a/docs/source/zh/main_classes/tokenizer.md +++ b/docs/source/zh/main_classes/tokenizer.md @@ -21,7 +21,7 @@ tokenizer负责准备输入以供模型使用。该库包含所有模型的token 1. 在批量分词时显著提速 2. 在原始字符串(字符和单词)和token空间之间进行映射的其他方法(例如,获取包含给定字符的token的索引或与给定token对应的字符范围)。 -基类 [PreTrainedTokenizer] 和 [PreTrained TokenizerFast] 实现了在模型输入中编码字符串输入的常用方法(见下文),并从本地文件或目录或从库提供的预训练的 tokenizer(从 HuggingFace 的 AWS S3 存储库下载)实例化/保存 python 和“Fast” tokenizer。它们都依赖于包含常用方法的 [`~tokenization_utils_base.PreTrainedTokenizerBase`]和[`~tokenization_utils_base.SpecialTokensMixin`]。 +基类 [PreTrainedTokenizer] 和 [PreTrained TokenizerFast] 实现了在模型输入中编码字符串输入的常用方法(见下文),并从本地文件或目录或从库提供的预训练的 tokenizer(从 HuggingFace 的 AWS S3 存储库下载)实例化/保存 python 和“Fast” tokenizer。它们都依赖于包含常用方法的 [`~tokenization_utils_base.PreTrainedTokenizerBase`]。 因此,[`PreTrainedTokenizer`] 和 [`PreTrainedTokenizerFast`] 实现了使用所有tokenizers的主要方法: diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 92d2cfd1949b..ac6136fe4c12 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -38,7 +38,6 @@ DataCollatorWithPadding, EvalPrediction, HfArgumentParser, - PreTrainedTokenizerFast, TrainingArguments, default_data_collator, set_seed, @@ -335,7 +334,8 @@ def main(): ) # Tokenizer check: this script requires a fast tokenizer. - if not isinstance(tokenizer, PreTrainedTokenizerFast): + # Check if tokenizer has _tokenizer attribute (from tokenizers library) or is_fast property + if not (hasattr(tokenizer, "_tokenizer") or getattr(tokenizer, "is_fast", False)): raise TypeError( "This example script only works for models that have a fast tokenizer. Check out the big table of models at" " https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet" diff --git a/examples/pytorch/text-generation/run_generation.py b/examples/pytorch/text-generation/run_generation.py index 1552897ecd47..9d9477f8f099 100755 --- a/examples/pytorch/text-generation/run_generation.py +++ b/examples/pytorch/text-generation/run_generation.py @@ -37,7 +37,6 @@ from transformers import ( AutoTokenizer, BloomForCausalLM, - BloomTokenizerFast, CTRLLMHeadModel, CTRLTokenizer, GenerationMixin, @@ -72,7 +71,7 @@ "xlnet": (XLNetLMHeadModel, XLNetTokenizer), "xlm": (XLMWithLMHeadModel, XLMTokenizer), "gptj": (GPTJForCausalLM, AutoTokenizer), - "bloom": (BloomForCausalLM, BloomTokenizerFast), + "bloom": (BloomForCausalLM, AutoTokenizer), "llama": (LlamaForCausalLM, AutoTokenizer), "opt": (OPTForCausalLM, GPT2Tokenizer), } diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index bf172eb2567f..2f1ab1a1a090 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -49,7 +49,6 @@ DataCollatorForTokenClassification, HfArgumentParser, PreTrainedConfig, - PreTrainedTokenizerFast, Trainer, TrainingArguments, set_seed, @@ -389,7 +388,8 @@ def get_label_list(labels): ) # Tokenizer check: this script requires a fast tokenizer. - if not isinstance(tokenizer, PreTrainedTokenizerFast): + # Check if tokenizer has _tokenizer attribute (from tokenizers library) or is_fast property + if not (hasattr(tokenizer, "_tokenizer") or getattr(tokenizer, "is_fast", False)): raise TypeError( "This example script only works for models that have a fast tokenizer. Check out the big table of models at" " https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet" diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index e1d3c4ca387a..c69b0c03847d 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -60,6 +60,7 @@ default_data_collator, set_seed, ) +from transformers.tokenization_utils_sentencepiece import SentencePieceBackend from transformers.utils import check_min_version from transformers.utils.versions import require_version @@ -403,7 +404,9 @@ def main(): model.resize_token_embeddings(len(tokenizer)) # Set decoder_start_token_id - if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): + if model.config.decoder_start_token_id is None and isinstance( + tokenizer, (MBartTokenizer, MBartTokenizerFast, SentencePieceBackend) + ): if isinstance(tokenizer, MBartTokenizer): model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.target_lang] else: diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 07c616beb9a2..6dad9d7c05cb 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -58,6 +58,7 @@ # Base objects, independent of any specific backend _import_structure = { "audio_utils": [], + "cli": [], "configuration_utils": ["PreTrainedConfig", "PretrainedConfig"], "convert_slow_tokenizers_checkpoints_to_fast": [], "data": [ @@ -172,13 +173,13 @@ "processing_utils": ["ProcessorMixin"], "quantizers": [], "testing_utils": [], - "tokenization_utils": ["PreTrainedTokenizer"], + "tokenization_python": ["PreTrainedTokenizer", "PythonBackend"], + "tokenization_utils_sentencepiece": ["SentencePieceBackend"], "tokenization_utils_base": [ "AddedToken", "BatchEncoding", "CharSpan", "PreTrainedTokenizerBase", - "SpecialTokensMixin", "TokenSpan", ], "trainer_callback": [ @@ -274,7 +275,10 @@ ] else: # Fast tokenizers structure - _import_structure["tokenization_utils_fast"] = ["PreTrainedTokenizerFast"] + _import_structure["tokenization_utils_tokenizers"] = [ + "TokenizersBackend", + "PreTrainedTokenizerFast", + ] try: @@ -302,7 +306,7 @@ name for name in dir(dummy_mistral_common_objects) if not name.startswith("_") ] else: - _import_structure["tokenization_mistral_common"] = ["MistralCommonTokenizer"] + _import_structure["tokenization_mistral_common"] = ["MistralCommonBackend"] # Vision-specific objects try: @@ -677,14 +681,20 @@ from .pytorch_utils import apply_chunking_to_forward as apply_chunking_to_forward # Tokenization - from .tokenization_utils import PreTrainedTokenizer as PreTrainedTokenizer + from .tokenization_python import PreTrainedTokenizer as PreTrainedTokenizer + from .tokenization_python import PythonBackend as PythonBackend from .tokenization_utils_base import AddedToken as AddedToken from .tokenization_utils_base import BatchEncoding as BatchEncoding from .tokenization_utils_base import CharSpan as CharSpan from .tokenization_utils_base import PreTrainedTokenizerBase as PreTrainedTokenizerBase - from .tokenization_utils_base import SpecialTokensMixin as SpecialTokensMixin from .tokenization_utils_base import TokenSpan as TokenSpan - from .tokenization_utils_fast import PreTrainedTokenizerFast as PreTrainedTokenizerFast + + # Tokenization + from .tokenization_utils_sentencepiece import SentencePieceBackend as SentencePieceBackend + from .tokenization_utils_tokenizers import PreTrainedTokenizerFast as PreTrainedTokenizerFast + from .tokenization_utils_tokenizers import ( + TokenizersBackend as TokenizersBackend, + ) # Trainer from .trainer import Trainer as Trainer diff --git a/src/transformers/cli/add_new_model_like.py b/src/transformers/cli/add_new_model_like.py index 8a0a5446c301..f6a7944ca38a 100644 --- a/src/transformers/cli/add_new_model_like.py +++ b/src/transformers/cli/add_new_model_like.py @@ -142,7 +142,7 @@ def __init__(self, lowercase_name: str): # Get tokenizer class if self.lowercase_name in TOKENIZER_MAPPING_NAMES: - self.tokenizer_class, self.fast_tokenizer_class = TOKENIZER_MAPPING_NAMES[self.lowercase_name] + self.fast_tokenizer_class = TOKENIZER_MAPPING_NAMES[self.lowercase_name] self.fast_tokenizer_class = ( None if self.fast_tokenizer_class == "PreTrainedTokenizerFast" else self.fast_tokenizer_class ) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 8bafa5d017b0..26bc4eb1f285 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -1590,7 +1590,6 @@ def tokenizer(self, proto): return tokenizer -# Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode def bytes_to_unicode(): """ Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control @@ -1625,16 +1624,14 @@ def __init__( vocab_file=None, pattern=r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""", add_prefix_space=False, - additional_special_tokens=None, + extra_special_tokens=None, **kwargs, ): self.vocab_file = vocab_file self.pattern = pattern self.add_prefix_space = add_prefix_space - self.additional_special_tokens = ( - additional_special_tokens.keys() - if isinstance(additional_special_tokens, dict) - else additional_special_tokens + self.extra_special_tokens = ( + extra_special_tokens.keys() if isinstance(extra_special_tokens, dict) else extra_special_tokens ) def extract_vocab_merges_from_model(self, tiktoken_url: str): @@ -1686,7 +1683,7 @@ def converted(self) -> Tokenizer: tokenizer.decoder = decoders.ByteLevel() tokenizer.add_special_tokens( - [AddedToken(token, normalized=False, special=True) for token in self.additional_special_tokens] + [AddedToken(token, normalized=False, special=True) for token in self.extra_special_tokens] ) tokenizer.post_processor = processors.ByteLevel(trim_offsets=False) @@ -1861,7 +1858,7 @@ def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokeni logger.info("Converting from Tiktoken") return TikTokenConverter( vocab_file=transformer_tokenizer.vocab_file, - additional_special_tokens=transformer_tokenizer.additional_special_tokens, + extra_special_tokens=transformer_tokenizer.extra_special_tokens, ).converted() except Exception: raise ValueError( diff --git a/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py b/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py index 16badf48b2f9..46886e6fc039 100755 --- a/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py +++ b/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py @@ -28,11 +28,21 @@ logger = logging.get_logger(__name__) -TOKENIZER_CLASSES = { - # Phi3 uses Llama tokenizer - name: getattr(transformers, "LlamaTokenizerFast" if name == "Phi3Tokenizer" else name + "Fast") - for name in SLOW_TO_FAST_CONVERTERS -} +TOKENIZER_CLASSES = {} +for name in SLOW_TO_FAST_CONVERTERS: + # Special cases for tokenizers that don't have their own Fast tokenizer + if name == "Phi3Tokenizer": + tokenizer_class_name = "LlamaTokenizerFast" + elif name == "ElectraTokenizer": + tokenizer_class_name = "BertTokenizerFast" + else: + tokenizer_class_name = name + "Fast" + + try: + TOKENIZER_CLASSES[name] = getattr(transformers, tokenizer_class_name) + except AttributeError: + # Skip tokenizers that don't have a Fast version + pass def convert_slow_checkpoint_to_fast(tokenizer_name, checkpoint_name, dump_path, force_download): diff --git a/src/transformers/data/datasets/glue.py b/src/transformers/data/datasets/glue.py index 51058a696101..644c582b5ed1 100644 --- a/src/transformers/data/datasets/glue.py +++ b/src/transformers/data/datasets/glue.py @@ -102,7 +102,6 @@ def __init__( label_list = self.processor.get_labels() if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__.__name__ in ( "RobertaTokenizer", - "RobertaTokenizerFast", "XLMRobertaTokenizer", "BartTokenizer", "BartTokenizerFast", diff --git a/src/transformers/data/datasets/squad.py b/src/transformers/data/datasets/squad.py index 8eb9bf4dfaab..ba8936a5d642 100644 --- a/src/transformers/data/datasets/squad.py +++ b/src/transformers/data/datasets/squad.py @@ -22,7 +22,7 @@ from torch.utils.data import Dataset from ...models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING -from ...tokenization_utils import PreTrainedTokenizer +from ...tokenization_python import PreTrainedTokenizer from ...utils import check_torch_load_is_safe, logging from ..processors.squad import SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features diff --git a/src/transformers/data/processors/glue.py b/src/transformers/data/processors/glue.py index 8ed669aed831..c6865f2d99a5 100644 --- a/src/transformers/data/processors/glue.py +++ b/src/transformers/data/processors/glue.py @@ -19,7 +19,7 @@ import warnings from enum import Enum -from ...tokenization_utils import PreTrainedTokenizer +from ...tokenization_python import PreTrainedTokenizer from ...utils import logging from .utils import DataProcessor, InputExample, InputFeatures diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py index e3064b9d2134..bc40f6b3ad39 100644 --- a/src/transformers/data/processors/squad.py +++ b/src/transformers/data/processors/squad.py @@ -21,7 +21,7 @@ import numpy as np from tqdm import tqdm -from ...models.bert.tokenization_bert import whitespace_tokenize +from ...models.bert.tokenization_bert_legacy import whitespace_tokenize from ...tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase, TruncationStrategy from ...utils import is_torch_available, is_torch_hpu_available, logging from .utils import DataProcessor @@ -125,7 +125,6 @@ def squad_convert_example_to_features( "RobertaTokenizer", "LongformerTokenizer", "BartTokenizer", - "RobertaTokenizerFast", "LongformerTokenizerFast", "BartTokenizerFast", ]: @@ -161,7 +160,8 @@ def squad_convert_example_to_features( if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET else tokenizer.model_max_length - tokenizer.max_len_single_sentence ) - sequence_pair_added_tokens = tokenizer.model_max_length - tokenizer.max_len_sentences_pair + max_len_sentences_pair = tokenizer.model_max_length - tokenizer.num_special_tokens_to_add(pair=True) + sequence_pair_added_tokens = tokenizer.model_max_length - max_len_sentences_pair span_doc_tokens = all_doc_tokens while len(spans) * doc_stride < len(all_doc_tokens): @@ -175,7 +175,7 @@ def squad_convert_example_to_features( pairs = truncated_query truncation = TruncationStrategy.ONLY_FIRST.value - encoded_dict = tokenizer.encode_plus( # TODO(thom) update this logic + encoded_dict = tokenizer( # TODO(thom) update this logic texts, pairs, truncation=truncation, diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 652639af235c..c695858169e9 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -490,7 +490,7 @@ def convert_source_tokens_to_target_tokens( Returns: The converted token IDs. """ - text = source_tokenizer.batch_decode(input_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) + text = source_tokenizer.decode(input_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) dest_ids = destination_tokenizer(text, add_special_tokens=True, return_tensors="pt")["input_ids"] return dest_ids.to(input_ids.device) @@ -978,7 +978,7 @@ def _prepare_assistant_input_ids(self, target_input_ids: torch.LongTensor) -> to # we have only one new token and we can directly convert it assistant_new_ids = self._atm_translator.target_to_assistant_input_ids.get(target_new_ids[0].item()) if assistant_new_ids is None: - target_new_text = self.target_tokenizer.batch_decode( + target_new_text = self.target_tokenizer.decode( target_new_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True ) assistant_new_ids = self.assistant_tokenizer( diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index b64f455178b7..f97828a0862b 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -44,7 +44,7 @@ from ..integrations.fsdp import is_fsdp_managed_module from ..masking_utils import create_masks_for_generate from ..pytorch_utils import isin_mps_friendly -from ..tokenization_utils import ExtensionsTrie +from ..tokenization_python import ExtensionsTrie from ..utils import ( ModelOutput, TransformersKwargs, @@ -2734,7 +2734,7 @@ def heal_tokens( # assumption: leading/trailing whitespace is not meaningful, so the prompts are # stripped before re-tokenizing to desensitize generation to whitespace artefacts - prompts = [p.strip() for p in tokenizer.batch_decode(input_ids, skip_special_tokens=True)] + prompts = [p.strip() for p in tokenizer.decode(input_ids, skip_special_tokens=True)] input_ids = tokenizer( prompts, return_tensors="pt", diff --git a/src/transformers/integrations/mistral.py b/src/transformers/integrations/mistral.py index cdf237645fc1..68807f20425b 100644 --- a/src/transformers/integrations/mistral.py +++ b/src/transformers/integrations/mistral.py @@ -1,8 +1,8 @@ from tokenizers import Regex, Tokenizer, decoders, pre_tokenizers, processors from tokenizers.models import BPE -from transformers import LlamaTokenizerFast from transformers.convert_slow_tokenizer import bytes_to_unicode +from transformers.tokenization_utils_tokenizers import PreTrainedTokenizerFast class MistralConverter: @@ -85,7 +85,9 @@ def convert_tekken_tokenizer(tokenizer_file: str): # Extract vocab and special tokens vocab = mistral_tokenizer.instruct_tokenizer.tokenizer._tekken_token2id_nospecial all_special = [ - token.value if hasattr(token, "value") else token + token.get("token_str", str(token)) + if isinstance(token, dict) + else (token.value if hasattr(token, "value") else str(token)) for token in mistral_tokenizer.instruct_tokenizer.tokenizer._all_special_tokens ] specials_tokens = {token: all_special.index(token) for token in all_special} @@ -93,8 +95,8 @@ def convert_tekken_tokenizer(tokenizer_file: str): vocab = specials_tokens # Convert - tokenizer = LlamaTokenizerFast( - tokenizer_object=MistralConverter(vocab=vocab, additional_special_tokens=all_special).converted(), + tokenizer = PreTrainedTokenizerFast( + tokenizer_object=MistralConverter(vocab=vocab, additional_special_tokens=all_special).converted() ) # Post-process diff --git a/src/transformers/integrations/tiktoken.py b/src/transformers/integrations/tiktoken.py index 6f200bc35593..9dd19052d29a 100644 --- a/src/transformers/integrations/tiktoken.py +++ b/src/transformers/integrations/tiktoken.py @@ -2,7 +2,7 @@ from typing import Any from transformers.convert_slow_tokenizer import TikTokenConverter -from transformers.tokenization_utils_fast import TIKTOKEN_VOCAB_FILE, TOKENIZER_FILE +from transformers.tokenization_utils_tokenizers import TIKTOKEN_VOCAB_FILE, TOKENIZER_FILE def convert_tiktoken_to_fast(encoding: Any, output_dir: str): @@ -23,6 +23,9 @@ def convert_tiktoken_to_fast(encoding: Any, output_dir: str): save_file = output_dir / "tiktoken" / TIKTOKEN_VOCAB_FILE tokenizer_file = output_dir / TOKENIZER_FILE + # Create parent directory for save_file + save_file.parent.mkdir(parents=True, exist_ok=True) + save_file_absolute = str(save_file.absolute()) output_file_absolute = str(tokenizer_file.absolute()) @@ -34,10 +37,17 @@ def convert_tiktoken_to_fast(encoding: Any, output_dir: str): encoding = get_encoding(encoding) dump_tiktoken_bpe(encoding._mergeable_ranks, save_file_absolute) - except ImportError: - raise ValueError("`tiktoken` is required to save a `tiktoken` file. Install it with `pip install tiktoken`.") + except ImportError as e: + error_msg = str(e) + if "blobfile" in error_msg.lower(): + raise ValueError( + "`blobfile` is required to save a `tiktoken` file. Install it with `pip install blobfile`." + ) from e + raise ValueError( + "`tiktoken` is required to save a `tiktoken` file. Install it with `pip install tiktoken`." + ) from e tokenizer = TikTokenConverter( - vocab_file=save_file_absolute, pattern=encoding._pat_str, additional_special_tokens=encoding._special_tokens + vocab_file=save_file_absolute, pattern=encoding._pat_str, extra_special_tokens=encoding._special_tokens ).converted() tokenizer.save(output_file_absolute) diff --git a/src/transformers/models/albert/__init__.py b/src/transformers/models/albert/__init__.py index ac2cf362ebf2..606fc7dcc0f8 100644 --- a/src/transformers/models/albert/__init__.py +++ b/src/transformers/models/albert/__init__.py @@ -21,7 +21,6 @@ from .configuration_albert import * from .modeling_albert import * from .tokenization_albert import * - from .tokenization_albert_fast import * else: import sys diff --git a/src/transformers/models/albert/tokenization_albert.py b/src/transformers/models/albert/tokenization_albert.py index 011ad689edbd..479efa963548 100644 --- a/src/transformers/models/albert/tokenization_albert.py +++ b/src/transformers/models/albert/tokenization_albert.py @@ -14,41 +14,30 @@ # limitations under the License. """Tokenization classes for ALBERT model.""" -import os -import unicodedata -from shutil import copyfile -from typing import Any, Optional +from typing import Optional -import sentencepiece as spm +from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors +from tokenizers.models import Unigram -from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...tokenization_utils_tokenizers import TokenizersBackend from ...utils import logging -from ...utils.import_utils import requires logger = logging.get_logger(__name__) -VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} -SPIECE_UNDERLINE = "▁" - -@requires(backends=("sentencepiece",)) -class AlbertTokenizer(PreTrainedTokenizer): +class AlbertTokenizer(TokenizersBackend): """ - Construct an ALBERT tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece). - - This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to - this superclass for more information regarding those methods. + Construct a "fast" ALBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on + [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This + tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods Args: - vocab_file (`str`): - [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that - contains the vocabulary necessary to instantiate a tokenizer. do_lower_case (`bool`, *optional*, defaults to `True`): Whether or not to lowercase the input when tokenizing. - remove_space (`bool`, *optional*, defaults to `True`): - Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). keep_accents (`bool`, *optional*, defaults to `False`): Whether or not to keep accents when tokenizing. bos_token (`str`, *optional*, defaults to `"[CLS]"`): @@ -62,15 +51,8 @@ class AlbertTokenizer(PreTrainedTokenizer): eos_token (`str`, *optional*, defaults to `"[SEP]"`): - The end of sequence token. - - - - When building a sequence using special tokens, this is not the token that is used for the end of sequence. - The token used is the `sep_token`. - - - + The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token + that is used for the end of sequence. The token used is the `sep_token`. unk_token (`str`, *optional*, defaults to `""`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. @@ -86,235 +68,118 @@ class AlbertTokenizer(PreTrainedTokenizer): mask_token (`str`, *optional*, defaults to `"[MASK]"`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. - sp_model_kwargs (`dict`, *optional*): - Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for - SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, - to set: - - - `enable_sampling`: Enable subword regularization. - - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. - - - `nbest_size = {0,1}`: No sampling is performed. - - `nbest_size > 1`: samples from the nbest_size results. - - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) - using forward-filtering-and-backward-sampling algorithm. - - - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for - BPE-dropout. - - Attributes: - sp_model (`SentencePieceProcessor`): - The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). + add_prefix_space (`bool`, *optional*, defaults to `True`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. + trim_offsets (`bool`, *optional*, defaults to `True`): + Whether the post processing step should trim offsets to avoid including whitespaces. + vocab (`dict`, *optional*): + Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file. + vocab_file (`str`, *optional*): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that + contains the vocabulary necessary to instantiate a tokenizer. """ vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = None def __init__( self, - vocab_file, - do_lower_case=True, - remove_space=True, - keep_accents=False, - bos_token="[CLS]", - eos_token="[SEP]", - unk_token="", - sep_token="[SEP]", - pad_token="", - cls_token="[CLS]", - mask_token="[MASK]", - sp_model_kwargs: Optional[dict[str, Any]] = None, + do_lower_case: bool = True, + keep_accents: bool = False, + bos_token: str = "[CLS]", + eos_token: str = "[SEP]", + unk_token: str = "", + sep_token: str = "[SEP]", + pad_token: str = "", + cls_token: str = "[CLS]", + mask_token: str = "[MASK]", + add_prefix_space: bool = True, + trim_offsets: bool = True, + vocab: Optional[dict] = None, + vocab_file: Optional[str] = None, **kwargs, - ) -> None: - # Mask token behave like a normal word, i.e. include the space before it and - # is included in the raw text, there should be a match in a non-normalized sentence. - mask_token = ( - AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False) - if isinstance(mask_token, str) - else mask_token - ) - - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + ): + self.vocab_file = vocab_file + self.add_prefix_space = add_prefix_space + self.trim_offsets = trim_offsets self.do_lower_case = do_lower_case - self.remove_space = remove_space self.keep_accents = keep_accents - self.vocab_file = vocab_file - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(vocab_file) - - super().__init__( - do_lower_case=do_lower_case, - remove_space=remove_space, - keep_accents=keep_accents, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - sp_model_kwargs=self.sp_model_kwargs, - **kwargs, - ) - - @property - def vocab_size(self) -> int: - return len(self.sp_model) - def get_vocab(self) -> dict[str, int]: - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) - return vocab - - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - return state - - def __setstate__(self, d): - self.__dict__ = d - - # for backward compatibility - if not hasattr(self, "sp_model_kwargs"): - self.sp_model_kwargs = {} - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(self.vocab_file) - - def preprocess_text(self, inputs): - if self.remove_space: - outputs = " ".join(inputs.strip().split()) + if vocab is not None: + self._vocab_scores = [(token, 0.0) for token in vocab.keys()] if isinstance(vocab, dict) else list(vocab) else: - outputs = inputs - outputs = outputs.replace("``", '"').replace("''", '"') + self._vocab_scores = [ + (str(pad_token), 0.0), + (str(unk_token), 0.0), + (str(cls_token), 0.0), + (str(sep_token), 0.0), + (str(mask_token), 0.0), + ] + + self._tokenizer = Tokenizer( + Unigram( + self._vocab_scores, + unk_id=1, + byte_fallback=False, + ) + ) + list_normalizers = [ + normalizers.Replace("``", '"'), + normalizers.Replace("''", '"'), + normalizers.NFKD(), + normalizers.StripAccents(), + normalizers.Lowercase(), + normalizers.Replace(Regex(" {2,}"), " "), + ] if not self.keep_accents: - outputs = unicodedata.normalize("NFKD", outputs) - outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) + list_normalizers.append(normalizers.NFKD()) + list_normalizers.append(normalizers.StripAccents()) if self.do_lower_case: - outputs = outputs.lower() - - return outputs - - def _tokenize(self, text: str) -> list[str]: - """Tokenize a string.""" - text = self.preprocess_text(text) - pieces = self.sp_model.encode(text, out_type=str) - new_pieces = [] - for piece in pieces: - if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit(): - # Logic to handle special cases see https://github.com/google-research/bert/blob/master/README.md#tokenization - # `9,9` -> ['▁9', ',', '9'] instead of [`_9,`, '9'] - cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) - if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: - if len(cur_pieces[0]) == 1: - cur_pieces = cur_pieces[1:] - else: - cur_pieces[0] = cur_pieces[0][1:] - cur_pieces.append(piece[-1]) - new_pieces.extend(cur_pieces) - else: - new_pieces.append(piece) - - return new_pieces - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - return self.sp_model.PieceToId(token) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - return self.sp_model.IdToPiece(index) - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (string) in a single string.""" - current_sub_tokens = [] - out_string = "" - prev_is_special = False - for token in tokens: - # make sure that special tokens are not decoded using sentencepiece model - if token in self.all_special_tokens: - if not prev_is_special: - out_string += " " - out_string += self.sp_model.decode(current_sub_tokens) + token - prev_is_special = True - current_sub_tokens = [] - else: - current_sub_tokens.append(token) - prev_is_special = False - out_string += self.sp_model.decode(current_sub_tokens) - return out_string.strip() - - def build_inputs_with_special_tokens( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. An ALBERT sequence has the following format: - - - single sequence: `[CLS] X [SEP]` - - pair of sequences: `[CLS] A [SEP] B [SEP]` - - Args: - token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. + list_normalizers.append(normalizers.Lowercase()) - Returns: - `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - if token_ids_1 is None: - return cls + token_ids_0 + sep - return cls + token_ids_0 + sep + token_ids_1 + sep + list_normalizers.append(normalizers.Replace(Regex(" {2,}"), " ")) + self._tokenizer.normalizer = normalizers.Sequence(list_normalizers) - def get_special_tokens_mask( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False - ) -> list[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) + prepend_scheme = "always" if add_prefix_space else "never" + self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence( + [ + pre_tokenizers.WhitespaceSplit(), + pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=prepend_scheme), + ] + ) - if token_ids_1 is not None: - return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] + self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme=prepend_scheme) - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: - if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + self._tokenizer.post_processor = processors.TemplateProcessing( + single="[CLS]:0 $A:0 [SEP]:0", + pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1", + special_tokens=[ + ("[CLS]", self._tokenizer.token_to_id(str(cls_token))), + ("[SEP]", self._tokenizer.token_to_id(str(sep_token))), + ], ) - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): - copyfile(self.vocab_file, out_vocab_file) - elif not os.path.isfile(self.vocab_file): - with open(out_vocab_file, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto() - fi.write(content_spiece_model) + tokenizer_object = self._tokenizer - return (out_vocab_file,) + super().__init__( + tokenizer_object=tokenizer_object, + do_lower_case=self.do_lower_case, + keep_accents=self.keep_accents, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + trim_offsets=trim_offsets, + **kwargs, + ) __all__ = ["AlbertTokenizer"] diff --git a/src/transformers/models/albert/tokenization_albert_fast.py b/src/transformers/models/albert/tokenization_albert_fast.py deleted file mode 100644 index ed9add51d207..000000000000 --- a/src/transformers/models/albert/tokenization_albert_fast.py +++ /dev/null @@ -1,178 +0,0 @@ -# coding=utf-8 -# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tokenization classes for ALBERT model.""" - -import os -from shutil import copyfile -from typing import Optional - -from ...tokenization_utils import AddedToken -from ...tokenization_utils_fast import PreTrainedTokenizerFast -from ...utils import is_sentencepiece_available, logging - - -if is_sentencepiece_available(): - from .tokenization_albert import AlbertTokenizer -else: - AlbertTokenizer = None - -logger = logging.get_logger(__name__) -VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} - - -SPIECE_UNDERLINE = "▁" - - -class AlbertTokenizerFast(PreTrainedTokenizerFast): - """ - Construct a "fast" ALBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on - [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This - tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to - this superclass for more information regarding those methods - - Args: - vocab_file (`str`): - [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that - contains the vocabulary necessary to instantiate a tokenizer. - do_lower_case (`bool`, *optional*, defaults to `True`): - Whether or not to lowercase the input when tokenizing. - remove_space (`bool`, *optional*, defaults to `True`): - Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). - keep_accents (`bool`, *optional*, defaults to `False`): - Whether or not to keep accents when tokenizing. - bos_token (`str`, *optional*, defaults to `"[CLS]"`): - The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. - - - - When building a sequence using special tokens, this is not the token that is used for the beginning of - sequence. The token used is the `cls_token`. - - - - eos_token (`str`, *optional*, defaults to `"[SEP]"`): - The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token - that is used for the end of sequence. The token used is the `sep_token`. - unk_token (`str`, *optional*, defaults to `""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - sep_token (`str`, *optional*, defaults to `"[SEP]"`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for - sequence classification or for a text and a question for question answering. It is also used as the last - token of a sequence built with special tokens. - pad_token (`str`, *optional*, defaults to `""`): - The token used for padding, for example when batching sequences of different lengths. - cls_token (`str`, *optional*, defaults to `"[CLS]"`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. - mask_token (`str`, *optional*, defaults to `"[MASK]"`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - """ - - vocab_files_names = VOCAB_FILES_NAMES - slow_tokenizer_class = AlbertTokenizer - - def __init__( - self, - vocab_file=None, - tokenizer_file=None, - do_lower_case=True, - remove_space=True, - keep_accents=False, - bos_token="[CLS]", - eos_token="[SEP]", - unk_token="", - sep_token="[SEP]", - pad_token="", - cls_token="[CLS]", - mask_token="[MASK]", - **kwargs, - ): - # Mask token behave like a normal word, i.e. include the space before it and - # is included in the raw text, there should be a match in a non-normalized sentence. - mask_token = ( - AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False) - if isinstance(mask_token, str) - else mask_token - ) - - super().__init__( - vocab_file, - tokenizer_file=tokenizer_file, - do_lower_case=do_lower_case, - remove_space=remove_space, - keep_accents=keep_accents, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - **kwargs, - ) - - self.do_lower_case = do_lower_case - self.remove_space = remove_space - self.keep_accents = keep_accents - self.vocab_file = vocab_file - - def build_inputs_with_special_tokens( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. An ALBERT sequence has the following format: - - - single sequence: `[CLS] X [SEP]` - - pair of sequences: `[CLS] A [SEP] B [SEP]` - - Args: - token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - if token_ids_1 is None: - return cls + token_ids_0 + sep - return cls + token_ids_0 + sep + token_ids_1 + sep - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: - if not self.can_save_slow_tokenizer: - raise ValueError( - "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " - "tokenizer." - ) - - if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): - copyfile(self.vocab_file, out_vocab_file) - - return (out_vocab_file,) - - -__all__ = ["AlbertTokenizerFast"] diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index d2e15336832f..1001e647803b 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -40,7 +40,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_utils import PreTrainedModel from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack -from ...tokenization_utils import PreTokenizedInput, TextInput +from ...tokenization_python import PreTokenizedInput, TextInput from ...utils import TensorType, TransformersKwargs, auto_docstring, can_return_tuple, logging from ..auto import CONFIG_MAPPING, AutoConfig, AutoTokenizer from ..llama.configuration_llama import LlamaConfig diff --git a/src/transformers/models/aria/processing_aria.py b/src/transformers/models/aria/processing_aria.py index c29c289649da..23f13f3129c2 100644 --- a/src/transformers/models/aria/processing_aria.py +++ b/src/transformers/models/aria/processing_aria.py @@ -25,7 +25,7 @@ from ...image_processing_utils import BatchFeature from ...image_utils import ImageInput from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack -from ...tokenization_utils import PreTokenizedInput, TextInput +from ...tokenization_python import PreTokenizedInput, TextInput from ...utils import TensorType from ..auto import AutoTokenizer diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index d89faee4c13a..93d8dfabdd7e 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -25,7 +25,7 @@ from ...feature_extraction_utils import FeatureExtractionMixin from ...image_processing_utils import ImageProcessingMixin from ...processing_utils import ProcessorMixin -from ...tokenization_utils import TOKENIZER_CONFIG_FILE +from ...tokenization_python import TOKENIZER_CONFIG_FILE from ...utils import FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, VIDEO_PROCESSOR_NAME, cached_file, logging from ...video_processing_utils import BaseVideoProcessor from .auto_factory import _LazyAutoMapping diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 2511bb8f24e0..deac13558652 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -15,6 +15,7 @@ """Auto Tokenizer class.""" import importlib +import inspect import json import os from collections import OrderedDict @@ -25,16 +26,16 @@ from ...configuration_utils import PreTrainedConfig from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code from ...modeling_gguf_pytorch_utils import load_gguf_checkpoint -from ...tokenization_utils import PreTrainedTokenizer -from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE +from ...tokenization_python import PreTrainedTokenizer, PythonBackend +from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE, find_sentencepiece_model_file, load_vocab_and_merges from ...utils import ( - cached_file, extract_commit_hash, is_g2p_en_available, is_sentencepiece_available, is_tokenizers_available, logging, ) +from ...utils.hub import cached_file, has_file from ..encoder_decoder import EncoderDecoderConfig from .auto_factory import _LazyAutoMapping from .configuration_auto import ( @@ -47,777 +48,303 @@ if is_tokenizers_available(): - from ...tokenization_utils_fast import PreTrainedTokenizerFast + from ...tokenization_utils_tokenizers import TokenizersBackend else: - PreTrainedTokenizerFast = None + TokenizersBackend = None +if is_sentencepiece_available(): + from ...tokenization_utils_sentencepiece import SentencePieceBackend +else: + SentencePieceBackend = None logger = logging.get_logger(__name__) -# Explicit rather than inferred generics to significantly improves completion suggestion performance for language servers. -TOKENIZER_MAPPING_NAMES = OrderedDict[str, tuple[Optional[str], Optional[str]]]( +# V5: Simplified mapping - single tokenizer class per model type (always prefer tokenizers-based) +REGISTERED_TOKENIZER_CLASSES: dict[str, type[Any]] = {} +REGISTERED_FAST_ALIASES: dict[str, type[Any]] = {} + +TOKENIZER_MAPPING_NAMES = OrderedDict[str, Optional[str]]( [ - ( - "aimv2", - ( - "CLIPTokenizer", - "CLIPTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "albert", - ( - "AlbertTokenizer" if is_sentencepiece_available() else None, - "AlbertTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("align", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), - ("altclip", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)), - ("arcee", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("aria", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("aya_vision", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), - ("bark", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), - ("bart", ("BartTokenizer", "BartTokenizerFast")), - ( - "barthez", - ( - "BarthezTokenizer" if is_sentencepiece_available() else None, - "BarthezTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("bartpho", ("BartphoTokenizer", None)), - ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), - ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)), - ("bert-japanese", ("BertJapaneseTokenizer", None)), - ("bertweet", ("BertweetTokenizer", None)), - ( - "big_bird", - ( - "BigBirdTokenizer" if is_sentencepiece_available() else None, - "BigBirdTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)), - ("biogpt", ("BioGptTokenizer", None)), - ("bitnet", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")), - ("blenderbot-small", ("BlenderbotSmallTokenizer", None)), - ("blip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), - ("blip-2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), - ("bloom", (None, "BloomTokenizerFast" if is_tokenizers_available() else None)), - ("blt", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("bridgetower", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), - ("bros", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), - ("byt5", ("ByT5Tokenizer", None)), - ( - "camembert", - ( - "CamembertTokenizer" if is_sentencepiece_available() else None, - "CamembertTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("canine", ("CanineTokenizer", None)), - ( - "chameleon", - ( - "LlamaTokenizer" if is_sentencepiece_available() else None, - "LlamaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("chinese_clip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), - ( - "clap", - ( - "RobertaTokenizer", - "RobertaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "clip", - ( - "CLIPTokenizer", - "CLIPTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "clipseg", - ( - "CLIPTokenizer", - "CLIPTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("clvp", ("ClvpTokenizer", None)), - ( - "code_llama", - ( - "CodeLlamaTokenizer" if is_sentencepiece_available() else None, - "CodeLlamaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("codegen", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)), - ("cohere", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), - ("cohere2", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), - ("cohere2_vision", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), - ("colpali", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("colqwen2", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), - ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)), - ( - "cpm", - ( - "CpmTokenizer" if is_sentencepiece_available() else None, - "CpmTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("cpmant", ("CpmAntTokenizer", None)), - ("csm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("ctrl", ("CTRLTokenizer", None)), - ( - "cwm", - ( - "LlamaTokenizer" if is_sentencepiece_available() else None, - "LlamaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("data2vec-audio", ("Wav2Vec2CTCTokenizer", None)), - ("data2vec-text", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), - ("dbrx", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), - ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)), - ( - "deberta-v2", - ( - "DebertaV2Tokenizer" if is_sentencepiece_available() else None, - "DebertaV2TokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "deepseek_v2", - ( - "LlamaTokenizer" if is_sentencepiece_available() else None, - "LlamaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "deepseek_v3", - ( - "LlamaTokenizer" if is_sentencepiece_available() else None, - "LlamaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "deepseek_vl", - ( - "LlamaTokenizer" if is_sentencepiece_available() else None, - "LlamaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "deepseek_vl_hybrid", - ( - "LlamaTokenizer" if is_sentencepiece_available() else None, - "LlamaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("dia", ("DiaTokenizer", None)), - ( - "diffllama", - ( - "LlamaTokenizer" if is_sentencepiece_available() else None, - "LlamaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)), - ("donut", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)), - ( - "dpr", - ( - "DPRQuestionEncoderTokenizer", - "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)), - ("emu3", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), - ("ernie", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), - ("ernie4_5", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("ernie4_5_moe", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("esm", ("EsmTokenizer", None)), - ("evolla", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ( - "exaone4", - ( - "GPT2Tokenizer" if is_tokenizers_available() else None, - "GPT2TokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("falcon", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("falcon_mamba", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), - ( - "fastspeech2_conformer", - ("FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None, None), - ), - ("flaubert", ("FlaubertTokenizer", None)), - ("flava", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), - ("flex_olmo", (None, "GPT2TokenizerFast" if is_tokenizers_available() else None)), - ("florence2", ("BartTokenizer", "BartTokenizerFast" if is_tokenizers_available() else None)), - ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)), - ("fsmt", ("FSMTTokenizer", None)), - ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)), - ("fuyu", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ( - "gemma", - ( - "GemmaTokenizer" if is_sentencepiece_available() else None, - "GemmaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "gemma2", - ( - "GemmaTokenizer" if is_sentencepiece_available() else None, - "GemmaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "gemma3", - ( - "GemmaTokenizer" if is_sentencepiece_available() else None, - "GemmaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "gemma3_text", - ( - "GemmaTokenizer" if is_sentencepiece_available() else None, - "GemmaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "gemma3n", - ( - "GemmaTokenizer" if is_sentencepiece_available() else None, - "GemmaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "gemma3n_text", - ( - "GemmaTokenizer" if is_sentencepiece_available() else None, - "GemmaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), - ("glm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("glm4", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("glm46v", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("glm4_moe", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("glm4v", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("glm4v_moe", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("got_ocr2", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)), - ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), - ("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), - ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), - ("gpt_neox", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), - ("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)), - ("gpt_oss", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), - ("granite", ("GPT2Tokenizer", None)), - ("granite_speech", ("GPT2Tokenizer", None)), - ("granitemoe", ("GPT2Tokenizer", None)), - ("granitemoehybrid", ("GPT2Tokenizer", None)), - ("granitemoeshared", ("GPT2Tokenizer", None)), - ("grounding-dino", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), - ("groupvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), - ("helium", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)), - ("hubert", ("Wav2Vec2CTCTokenizer", None)), - ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), - ("idefics", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("idefics2", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("idefics3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), - ("instructblipvideo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), - ("internvl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), - ( - "jamba", - ( - "LlamaTokenizer" if is_sentencepiece_available() else None, - "LlamaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("janus", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ( - "jetmoe", - ( - "LlamaTokenizer" if is_sentencepiece_available() else None, - "LlamaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "kosmos-2", - ( - "XLMRobertaTokenizer" if is_sentencepiece_available() else None, - "XLMRobertaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("kosmos-2.5", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("kyutai_speech_to_text", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)), - ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)), - ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)), - ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)), - ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)), - ("lfm2", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("lfm2_vl", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("lilt", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)), - ( - "llama", - ( - "LlamaTokenizer" if is_sentencepiece_available() else None, - "LlamaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "llama4", - ( - "LlamaTokenizer" if is_sentencepiece_available() else None, - "LlamaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "llama4_text", - ( - "LlamaTokenizer" if is_sentencepiece_available() else None, - "LlamaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("llava_next_video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("llava_onevision", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), - ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)), - ( - "longt5", - ( - "T5Tokenizer" if is_sentencepiece_available() else None, - "T5TokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("luke", ("LukeTokenizer", None)), - ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)), - ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)), - ("mamba", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), - ("mamba2", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), - ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)), - ("markuplm", ("MarkupLMTokenizer", "MarkupLMTokenizerFast" if is_tokenizers_available() else None)), - ( - "mbart", - ( - "MBartTokenizer" if is_sentencepiece_available() else None, - "MBartTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "mbart50", - ( - "MBart50Tokenizer" if is_sentencepiece_available() else None, - "MBart50TokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("megatron-bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), - ( - "metaclip_2", - ( - "XLMRobertaTokenizer", - "XLMRobertaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("mgp-str", ("MgpstrTokenizer", None)), - ( - "minimax", - ( - "GPT2Tokenizer" if is_sentencepiece_available() else None, - "GPT2TokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "ministral", - ( - "MistralCommonTokenizer" - if is_mistral_common_available() - else ("LlamaTokenizer" if is_sentencepiece_available() else None), - "LlamaTokenizerFast" if is_tokenizers_available() and not is_mistral_common_available() else None, - ), - ), + ("aimv2", "CLIPTokenizerFast" if is_tokenizers_available() else None), + ("albert", "AlbertTokenizer" if is_tokenizers_available() else None), + ("align", "BertTokenizer" if is_tokenizers_available() else None), + ("arcee", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("aria", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("aya_vision", "CohereTokenizer" if is_tokenizers_available() else None), + ("bark", "BertTokenizer" if is_tokenizers_available() else None), + ("bart", "RobertaTokenizer" if is_tokenizers_available() else None), + ("barthez", "BarthezTokenizer" if is_tokenizers_available() else None), + ("bartpho", "BartphoTokenizer"), + ("bert", "BertTokenizer" if is_tokenizers_available() else None), + ("bert-generation", "BertGenerationTokenizer" if is_sentencepiece_available() else None), + ("bert-japanese", "BertJapaneseTokenizer"), + ("bertweet", "BertweetTokenizer"), + ("big_bird", "BigBirdTokenizer" if is_tokenizers_available() else None), + ("bigbird_pegasus", "PegasusTokenizer" if is_tokenizers_available() else None), + ("biogpt", "BioGptTokenizer"), + ("bitnet", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("blenderbot", "BlenderbotTokenizer" if is_tokenizers_available() else None), + ("blenderbot-small", "BlenderbotSmallTokenizer"), + ("blip", "BertTokenizer" if is_tokenizers_available() else None), + ("blip-2", "GPT2Tokenizer" if is_tokenizers_available() else None), + ("bloom", "TokenizersBackend" if is_tokenizers_available() else None), + ("blt", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("bridgetower", "RobertaTokenizer"), + ("bros", "BertTokenizer" if is_tokenizers_available() else None), + ("byt5", "ByT5Tokenizer"), + ("camembert", "CamembertTokenizer" if is_tokenizers_available() else None), + ("canine", "CanineTokenizer"), + ("chameleon", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("chinese_clip", "BertTokenizer" if is_tokenizers_available() else None), + ("clap", "RobertaTokenizer"), + ("clip", "CLIPTokenizer" if is_tokenizers_available() else None), + ("clipseg", "CLIPTokenizer" if is_tokenizers_available() else None), + ("clvp", "ClvpTokenizer"), + ("code_llama", "CodeLlamaTokenizer" if is_tokenizers_available() else None), + ("codegen", "GPT2Tokenizer" if is_tokenizers_available() else None), + ("cohere", "CohereTokenizer" if is_tokenizers_available() else None), + ("cohere2", "CohereTokenizer" if is_tokenizers_available() else None), + ("colpali", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("colqwen2", "Qwen2TokenizerFast" if is_tokenizers_available() else None), + ("convbert", "BertTokenizer" if is_tokenizers_available() else None), + ("cpm", "CpmTokenizer" if is_tokenizers_available() else None), + ("cpmant", "CpmAntTokenizer"), + ("csm", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("ctrl", "CTRLTokenizer"), + ("data2vec-audio", "Wav2Vec2CTCTokenizer"), + ("data2vec-text", "RobertaTokenizer"), + ("dbrx", "GPT2Tokenizer" if is_tokenizers_available() else None), + ("deberta", "DebertaTokenizer" if is_tokenizers_available() else None), + ("deberta-v2", "DebertaV2Tokenizer" if is_tokenizers_available() else None), + ("deepseek_v2", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("deepseek_v3", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("deepseek_vl", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("deepseek_vl_hybrid", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("dia", "DiaTokenizer"), + ("diffllama", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("distilbert", "BertTokenizer" if is_tokenizers_available() else None), + ("dpr", "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None), + ("electra", "BertTokenizer" if is_tokenizers_available() else None), + ("emu3", "GPT2Tokenizer" if is_tokenizers_available() else None), + ("ernie", "BertTokenizer" if is_tokenizers_available() else None), + ("ernie4_5", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("ernie4_5_moe", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("esm", "EsmTokenizer"), + ("exaone4", "GPT2Tokenizer" if is_tokenizers_available() else None), + ("falcon", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("falcon_mamba", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None), + ("fastspeech2_conformer", "FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None), + ("flaubert", "FlaubertTokenizer"), + ("flava", "BertTokenizer" if is_tokenizers_available() else None), + ("flex_olmo", "GPT2Tokenizer" if is_tokenizers_available() else None), + ("florence2", "BartTokenizer" if is_tokenizers_available() else None), + ("fnet", "FNetTokenizerFast" if is_tokenizers_available() else None), + ("fsmt", "FSMTTokenizer"), + ("funnel", "FunnelTokenizer" if is_tokenizers_available() else None), + ("gemma", "GemmaTokenizerFast" if is_tokenizers_available() else None), + ("gemma2", "GemmaTokenizerFast" if is_tokenizers_available() else None), + ("gemma3", "GemmaTokenizerFast" if is_tokenizers_available() else None), + ("gemma3_text", "GemmaTokenizerFast" if is_tokenizers_available() else None), + ("gemma3n", "GemmaTokenizerFast" if is_tokenizers_available() else None), + ("gemma3n_text", "GemmaTokenizerFast" if is_tokenizers_available() else None), + ("git", "BertTokenizer" if is_tokenizers_available() else None), + ("glm", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("glm4", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("glm4_moe", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("glm4v", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("glm4v_moe", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("got_ocr2", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("gpt-sw3", "GPTSw3Tokenizer" if is_sentencepiece_available() else None), + ("gpt2", "GPT2Tokenizer" if is_tokenizers_available() else None), + ("gpt_bigcode", "GPT2Tokenizer" if is_tokenizers_available() else None), + ("gpt_neo", "GPT2Tokenizer" if is_tokenizers_available() else None), + ("gpt_neox", "GPTNeoXTokenizer" if is_tokenizers_available() else None), + ("gpt_neox_japanese", "GPTNeoXJapaneseTokenizer"), + ("gpt_oss", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("gptj", "GPT2Tokenizer" if is_tokenizers_available() else None), + ("granite", "GPT2Tokenizer"), + ("granitemoe", "GPT2Tokenizer"), + ("granitemoehybrid", "GPT2Tokenizer"), + ("granitemoeshared", "GPT2Tokenizer"), + ("grounding-dino", "BertTokenizer" if is_tokenizers_available() else None), + ("groupvit", "CLIPTokenizerFast" if is_tokenizers_available() else None), + ("helium", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("herbert", "HerbertTokenizer" if is_tokenizers_available() else None), + ("hubert", "Wav2Vec2CTCTokenizer"), + ("ibert", "RobertaTokenizer"), + ("idefics", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("idefics2", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("idefics3", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("instructblip", "GPT2Tokenizer" if is_tokenizers_available() else None), + ("instructblipvideo", "GPT2Tokenizer" if is_tokenizers_available() else None), + ("internvl", "Qwen2TokenizerFast" if is_tokenizers_available() else None), + ("jamba", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("janus", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("jetmoe", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("kosmos-2", "XLMRobertaTokenizer" if is_tokenizers_available() else None), + ("kosmos-2.5", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("layoutlm", "BertTokenizer" if is_tokenizers_available() else None), + ("layoutlmv2", "LayoutLMv2Tokenizer" if is_tokenizers_available() else None), + ("layoutlmv3", "LayoutLMv3Tokenizer" if is_tokenizers_available() else None), + ("layoutxlm", "LayoutXLMTokenizer" if is_tokenizers_available() else None), + ("led", "LEDTokenizer" if is_tokenizers_available() else None), + ("lfm2_vl", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("lilt", "RobertaTokenizer" if is_tokenizers_available() else None), + ("llama", "LlamaTokenizer" if is_tokenizers_available() else None), + ("llama4", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("llama4_text", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("llava", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("llava_next", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("llava_next_video", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("llava_onevision", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("longformer", "RobertaTokenizer" if is_tokenizers_available() else None), + ("longt5", "T5Tokenizer" if is_tokenizers_available() else None), + ("luke", "LukeTokenizer"), + ("lxmert", "LxmertTokenizer" if is_tokenizers_available() else None), + ("m2m_100", "M2M100Tokenizer" if is_sentencepiece_available() else None), + ("mamba", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None), + ("mamba2", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None), + ("marian", "MarianTokenizer" if is_sentencepiece_available() else None), + ("mbart", "MBartTokenizer" if is_tokenizers_available() else None), + ("mbart50", "MBart50Tokenizer" if is_tokenizers_available() else None), + ("mega", "RobertaTokenizer"), + ("megatron-bert", "BertTokenizer" if is_tokenizers_available() else None), + ("metaclip_2", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None), + ("mgp-str", "MgpstrTokenizer"), + ("minimax", "GPT2Tokenizer" if is_tokenizers_available() else None), ( "mistral", - ( - "MistralCommonTokenizer" - if is_mistral_common_available() - else ("LlamaTokenizer" if is_sentencepiece_available() else None), - "LlamaTokenizerFast" if is_tokenizers_available() and not is_mistral_common_available() else None, - ), - ), - ( - "mistral3", - ( - "MistralCommonTokenizer" - if is_mistral_common_available() - else ("LlamaTokenizer" if is_sentencepiece_available() else None), - "LlamaTokenizerFast" if is_tokenizers_available() and not is_mistral_common_available() else None, - ), + "MistralCommonBackend" + if is_mistral_common_available() + else ("LlamaTokenizerFast" if is_tokenizers_available() else None), ), ( "mixtral", - ( - "MistralCommonTokenizer" - if is_mistral_common_available() - else ("LlamaTokenizer" if is_sentencepiece_available() else None), - "LlamaTokenizerFast" if is_tokenizers_available() and not is_mistral_common_available() else None, - ), - ), - ("mllama", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)), - ("mm-grounding-dino", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), - ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)), - ("modernbert", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("moonshine", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("moshi", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)), - ("mpt", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), - ("mra", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), - ( - "mt5", - ( - "MT5Tokenizer" if is_sentencepiece_available() else None, - "MT5TokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("musicgen", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)), - ("musicgen_melody", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)), - ("mvp", ("MvpTokenizer", "MvpTokenizerFast" if is_tokenizers_available() else None)), - ("myt5", ("MyT5Tokenizer", None)), - ("nanochat", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("nemotron", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ( - "nllb", - ( - "NllbTokenizer" if is_sentencepiece_available() else None, - "NllbTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "nllb-moe", - ( - "NllbTokenizer" if is_sentencepiece_available() else None, - "NllbTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("nougat", (None, "NougatTokenizerFast" if is_tokenizers_available() else None)), - ( - "nystromformer", - ( - "AlbertTokenizer" if is_sentencepiece_available() else None, - "AlbertTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("olmo", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), - ("olmo2", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), - ("olmo3", (None, "GPT2TokenizerFast" if is_tokenizers_available() else None)), - ("olmoe", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), - ( - "omdet-turbo", - ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None), - ), - ("oneformer", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), - ( - "openai-gpt", - ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None), - ), - ("opt", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), - ("ovis2", (None, "Qwen2TokenizerFast" if is_tokenizers_available() else None)), - ("owlv2", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), - ("owlvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), - ("paligemma", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("parakeet", (None, "ParakeetTokenizerFast" if is_tokenizers_available() else None)), - ( - "pegasus", - ( - "PegasusTokenizer" if is_sentencepiece_available() else None, - "PegasusTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "pegasus_x", - ( - "PegasusTokenizer" if is_sentencepiece_available() else None, - "PegasusTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "perceiver", - ( - "PerceiverTokenizer", - None, - ), - ), - ("perception_lm", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ( - "persimmon", - ( - "LlamaTokenizer" if is_sentencepiece_available() else None, - "LlamaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("phi", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)), - ("phi3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("phi4_multimodal", (None, "GPT2TokenizerFast" if is_tokenizers_available() else None)), - ("phimoe", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("phobert", ("PhobertTokenizer", None)), - ("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)), + "MistralCommonBackend" + if is_mistral_common_available() + else ("LlamaTokenizerFast" if is_tokenizers_available() else None), + ), + ("mllama", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("mluke", "MLukeTokenizer" if is_sentencepiece_available() else None), + ("mm-grounding-dino", "BertTokenizer" if is_tokenizers_available() else None), + ("mobilebert", "MobileBertTokenizer" if is_tokenizers_available() else None), + ("modernbert", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("moonshine", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("moshi", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("mpnet", "MPNetTokenizer" if is_tokenizers_available() else None), + ("mpt", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None), + ("mra", "RobertaTokenizer"), + ("mt5", "T5Tokenizer" if is_tokenizers_available() else None), + ("musicgen", "T5Tokenizer" if is_tokenizers_available() else None), + ("musicgen_melody", "T5Tokenizer" if is_tokenizers_available() else None), + ("mvp", "MvpTokenizer" if is_tokenizers_available() else None), + ("myt5", "MyT5Tokenizer"), + ("nemotron", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("nezha", "BertTokenizer" if is_tokenizers_available() else None), + ("nllb", "NllbTokenizer" if is_tokenizers_available() else None), + ("nllb-moe", "NllbTokenizer" if is_tokenizers_available() else None), + ("nougat", "NougatTokenizer" if is_tokenizers_available() else None), + ("nystromformer", "AlbertTokenizerFast" if is_tokenizers_available() else None), + ("olmo", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None), + ("olmo2", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None), + ("olmo3", "GPT2Tokenizer" if is_tokenizers_available() else None), + ("olmoe", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None), + ("omdet-turbo", "CLIPTokenizerFast" if is_tokenizers_available() else None), + ("oneformer", "CLIPTokenizerFast" if is_tokenizers_available() else None), + ("openai-gpt", "OpenAIGPTTokenizer" if is_tokenizers_available() else None), + ("opt", "GPT2Tokenizer" if is_tokenizers_available() else None), + ("ovis2", "Qwen2TokenizerFast" if is_tokenizers_available() else None), + ("owlv2", "CLIPTokenizerFast" if is_tokenizers_available() else None), + ("owlvit", "CLIPTokenizerFast" if is_tokenizers_available() else None), + ("paligemma", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("pegasus", "PegasusTokenizer" if is_tokenizers_available() else None), + ("pegasus_x", "PegasusTokenizer" if is_tokenizers_available() else None), + ("perceiver", "PerceiverTokenizer"), + ("persimmon", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("phi", "GPT2Tokenizer" if is_tokenizers_available() else None), + ("phi3", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("phimoe", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("phobert", "PhobertTokenizer"), + ("pix2struct", "T5Tokenizer" if is_tokenizers_available() else None), ( "pixtral", - ( - None, - "MistralCommonTokenizer" - if is_mistral_common_available() - else ("PreTrainedTokenizerFast" if is_tokenizers_available() else None), - ), - ), - ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)), - ("pop2piano", ("Pop2PianoTokenizer", None)), - ("prophetnet", ("ProphetNetTokenizer", None)), - ( - "qwen2", - ( - "Qwen2Tokenizer", - "Qwen2TokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("qwen2_5_omni", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), - ("qwen2_5_vl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), - ("qwen2_audio", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), - ( - "qwen2_moe", - ( - "Qwen2Tokenizer", - "Qwen2TokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("qwen2_vl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), - ( - "qwen3", - ( - "Qwen2Tokenizer", - "Qwen2TokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "qwen3_moe", - ( - "Qwen2Tokenizer", - "Qwen2TokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "qwen3_next", - ( - "Qwen2Tokenizer", - "Qwen2TokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("qwen3_omni_moe", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), - ("qwen3_vl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), - ("qwen3_vl_moe", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), - ("rag", ("RagTokenizer", None)), - ( - "recurrent_gemma", - ( - "GemmaTokenizer" if is_sentencepiece_available() else None, - "GemmaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "reformer", - ( - "ReformerTokenizer" if is_sentencepiece_available() else None, - "ReformerTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "rembert", - ( - "RemBertTokenizer" if is_sentencepiece_available() else None, - "RemBertTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), - ( - "roberta-prelayernorm", - ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None), - ), - ("roc_bert", ("RoCBertTokenizer", None)), - ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)), - ("rwkv", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), - ( - "seamless_m4t", - ( - "SeamlessM4TTokenizer" if is_sentencepiece_available() else None, - "SeamlessM4TTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "seamless_m4t_v2", - ( - "SeamlessM4TTokenizer" if is_sentencepiece_available() else None, - "SeamlessM4TTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "shieldgemma2", - ( - "GemmaTokenizer" if is_sentencepiece_available() else None, - "GemmaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("siglip", ("SiglipTokenizer" if is_sentencepiece_available() else None, None)), - ( - "siglip2", - ( - "GemmaTokenizer" if is_sentencepiece_available() else None, - "GemmaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("smollm3", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("smolvlm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), - ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)), - ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)), - ("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")), - ( - "squeezebert", - ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None), - ), - ("stablelm", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), - ("starcoder2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), - ( - "switch_transformers", - ( - "T5Tokenizer" if is_sentencepiece_available() else None, - "T5TokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "t5", - ( - "T5Tokenizer" if is_sentencepiece_available() else None, - "T5TokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "t5gemma", - ( - "GemmaTokenizer" if is_sentencepiece_available() else None, - "GemmaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("tapas", ("TapasTokenizer", None)), - ("trocr", ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)), - ("tvp", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), - ( - "udop", - ( - "UdopTokenizer" if is_sentencepiece_available() else None, - "UdopTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "umt5", - ( - "T5Tokenizer" if is_sentencepiece_available() else None, - "T5TokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("video_llama_3", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), - ("video_llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), - ("vipllava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), - ( - "vision_text_dual_encoder", - ("PreTrainedTokenizer", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), - ), - ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), - ("vits", ("VitsTokenizer", None)), + "MistralCommonBackend" + if is_mistral_common_available() + else ("PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ), + ("plbart", "PLBartTokenizer" if is_tokenizers_available() else None), + ("prophetnet", "ProphetNetTokenizer"), + ("qdqbert", "BertTokenizer" if is_tokenizers_available() else None), + ("qwen2", "Qwen2TokenizerFast" if is_tokenizers_available() else None), + ("qwen2_5_omni", "Qwen2TokenizerFast" if is_tokenizers_available() else None), + ("qwen2_5_vl", "Qwen2TokenizerFast" if is_tokenizers_available() else None), + ("qwen2_audio", "Qwen2TokenizerFast" if is_tokenizers_available() else None), + ("qwen2_moe", "Qwen2TokenizerFast" if is_tokenizers_available() else None), + ("qwen2_vl", "Qwen2TokenizerFast" if is_tokenizers_available() else None), + ("qwen3", "Qwen2TokenizerFast" if is_tokenizers_available() else None), + ("qwen3_moe", "Qwen2TokenizerFast" if is_tokenizers_available() else None), + ("qwen3_next", "Qwen2TokenizerFast" if is_tokenizers_available() else None), + ("qwen3_omni_moe", "Qwen2TokenizerFast" if is_tokenizers_available() else None), + ("qwen3_vl", "Qwen2TokenizerFast" if is_tokenizers_available() else None), + ("qwen3_vl_moe", "Qwen2TokenizerFast" if is_tokenizers_available() else None), + ("rag", "RagTokenizer"), + ("realm", "BertTokenizer" if is_tokenizers_available() else None), + ("recurrent_gemma", "GemmaTokenizerFast" if is_tokenizers_available() else None), + ("reformer", "ReformerTokenizer" if is_tokenizers_available() else None), + ("rembert", "RemBertTokenizer" if is_tokenizers_available() else None), + ("retribert", "BertTokenizer" if is_tokenizers_available() else None), + ("roberta", "RobertaTokenizer"), + ("roberta-prelayernorm", "RobertaTokenizer"), + ("roc_bert", "RoCBertTokenizer"), + ("roformer", "RoFormerTokenizerFast" if is_tokenizers_available() else None), + ("rwkv", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None), + ("seamless_m4t", "SeamlessM4TTokenizer" if is_tokenizers_available() else None), + ("seamless_m4t_v2", "SeamlessM4TTokenizer" if is_tokenizers_available() else None), + ("shieldgemma2", "GemmaTokenizerFast" if is_tokenizers_available() else None), + ("siglip", "SiglipTokenizer" if is_sentencepiece_available() else None), + ("siglip2", "GemmaTokenizerFast" if is_tokenizers_available() else None), + ("smollm3", "PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ("speech_to_text", "Speech2TextTokenizer" if is_sentencepiece_available() else None), + ("speecht5", "SpeechT5Tokenizer" if is_sentencepiece_available() else None), + ("splinter", "SplinterTokenizer"), + ("squeezebert", "BertTokenizer" if is_tokenizers_available() else None), + ("stablelm", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None), + ("starcoder2", "GPT2Tokenizer" if is_tokenizers_available() else None), + ("switch_transformers", "T5Tokenizer" if is_tokenizers_available() else None), + ("t5", "T5Tokenizer" if is_tokenizers_available() else None), + ("t5gemma", "GemmaTokenizerFast" if is_tokenizers_available() else None), + ("tapas", "TapasTokenizer"), + ("trocr", "XLMRobertaTokenizer" if is_tokenizers_available() else None), + ("tvp", "BertTokenizer" if is_tokenizers_available() else None), + ("udop", "UdopTokenizer" if is_tokenizers_available() else None), + ("umt5", "T5Tokenizer" if is_tokenizers_available() else None), + ("video_llava", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("vilt", "BertTokenizer" if is_tokenizers_available() else None), + ("vipllava", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("visual_bert", "BertTokenizer" if is_tokenizers_available() else None), + ("vits", "VitsTokenizer"), ( "voxtral", - ( - "MistralCommonTokenizer" if is_mistral_common_available() else None, - "PreTrainedTokenizerFast" if is_tokenizers_available() and not is_mistral_common_available() else None, - ), - ), - ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)), - ("wav2vec2-bert", ("Wav2Vec2CTCTokenizer", None)), - ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)), - ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)), - ("wav2vec2_with_lm", ("Wav2Vec2CTCTokenizer", None)), - ("whisper", ("WhisperTokenizer", "WhisperTokenizerFast" if is_tokenizers_available() else None)), - ("xclip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), - ( - "xglm", - ( - "XGLMTokenizer" if is_sentencepiece_available() else None, - "XGLMTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("xlm", ("XLMTokenizer", None)), - ( - "xlm-roberta", - ( - "XLMRobertaTokenizer" if is_sentencepiece_available() else None, - "XLMRobertaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "xlm-roberta-xl", - ( - "XLMRobertaTokenizer" if is_sentencepiece_available() else None, - "XLMRobertaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "xlnet", - ( - "XLNetTokenizer" if is_sentencepiece_available() else None, - "XLNetTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("xlstm", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), - ( - "xmod", - ( - "XLMRobertaTokenizer" if is_sentencepiece_available() else None, - "XLMRobertaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "yoso", - ( - "AlbertTokenizer" if is_sentencepiece_available() else None, - "AlbertTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "zamba", - ( - "LlamaTokenizer" if is_sentencepiece_available() else None, - "LlamaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "zamba2", - ( - "LlamaTokenizer" if is_sentencepiece_available() else None, - "LlamaTokenizerFast" if is_tokenizers_available() else None, - ), - ), + "MistralCommonBackend" + if is_mistral_common_available() + else ("PreTrainedTokenizerFast" if is_tokenizers_available() else None), + ), + ("wav2vec2", "Wav2Vec2CTCTokenizer"), + ("wav2vec2-bert", "Wav2Vec2CTCTokenizer"), + ("wav2vec2-conformer", "Wav2Vec2CTCTokenizer"), + ("wav2vec2_phoneme", "Wav2Vec2PhonemeCTCTokenizer"), + ("whisper", "WhisperTokenizer" if is_tokenizers_available() else None), + ("xclip", "CLIPTokenizerFast" if is_tokenizers_available() else None), + ("xglm", "XGLMTokenizer" if is_tokenizers_available() else None), + ("xlm", "XLMTokenizer"), + ("xlm-roberta", "XLMRobertaTokenizer" if is_tokenizers_available() else None), + ("xlm-roberta-xl", "XLMRobertaTokenizer" if is_tokenizers_available() else None), + ("xlnet", "XLNetTokenizer" if is_tokenizers_available() else None), + ("xlstm", "GPTNeoXTokenizerFast" if is_tokenizers_available() else None), + ("xmod", "XLMRobertaTokenizerFast" if is_tokenizers_available() else None), + ("yoso", "AlbertTokenizer" if is_tokenizers_available() else None), + ("zamba", "LlamaTokenizerFast" if is_tokenizers_available() else None), + ("zamba2", "LlamaTokenizerFast" if is_tokenizers_available() else None), ] ) @@ -826,14 +353,38 @@ CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()} +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + with open(vocab_file, "r", encoding="utf-8") as reader: + return json.load(reader) + + +def load_merges(merges_file): + """Loads a merges file into a list.""" + merges = [] + with open(merges_file, "r", encoding="utf-8") as reader: + for line in reader: + line = line.strip() + if line and not line.startswith("#"): + merges.append(tuple(line.split())) + return merges + + def tokenizer_class_from_name(class_name: str) -> Union[type[Any], None]: + if class_name in REGISTERED_FAST_ALIASES: + return REGISTERED_FAST_ALIASES[class_name] + + if class_name in REGISTERED_TOKENIZER_CLASSES: + return REGISTERED_TOKENIZER_CLASSES[class_name] + if class_name == "PreTrainedTokenizerFast": - return PreTrainedTokenizerFast + return TokenizersBackend - for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items(): - if class_name in tokenizers: + # V5: TOKENIZER_MAPPING_NAMES now maps to single strings, not tuples + for module_name, tokenizer_class in TOKENIZER_MAPPING_NAMES.items(): + if tokenizer_class == class_name: module_name = model_type_to_module_name(module_name) - if module_name in ["mistral", "mixtral", "ministral"] and class_name == "MistralCommonTokenizer": + if module_name in ["mistral", "mixtral", "ministral"] and class_name == "MistralCommonBackend": module = importlib.import_module(".tokenization_mistral_common", "transformers") else: module = importlib.import_module(f".{module_name}", "transformers.models") @@ -842,11 +393,11 @@ def tokenizer_class_from_name(class_name: str) -> Union[type[Any], None]: except AttributeError: continue - for tokenizers in TOKENIZER_MAPPING._extra_content.values(): - for tokenizer in tokenizers: - if getattr(tokenizer, "__name__", None) == class_name: - return tokenizer + for tokenizer in TOKENIZER_MAPPING._extra_content.values(): + if getattr(tokenizer, "__name__", None) == class_name: + return tokenizer + # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main # init and we return the proper dummy to get an appropriate error message. main_module = importlib.import_module("transformers") @@ -856,6 +407,402 @@ def tokenizer_class_from_name(class_name: str) -> Union[type[Any], None]: return None +def _find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs): + # Delegate to shared helper to avoid duplication + return find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs) + + +def _load_tokenizers_backend(tokenizer_class, pretrained_model_name_or_path, inputs, kwargs): + """ + Load a tokenizer using only the tokenizers backend (no SentencePiece fallback). + + This function attempts to load with the following priority: + 1. If tokenizer.json exists, load directly + 2. If any .model file (SPM) exists, try extracting vocab and merges + 3. If vocab.json and merges.txt exist, load with those + 4. If vocab.txt exists (WordPiece models), load with that + + Args: + tokenizer_class: The tokenizer class to instantiate + pretrained_model_name_or_path: Path or model id + inputs: Additional positional arguments for tokenizer init + kwargs: Additional keyword arguments + + Returns: + An instantiated tokenizer object + + Raises: + ValueError: If tokenizer could not be loaded with tokenizers backend + """ + files_loaded = [] + + # Try tokenizer.json first + try: + tokenizer_json_exists = has_file( + pretrained_model_name_or_path, + "tokenizer.json", + revision=kwargs.get("revision"), + token=kwargs.get("token"), + cache_dir=kwargs.get("cache_dir"), + local_files_only=kwargs.get("local_files_only", False), + ) + except Exception: + tokenizer_json_exists = False + + if tokenizer_json_exists: + files_loaded.append("tokenizer.json") + kwargs["backend"] = "tokenizers" + kwargs["files_loaded"] = files_loaded + # Some old models have uploaded a tokenizer.json but haven't updated tokenizer_config.json to point to the correct tokenizer class + tokenizer_class = ( + TokenizersBackend + if tokenizer_class.__name__ in ("PythonBackend", "PreTrainedTokenizer") + else tokenizer_class + ) + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + + # Try tekken.json (Mistral format) + try: + if has_file( + pretrained_model_name_or_path, + "tekken.json", + revision=kwargs.get("revision"), + token=kwargs.get("token"), + cache_dir=kwargs.get("cache_dir"), + local_files_only=kwargs.get("local_files_only", False), + ): + from ...integrations.mistral import convert_tekken_tokenizer + + tekken_file = cached_file( + pretrained_model_name_or_path, + "tekken.json", + **{ + k: v + for k, v in kwargs.items() + if k + in ["cache_dir", "force_download", "proxies", "token", "revision", "local_files_only", "subfolder"] + }, + ) + if tekken_file is not None: + files_loaded.append("tekken.json") + kwargs["backend"] = "tokenizers" + kwargs["files_loaded"] = files_loaded + return convert_tekken_tokenizer(tekken_file) + except (ImportError, Exception): + pass + + # Try extracting from SentencePiece model + spm_file = _find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs) + if spm_file is not None: + try: + resolved_spm = cached_file( + pretrained_model_name_or_path, + spm_file, + cache_dir=kwargs.get("cache_dir"), + force_download=kwargs.get("force_download", False), + proxies=kwargs.get("proxies"), + token=kwargs.get("token"), + revision=kwargs.get("revision"), + local_files_only=kwargs.get("local_files_only", False), + subfolder=kwargs.get("subfolder", ""), + ) + except Exception: + resolved_spm = None + + if resolved_spm is not None: + try: + from ...tokenization_utils_sentencepiece import SentencePieceExtractor + + fast_sig = inspect.signature(getattr(tokenizer_class, "__init__", tokenizer_class)) + if "vocab" in fast_sig.parameters: + try: + vocab_ids, vocab_scores, merges = SentencePieceExtractor(resolved_spm).extract() + files_loaded.append(spm_file) + kwargs["backend"] = "tokenizers" + kwargs["files_loaded"] = files_loaded + # If tokenizer needs both vocab and merges (BPE models) + if "merges" in fast_sig.parameters: + return tokenizer_class.from_pretrained( + pretrained_model_name_or_path, *inputs, vocab=vocab_scores, merges=merges, **kwargs + ) + # If tokenizer only needs vocab (Unigram models like NLLB, SeamlessM4T) + else: + return tokenizer_class.from_pretrained( + pretrained_model_name_or_path, *inputs, vocab=vocab_scores, **kwargs + ) + except Exception: + pass + except ImportError as e: + if "sentencepiece" in str(e).lower() or "SentencePiece" in str(e): + raise ImportError( + f"This checkpoint only contains a SentencePiece model file ({spm_file}), but the `sentencepiece` library is not installed. " + f"Please install sentencepiece to load this tokenizer: `pip install sentencepiece`" + ) from e + raise + except Exception: + pass + + vocab, merges, loaded = load_vocab_and_merges(pretrained_model_name_or_path, **kwargs) + if vocab is not None: + files_loaded.extend(loaded) + if issubclass(tokenizer_class, PreTrainedTokenizer): + kwargs["backend"] = "python" + else: + kwargs["backend"] = "tokenizers" + kwargs["files_loaded"] = files_loaded + if merges is not None: + return tokenizer_class.from_pretrained( + pretrained_model_name_or_path, *inputs, vocab=vocab, merges=merges, **kwargs + ) + else: + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, vocab=vocab, **kwargs) + + # Try vocab.txt (WordPiece models like SplinterTokenizer) + try: + resolved_vocab_txt = cached_file( + pretrained_model_name_or_path, + "vocab.txt", + cache_dir=kwargs.get("cache_dir"), + force_download=kwargs.get("force_download", False), + proxies=kwargs.get("proxies"), + token=kwargs.get("token"), + revision=kwargs.get("revision"), + local_files_only=kwargs.get("local_files_only", False), + subfolder=kwargs.get("subfolder", ""), + ) + except Exception: + resolved_vocab_txt = None + + if resolved_vocab_txt is not None: + try: + fast_sig = inspect.signature(getattr(tokenizer_class, "__init__", tokenizer_class)) + if "vocab" in fast_sig.parameters: + # Load vocab.txt: each line is a token, line number is the ID + vocab = OrderedDict() + with open(resolved_vocab_txt, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + files_loaded.append("vocab.txt") + kwargs["backend"] = "tokenizers" + kwargs["files_loaded"] = files_loaded + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, vocab=vocab, **kwargs) + except Exception: + pass + + # If all methods failed, raise an error + raise ValueError( + f"Could not load tokenizer from {pretrained_model_name_or_path} using tokenizers backend. " + "No tokenizer.json, tekken.json, vocab.json+merges.txt, vocab.txt, or compatible SentencePiece model found." + ) + + +def _try_load_tokenizer_with_fallbacks(tokenizer_class, pretrained_model_name_or_path, inputs, kwargs): + """ + Try to load a tokenizer with backend selection. + + This function routes to the appropriate backend based on the 'backend' parameter: + - "tokenizers" (default): Uses HuggingFace tokenizers library backend + - "sentencepiece": Uses SentencePiece backend + + For the tokenizers backend, attempts to load with the following priority: + 1. If tokenizer.json exists, load directly + 2. If any .model file (SPM) exists, try extracting vocab and merges + 3. If vocab.json and merges.txt exist, load with those + 4. Fallback to SentencePieceBackend if available + + Args: + tokenizer_class: The tokenizer class to instantiate (can be None) + pretrained_model_name_or_path: Path or model id + inputs: Additional positional arguments for tokenizer init + kwargs: Additional keyword arguments (may include 'backend' parameter, defaults to "tokenizers") + + Returns: + An instantiated tokenizer object + + Raises: + ValueError: If no tokenizer could be loaded + """ + # Extract the backend parameter - default to "tokenizers" to prioritize tokenizers backend + backend = kwargs.pop("backend", "tokenizers") + + # Validate backend parameter + if backend not in ["sentencepiece", "tokenizers"]: + logger.warning( + f"Invalid backend '{backend}' specified. Valid options are 'tokenizers' or 'sentencepiece'. " + "Defaulting to 'tokenizers' backend." + ) + backend = "tokenizers" + + # Route to SentencePiece backend if requested + if backend == "sentencepiece": + if SentencePieceBackend is None: + raise ValueError( + "SentencePiece backend was requested but sentencepiece is not installed. " + "Please install it with: pip install sentencepiece" + ) + logger.info("Loading tokenizer with SentencePiece backend") + # Track files loaded for SentencePiece backend + spm_file = _find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs) + files_loaded = [spm_file] if spm_file else [] + kwargs["backend"] = "sentencepiece" + kwargs["files_loaded"] = files_loaded + # Resolve the SPM file path and pass it as vocab_file + if spm_file is not None: + resolved_vocab_file = cached_file( + pretrained_model_name_or_path, + spm_file, + cache_dir=kwargs.get("cache_dir"), + force_download=kwargs.get("force_download", False), + proxies=kwargs.get("proxies"), + token=kwargs.get("token"), + revision=kwargs.get("revision"), + local_files_only=kwargs.get("local_files_only", False), + subfolder=kwargs.get("subfolder", ""), + ) + kwargs["vocab_file"] = resolved_vocab_file + if isinstance(tokenizer_class, type) and issubclass(tokenizer_class, SentencePieceBackend): + logger.info("Loading tokenizer with SentencePiece backend using tokenizer class") + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + return SentencePieceBackend.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + + # Route to tokenizers backend (default) + if backend == "tokenizers": + if tokenizer_class is not None: + # Check if tokenizer_class inherits from PreTrainedTokenizer (but not from TokenizersBackend/SentencePieceBackend) + # These are edge cases with custom logic (e.g., BioGptTokenizer with Moses tokenization) + from ...tokenization_python import PreTrainedTokenizer + + # Build list of backend classes to check against + backend_classes = [TokenizersBackend] if TokenizersBackend else [] + if SentencePieceBackend: + backend_classes.append(SentencePieceBackend) + + # Check if it's a custom PreTrainedTokenizer (not a backend class) + is_custom_pre_trained = ( + isinstance(tokenizer_class, type) + and issubclass(tokenizer_class, PreTrainedTokenizer) + and not any(issubclass(tokenizer_class, bc) for bc in backend_classes) + and tokenizer_class.__name__ not in ("PythonBackend", "PreTrainedTokenizer") + ) + + # Check if it's a completely custom tokenizer (not PreTrainedTokenizer, not backend class) + # e.g., MistralCommonBackend which has its own from_pretrained logic + inherits_from_backend = isinstance(tokenizer_class, type) and any( + bc and issubclass(tokenizer_class, bc) for bc in backend_classes + ) + is_completely_custom = ( + isinstance(tokenizer_class, type) + and not issubclass(tokenizer_class, PythonBackend) + and not inherits_from_backend + ) + + if is_custom_pre_trained: + logger.info("Loading tokenizer with custom PreTrainedTokenizer backend (edge case)") + # Track the backend type for custom tokenizers + kwargs["backend"] = "custom" + kwargs["files_loaded"] = [] # Custom tokenizers may load various files + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + + if is_completely_custom: + # For completely custom tokenizers (like MistralCommonBackend), try calling from_pretrained directly + logger.info("Loading tokenizer with custom tokenizer class (non-PreTrainedTokenizer)") + # Filter out AutoTokenizer-specific kwargs that custom tokenizers don't accept + custom_kwargs = {k: v for k, v in kwargs.items() if k not in ["backend", "files_loaded"]} + custom_kwargs["_from_auto"] = True # Signal that this is called from AutoTokenizer + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **custom_kwargs) + + if TokenizersBackend is None: + raise ValueError( + "Tokenizers backend is the default but tokenizers library is not installed. " + "Please install it with: pip install tokenizers" + ) + logger.info("Loading tokenizer with tokenizers backend") + try: + return _load_tokenizers_backend(tokenizer_class, pretrained_model_name_or_path, inputs, kwargs) + except ValueError as e: + # If tokenizers backend fails, try falling back to SentencePiece backend if available + spm_file = _find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs) + if spm_file is not None and SentencePieceBackend is not None: + logger.info( + f"Tokenizers backend failed: {e}. " + f"Falling back to SentencePieceBackend since {spm_file} file was found." + ) + files_loaded = [spm_file] + kwargs["backend"] = "sentencepiece" + kwargs["files_loaded"] = files_loaded + # Resolve the SPM file path and pass it as vocab_file + resolved_vocab_file = cached_file( + pretrained_model_name_or_path, + spm_file, + cache_dir=kwargs.get("cache_dir"), + force_download=kwargs.get("force_download", False), + proxies=kwargs.get("proxies"), + token=kwargs.get("token"), + revision=kwargs.get("revision"), + local_files_only=kwargs.get("local_files_only", False), + subfolder=kwargs.get("subfolder", ""), + ) + kwargs["vocab_file"] = resolved_vocab_file + if tokenizer_class is not None and issubclass(tokenizer_class, SentencePieceBackend): + logger.info( + "Falling back to SentencePiece backend using tokenizer class that inherits from SentencePieceBackend." + ) + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + return SentencePieceBackend.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + # If no fallback available, try calling tokenizer class directly as last resort + if hasattr(tokenizer_class, "from_pretrained"): + logger.info( + f"Tokenizers backend failed: {e}. Trying to load tokenizer directly from tokenizer class." + ) + # Filter out AutoTokenizer-specific kwargs that custom tokenizers don't accept + custom_kwargs = {k: v for k, v in kwargs.items() if k not in ["backend", "files_loaded"]} + custom_kwargs["_from_auto"] = True # Signal that this is called from AutoTokenizer + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **custom_kwargs) + # Re-raise if no fallback options available + raise + + # If no tokenizer class but tokenizers backend requested, fall back to SentencePiece if available + spm_file = _find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs) + if spm_file is not None and SentencePieceBackend is not None: + logger.info( + f"Tokenizers backend was requested but no tokenizer class found. " + f"Falling back to SentencePieceBackend since {spm_file} file was found." + ) + files_loaded = [spm_file] + kwargs["backend"] = "sentencepiece" + kwargs["files_loaded"] = files_loaded + # Resolve the SPM file path and pass it as vocab_file + resolved_vocab_file = cached_file( + pretrained_model_name_or_path, + spm_file, + cache_dir=kwargs.get("cache_dir"), + force_download=kwargs.get("force_download", False), + proxies=kwargs.get("proxies"), + token=kwargs.get("token"), + revision=kwargs.get("revision"), + local_files_only=kwargs.get("local_files_only", False), + subfolder=kwargs.get("subfolder", ""), + ) + kwargs["vocab_file"] = resolved_vocab_file + if ( + tokenizer_class is not None + and SentencePieceBackend is not None + and issubclass(tokenizer_class, SentencePieceBackend) + ): + logger.info( + "Falling back to SentencePiece backend using tokenizer class that inherits from SentencePieceBackend." + ) + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + return SentencePieceBackend.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + + raise ValueError( + f"Could not load tokenizer from {pretrained_model_name_or_path}. " + "No tokenizer class could be determined and no SentencePiece model found." + ) + + def get_tokenizer_config( pretrained_model_name_or_path: Union[str, os.PathLike[str]], cache_dir: Optional[Union[str, os.PathLike[str]]] = None, @@ -970,7 +917,7 @@ def __init__(self): @replace_list_option_in_docstrings(TOKENIZER_MAPPING_NAMES) def from_pretrained( cls, pretrained_model_name_or_path, *inputs, **kwargs - ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + ) -> Union[TokenizersBackend, SentencePieceBackend]: r""" Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary. @@ -1010,12 +957,12 @@ def from_pretrained( subfolder (`str`, *optional*): In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for facebook/rag-token-base), specify it here. - use_fast (`bool`, *optional*, defaults to `True`): - Use a [fast Rust-based tokenizer](https://huggingface.co/docs/tokenizers/index) if it is supported for - a given model. If a fast tokenizer is not available for a given model, a normal Python-based tokenizer - is returned instead. tokenizer_type (`str`, *optional*): Tokenizer type to be loaded. + backend (`str`, *optional*, defaults to `"tokenizers"`): + Backend to use for tokenization. Valid options are: + - `"tokenizers"`: Use the HuggingFace tokenizers library backend (default) + - `"sentencepiece"`: Use the SentencePiece backend trust_remote_code (`bool`, *optional*, defaults to `False`): Whether or not to allow for custom models defined on the Hub in their own modeling files. This option should only be set to `True` for repositories you trust and in which you have read the code, as it will @@ -1041,38 +988,45 @@ def from_pretrained( >>> # Download vocabulary from huggingface.co and define model-specific arguments >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base", add_prefix_space=True) + + >>> # Explicitly use the tokenizers backend + >>> tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer", backend="tokenizers") + + >>> # Explicitly use the sentencepiece backend + >>> tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer", backend="sentencepiece") ```""" + use_auth_token = kwargs.pop("use_auth_token", None) + if use_auth_token is not None: + logger.warning( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if kwargs.get("token") is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + kwargs["token"] = use_auth_token + config = kwargs.pop("config", None) kwargs["_from_auto"] = True - use_fast = kwargs.pop("use_fast", True) + # V5: Always use fast tokenizers, ignore use_fast parameter + _ = kwargs.pop("use_fast", None) tokenizer_type = kwargs.pop("tokenizer_type", None) trust_remote_code = kwargs.pop("trust_remote_code", None) gguf_file = kwargs.get("gguf_file") # First, let's see whether the tokenizer_type is passed so that we can leverage it if tokenizer_type is not None: - tokenizer_class = None - tokenizer_class_tuple = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None) + tokenizer_class_name = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None) - if tokenizer_class_tuple is None: + if tokenizer_class_name is None: raise ValueError( f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of " f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES)}." ) - tokenizer_class_name, tokenizer_fast_class_name = tokenizer_class_tuple - - if use_fast: - if tokenizer_fast_class_name is not None: - tokenizer_class = tokenizer_class_from_name(tokenizer_fast_class_name) - else: - logger.warning( - "`use_fast` is set to `True` but the tokenizer class does not have a fast version. " - " Falling back to the slow version." - ) - if tokenizer_class is None: - tokenizer_class = tokenizer_class_from_name(tokenizer_class_name) + tokenizer_class = tokenizer_class_from_name(tokenizer_class_name) if tokenizer_class is None: raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.") @@ -1107,6 +1061,13 @@ def from_pretrained( if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map: tokenizer_auto_map = config.auto_map["AutoTokenizer"] + if ( + config_tokenizer_class is not None + and config_tokenizer_class != "PreTrainedTokenizerFast" + and "Fast" in config_tokenizer_class + ): + config_tokenizer_class = config_tokenizer_class[:-4] + has_remote_code = tokenizer_auto_map is not None has_local_code = type(config) in TOKENIZER_MAPPING or ( config_tokenizer_class is not None @@ -1116,7 +1077,8 @@ def from_pretrained( ) ) if has_remote_code: - if use_fast and tokenizer_auto_map[1] is not None: + # V5: Always prefer fast tokenizer (index 1), fallback to slow (index 0) + if tokenizer_auto_map[1] is not None: class_ref = tokenizer_auto_map[1] else: class_ref = tokenizer_auto_map[0] @@ -1136,18 +1098,16 @@ def from_pretrained( pretrained_model_name_or_path, *inputs, trust_remote_code=trust_remote_code, **kwargs ) elif config_tokenizer_class is not None: - tokenizer_class = None - if use_fast and not config_tokenizer_class.endswith("Fast"): - tokenizer_class_candidate = f"{config_tokenizer_class}Fast" - tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate) - if tokenizer_class is None: + fast_tokenizer_class = None + if fast_tokenizer_class is None: tokenizer_class_candidate = config_tokenizer_class tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate) - if tokenizer_class is None: - raise ValueError( - f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported." - ) - return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + if tokenizer_class is None and not tokenizer_class_candidate.endswith("Fast"): + tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate + "Fast") + else: + tokenizer_class = fast_tokenizer_class + + return _try_load_tokenizer_with_fallbacks(tokenizer_class, pretrained_model_name_or_path, inputs, kwargs) # Otherwise we have to be creative. # if model is an encoder decoder, the encoder tokenizer class is used by default @@ -1163,18 +1123,17 @@ def from_pretrained( model_type = config_class_to_model_type(type(config).__name__) if model_type is not None: - tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)] + tokenizer_class = TOKENIZER_MAPPING[type(config)] - if tokenizer_class_fast and (use_fast or tokenizer_class_py is None): - return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + if tokenizer_class is not None: + return _try_load_tokenizer_with_fallbacks( + tokenizer_class, pretrained_model_name_or_path, inputs, kwargs + ) else: - if tokenizer_class_py is not None: - return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - else: - raise ValueError( - "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed " - "in order to use this tokenizer." - ) + raise ValueError( + "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed " + "in order to use this tokenizer." + ) raise ValueError( f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n" @@ -1182,48 +1141,36 @@ def from_pretrained( ) @staticmethod - def register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None, exist_ok=False): + def register( + config_class, tokenizer_class=None, slow_tokenizer_class=None, fast_tokenizer_class=None, exist_ok=False + ): """ Register a new tokenizer in this mapping. - Args: config_class ([`PreTrainedConfig`]): The configuration corresponding to the model to register. - slow_tokenizer_class ([`PretrainedTokenizer`], *optional*): - The slow tokenizer to register. - fast_tokenizer_class ([`PreTrainedTokenizerFast`], *optional*): - The fast tokenizer to register. + tokenizer_class: The tokenizer class to register (V5 - preferred parameter). + slow_tokenizer_class: (Deprecated) The slow tokenizer to register. + fast_tokenizer_class: (Deprecated) The fast tokenizer to register. """ - if slow_tokenizer_class is None and fast_tokenizer_class is None: - raise ValueError("You need to pass either a `slow_tokenizer_class` or a `fast_tokenizer_class") - if slow_tokenizer_class is not None and issubclass(slow_tokenizer_class, PreTrainedTokenizerFast): - raise ValueError("You passed a fast tokenizer in the `slow_tokenizer_class`.") - if fast_tokenizer_class is not None and issubclass(fast_tokenizer_class, PreTrainedTokenizer): - raise ValueError("You passed a slow tokenizer in the `fast_tokenizer_class`.") + if tokenizer_class is None: + # Legacy: prefer fast over slow + if fast_tokenizer_class is not None: + tokenizer_class = fast_tokenizer_class + elif slow_tokenizer_class is not None: + tokenizer_class = slow_tokenizer_class + else: + raise ValueError("You need to pass a `tokenizer_class`") - if ( - slow_tokenizer_class is not None - and fast_tokenizer_class is not None - and issubclass(fast_tokenizer_class, PreTrainedTokenizerFast) - and fast_tokenizer_class.slow_tokenizer_class != slow_tokenizer_class - ): - raise ValueError( - "The fast tokenizer class you are passing has a `slow_tokenizer_class` attribute that is not " - "consistent with the slow tokenizer class you passed (fast tokenizer has " - f"{fast_tokenizer_class.slow_tokenizer_class} and you passed {slow_tokenizer_class}. Fix one of those " - "so they match!" - ) + for candidate in (slow_tokenizer_class, fast_tokenizer_class, tokenizer_class): + if candidate is not None: + REGISTERED_TOKENIZER_CLASSES[candidate.__name__] = candidate - # Avoid resetting a set slow/fast tokenizer if we are passing just the other ones. - if config_class in TOKENIZER_MAPPING._extra_content: - existing_slow, existing_fast = TOKENIZER_MAPPING[config_class] - if slow_tokenizer_class is None: - slow_tokenizer_class = existing_slow - if fast_tokenizer_class is None: - fast_tokenizer_class = existing_fast + if slow_tokenizer_class is not None and fast_tokenizer_class is not None: + REGISTERED_FAST_ALIASES[slow_tokenizer_class.__name__] = fast_tokenizer_class - TOKENIZER_MAPPING.register(config_class, (slow_tokenizer_class, fast_tokenizer_class), exist_ok=exist_ok) + TOKENIZER_MAPPING.register(config_class, tokenizer_class, exist_ok=exist_ok) __all__ = ["TOKENIZER_MAPPING", "AutoTokenizer"] diff --git a/src/transformers/models/bart/__init__.py b/src/transformers/models/bart/__init__.py index d268fb7d2b86..3cf9c3451895 100644 --- a/src/transformers/models/bart/__init__.py +++ b/src/transformers/models/bart/__init__.py @@ -18,10 +18,9 @@ if TYPE_CHECKING: + from ..roberta.tokenization_roberta import RobertaTokenizer as BartTokenizer from .configuration_bart import * from .modeling_bart import * - from .tokenization_bart import * - from .tokenization_bart_fast import * else: import sys diff --git a/src/transformers/models/bart/tokenization_bart.py b/src/transformers/models/bart/tokenization_bart.py index f674afe1a412..208d4e39131e 100644 --- a/src/transformers/models/bart/tokenization_bart.py +++ b/src/transformers/models/bart/tokenization_bart.py @@ -1,393 +1,23 @@ -# coding=utf-8 -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# Copyright 2024 The HuggingFace Team. All rights reserved. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -from functools import lru_cache -from typing import Optional - -import regex as re - -from ...tokenization_utils import AddedToken, PreTrainedTokenizer -from ...utils import logging - - -logger = logging.get_logger(__name__) - - -VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} - -# See all BART models at https://huggingface.co/models?filter=bart - - -@lru_cache -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control - characters the bpe code barfs on. - - The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab - if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for - decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup - tables between utf-8 bytes and unicode strings. - """ - bs = ( - list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) - ) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8 + n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - - -def get_pairs(word): - """ - Return set of symbol pairs in a word. - - Word is represented as tuple of symbols (symbols being variable-length strings). - """ - pairs = set() - prev_char = word[0] - for char in word[1:]: - pairs.add((prev_char, char)) - prev_char = char - return pairs - - -class BartTokenizer(PreTrainedTokenizer): - """ - Constructs a BART tokenizer, which is smilar to the ROBERTa tokenizer, using byte-level Byte-Pair-Encoding. - - This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will - be encoded differently whether it is at the beginning of the sentence (without space) or not: - - ```python - >>> from transformers import BartTokenizer - - >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-base") - >>> tokenizer("Hello world")["input_ids"] - [0, 31414, 232, 2] - - >>> tokenizer(" Hello world")["input_ids"] - [0, 20920, 232, 2] - ``` - - You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you - call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. - - - - When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one). - - - - This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to - this superclass for more information regarding those methods. - - Args: - vocab_file (`str`): - Path to the vocabulary file. - merges_file (`str`): - Path to the merges file. - errors (`str`, *optional*, defaults to `"replace"`): - Paradigm to follow when decoding bytes to UTF-8. See - [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. - bos_token (`str`, *optional*, defaults to `""`): - The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. - - - - When building a sequence using special tokens, this is not the token that is used for the beginning of - sequence. The token used is the `cls_token`. - - - - eos_token (`str`, *optional*, defaults to `""`): - The end of sequence token. - - - - When building a sequence using special tokens, this is not the token that is used for the end of sequence. - The token used is the `sep_token`. - - - - sep_token (`str`, *optional*, defaults to `""`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for - sequence classification or for a text and a question for question answering. It is also used as the last - token of a sequence built with special tokens. - cls_token (`str`, *optional*, defaults to `""`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. - unk_token (`str`, *optional*, defaults to `""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - pad_token (`str`, *optional*, defaults to `""`): - The token used for padding, for example when batching sequences of different lengths. - mask_token (`str`, *optional*, defaults to `""`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - add_prefix_space (`bool`, *optional*, defaults to `False`): - Whether or not to add an initial space to the input. This allows to treat the leading word just as any - other word. (BART tokenizer detect beginning of words by the preceding space). - """ - - vocab_files_names = VOCAB_FILES_NAMES - model_input_names = ["input_ids", "attention_mask"] - - def __init__( - self, - vocab_file, - merges_file, - errors="replace", - bos_token="", - eos_token="", - sep_token="", - cls_token="", - unk_token="", - pad_token="", - mask_token="", - add_prefix_space=False, - **kwargs, - ): - bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token - eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token - sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token - cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token - unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token - pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token - - # Mask token behave like a normal word, i.e. include the space before it - mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - - with open(vocab_file, encoding="utf-8") as vocab_handle: - self.encoder = json.load(vocab_handle) - self.decoder = {v: k for k, v in self.encoder.items()} - self.errors = errors # how to handle errors in decoding - self.byte_encoder = bytes_to_unicode() - self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} - with open(merges_file, encoding="utf-8") as merges_handle: - bpe_merges = merges_handle.read().split("\n")[1:-1] - bpe_merges = [tuple(merge.split()) for merge in bpe_merges] - self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) - self.cache = {} - self.add_prefix_space = add_prefix_space - - # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions - self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") - - super().__init__( - errors=errors, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - add_prefix_space=add_prefix_space, - **kwargs, - ) - - @property - def vocab_size(self): - return len(self.encoder) - - def get_vocab(self): - return dict(self.encoder, **self.added_tokens_encoder) - - def bpe(self, token): - if token in self.cache: - return self.cache[token] - word = tuple(token) - pairs = get_pairs(word) - - if not pairs: - return token - - while True: - bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) - if bigram not in self.bpe_ranks: - break - first, second = bigram - new_word = [] - i = 0 - while i < len(word): - try: - j = word.index(first, i) - except ValueError: - new_word.extend(word[i:]) - break - else: - new_word.extend(word[i:j]) - i = j - - if word[i] == first and i < len(word) - 1 and word[i + 1] == second: - new_word.append(first + second) - i += 2 - else: - new_word.append(word[i]) - i += 1 - new_word = tuple(new_word) - word = new_word - if len(word) == 1: - break - else: - pairs = get_pairs(word) - word = " ".join(word) - self.cache[token] = word - return word - - def _tokenize(self, text): - """Tokenize a string.""" - bpe_tokens = [] - for token in re.findall(self.pat, text): - token = "".join( - self.byte_encoder[b] for b in token.encode("utf-8") - ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) - bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) - return bpe_tokens - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - return self.encoder.get(token, self.encoder.get(self.unk_token)) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - return self.decoder.get(index) - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (string) in a single string.""" - text = "".join(tokens) - text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) - return text - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: - if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") - return - vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - merge_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] - ) - - with open(vocab_file, "w", encoding="utf-8") as f: - f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") - - index = 0 - with open(merge_file, "w", encoding="utf-8") as writer: - writer.write("#version: 0.2\n") - for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): - if index != token_index: - logger.warning( - f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." - " Please check that the tokenizer is not corrupted!" - ) - index = token_index - writer.write(" ".join(bpe_tokens) + "\n") - index += 1 - - return vocab_file, merge_file - - def build_inputs_with_special_tokens( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A BART sequence has the following format: - - - single sequence: ` X ` - - pair of sequences: ` A B ` - - Args: - token_ids_0 (`list[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + sep + token_ids_1 + sep - - def get_special_tokens_mask( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False - ) -> list[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`list[int]`): - List of IDs. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] - - def create_token_type_ids_from_sequences( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. BART does not - make use of token type ids, therefore a list of zeros is returned. +# This source code is licensed under the Apache 2.0 license found in the +# LICENSE file in the root directory of this source tree. - Args: - token_ids_0 (`list[int]`): - List of IDs. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. +""" +Compatibility shims for BART tokenizers in v5. - Returns: - `list[int]`: List of zeros. - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] +In v5 we consolidate on the tokenizers-library backend and remove separate +"slow" vs "fast" implementations. BART uses the same byte-level BPE +tokenizer as RoBERTa, so we expose `BartTokenizer` and `BartTokenizerFast` +as aliases to `RobertaTokenizer` to preserve the public API expected by +existing code and tests. +""" - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] +from ..roberta.tokenization_roberta import RobertaTokenizer as _RobertaTokenizer - def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): - add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) - if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): - text = " " + text - return (text, kwargs) +# Public aliases maintained for backwards compatibility +BartTokenizer = _RobertaTokenizer +BartTokenizerFast = _RobertaTokenizer -__all__ = ["BartTokenizer"] +__all__ = ["BartTokenizer", "BartTokenizerFast"] diff --git a/src/transformers/models/bart/tokenization_bart_fast.py b/src/transformers/models/bart/tokenization_bart_fast.py deleted file mode 100644 index 88b002f59529..000000000000 --- a/src/transformers/models/bart/tokenization_bart_fast.py +++ /dev/null @@ -1,271 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -from typing import Optional - -from tokenizers import processors - -from ...tokenization_utils_base import AddedToken, BatchEncoding -from ...tokenization_utils_fast import PreTrainedTokenizerFast -from ...utils import logging -from .tokenization_bart import BartTokenizer - - -logger = logging.get_logger(__name__) - - -VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} - -# See all BART models at https://huggingface.co/models?filter=bart - - -class BartTokenizerFast(PreTrainedTokenizerFast): - r""" - Construct a "fast" BART tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2 tokenizer, - using byte-level Byte-Pair-Encoding. - - This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will - be encoded differently whether it is at the beginning of the sentence (without space) or not: - - ```python - >>> from transformers import BartTokenizerFast - - >>> tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base") - >>> tokenizer("Hello world")["input_ids"] - [0, 31414, 232, 2] - - >>> tokenizer(" Hello world")["input_ids"] - [0, 20920, 232, 2] - ``` - - You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you - call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. - - - - When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. - - - - This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should - refer to this superclass for more information regarding those methods. - - Args: - vocab_file (`str`): - Path to the vocabulary file. - merges_file (`str`): - Path to the merges file. - errors (`str`, *optional*, defaults to `"replace"`): - Paradigm to follow when decoding bytes to UTF-8. See - [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. - bos_token (`str`, *optional*, defaults to `""`): - The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. - - - - When building a sequence using special tokens, this is not the token that is used for the beginning of - sequence. The token used is the `cls_token`. - - - - eos_token (`str`, *optional*, defaults to `""`): - The end of sequence token. - - - - When building a sequence using special tokens, this is not the token that is used for the end of sequence. - The token used is the `sep_token`. - - - - sep_token (`str`, *optional*, defaults to `""`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for - sequence classification or for a text and a question for question answering. It is also used as the last - token of a sequence built with special tokens. - cls_token (`str`, *optional*, defaults to `""`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. - unk_token (`str`, *optional*, defaults to `""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - pad_token (`str`, *optional*, defaults to `""`): - The token used for padding, for example when batching sequences of different lengths. - mask_token (`str`, *optional*, defaults to `""`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - add_prefix_space (`bool`, *optional*, defaults to `False`): - Whether or not to add an initial space to the input. This allows to treat the leading word just as any - other word. (BART tokenizer detect beginning of words by the preceding space). - trim_offsets (`bool`, *optional*, defaults to `True`): - Whether the post processing step should trim offsets to avoid including whitespaces. - """ - - vocab_files_names = VOCAB_FILES_NAMES - model_input_names = ["input_ids", "attention_mask"] - slow_tokenizer_class = BartTokenizer - - def __init__( - self, - vocab_file=None, - merges_file=None, - tokenizer_file=None, - errors="replace", - bos_token="", - eos_token="", - sep_token="", - cls_token="", - unk_token="", - pad_token="", - mask_token="", - add_prefix_space=False, - trim_offsets=True, - **kwargs, - ): - # we have to specify that this tokens is special otherwise adding it will reset the normalized flag to `False` in `add_special_tokens` - mask_token = ( - AddedToken(mask_token, lstrip=True, normalized=True, special=True) - if isinstance(mask_token, str) - else mask_token - ) - super().__init__( - vocab_file, - merges_file, - tokenizer_file=tokenizer_file, - errors=errors, - bos_token=bos_token, - eos_token=eos_token, - sep_token=sep_token, - cls_token=cls_token, - unk_token=unk_token, - pad_token=pad_token, - mask_token=mask_token, - add_prefix_space=add_prefix_space, - trim_offsets=trim_offsets, - **kwargs, - ) - - # the pre_tokenizer is already updated in the GPT2TokenizerFast `__init__` - tokenizer_component = "post_processor" - tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None) - if tokenizer_component_instance: - state = json.loads(tokenizer_component_instance.__getstate__()) - - # The lists 'sep' and 'cls' must be cased in tuples for the object `post_processor_class` - if "sep" in state: - state["sep"] = tuple(state["sep"]) - if "cls" in state: - state["cls"] = tuple(state["cls"]) - - changes_to_apply = False - - if state.get("add_prefix_space", add_prefix_space) != add_prefix_space: - state["add_prefix_space"] = add_prefix_space - changes_to_apply = True - - if state.get("trim_offsets", trim_offsets) != trim_offsets: - state["trim_offsets"] = trim_offsets - changes_to_apply = True - - if changes_to_apply: - component_class = getattr(processors, state.pop("type")) - new_value = component_class(**state) - setattr(self.backend_tokenizer, tokenizer_component, new_value) - - @property - def mask_token(self) -> str: - """ - `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not - having been set. - - BART tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily - comprise the space before the **. - """ - if self._mask_token is None: - if self.verbose: - logger.error("Using mask_token, but it is not set yet.") - return None - return str(self._mask_token) - - @mask_token.setter - def mask_token(self, value): - """ - Overriding the default behavior of the mask token to have it eat the space before it. - - This is needed to preserve backward compatibility with all the previously used models based on Bart. - """ - # Mask token behave like a normal word, i.e. include the space before it - # So we set lstrip to True - value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value - self._mask_token = value - - def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: - is_split_into_words = kwargs.get("is_split_into_words", False) - - if is_split_into_words and not self.add_prefix_space: - raise ValueError( - f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " - "to use it with pretokenized inputs." - ) - - return super()._batch_encode_plus(*args, **kwargs) - - def _encode_plus(self, *args, **kwargs) -> BatchEncoding: - is_split_into_words = kwargs.get("is_split_into_words", False) - - if is_split_into_words and not self.add_prefix_space: - raise ValueError( - f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " - "to use it with pretokenized inputs." - ) - - return super()._encode_plus(*args, **kwargs) - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: - files = self._tokenizer.model.save(save_directory, name=filename_prefix) - return tuple(files) - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id] - if token_ids_1 is None: - return output - - return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id] - - def create_token_type_ids_from_sequences( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. BART does not - make use of token type ids, therefore a list of zeros is returned. - - Args: - token_ids_0 (`list[int]`): - List of IDs. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `list[int]`: List of zeros. - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - - -__all__ = ["BartTokenizerFast"] diff --git a/src/transformers/models/barthez/__init__.py b/src/transformers/models/barthez/__init__.py index 323fe2fe8af9..c9e11571fc6d 100644 --- a/src/transformers/models/barthez/__init__.py +++ b/src/transformers/models/barthez/__init__.py @@ -19,7 +19,6 @@ if TYPE_CHECKING: from .tokenization_barthez import * - from .tokenization_barthez_fast import * else: import sys diff --git a/src/transformers/models/barthez/tokenization_barthez.py b/src/transformers/models/barthez/tokenization_barthez.py index bc583e0cd5dc..77d0b65d15b8 100644 --- a/src/transformers/models/barthez/tokenization_barthez.py +++ b/src/transformers/models/barthez/tokenization_barthez.py @@ -14,40 +14,31 @@ # limitations under the License """Tokenization classes for the BARThez model.""" -import os -from shutil import copyfile -from typing import Any, Optional +from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers +from tokenizers.models import Unigram -import sentencepiece as spm - -from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...tokenization_python import AddedToken +from ...tokenization_utils_tokenizers import TokenizersBackend from ...utils import logging -from ...utils.import_utils import requires logger = logging.get_logger(__name__) -VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"} SPIECE_UNDERLINE = "▁" -# TODO this class is useless. This is the most standard sentencpiece model. Let's find which one is closest and nuke this. - -@requires(backends=("sentencepiece",)) -class BarthezTokenizer(PreTrainedTokenizer): +class BarthezTokenizer(TokenizersBackend): """ - Adapted from [`CamembertTokenizer`] and [`BartTokenizer`]. Construct a BARThez tokenizer. Based on + Adapted from [`CamembertTokenizer`] and [`BartTokenizer`]. Construct a "fast" BARThez tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece). - This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to - this superclass for more information regarding those methods. + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. Args: - vocab_file (`str`): - [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that - contains the vocabulary necessary to instantiate a tokenizer. bos_token (`str`, *optional*, defaults to `""`): The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. @@ -83,33 +74,22 @@ class BarthezTokenizer(PreTrainedTokenizer): mask_token (`str`, *optional*, defaults to `""`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. - sp_model_kwargs (`dict`, *optional*): - Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for - SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, - to set: - - - `enable_sampling`: Enable subword regularization. - - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. - - - `nbest_size = {0,1}`: No sampling is performed. - - `nbest_size > 1`: samples from the nbest_size results. - - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) - using forward-filtering-and-backward-sampling algorithm. - - - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for - BPE-dropout. - - Attributes: - sp_model (`SentencePieceProcessor`): - The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). + vocab_file (`str`, *optional*): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + vocab (`dict`, *optional*): + Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file. + add_prefix_space (`bool`, *optional*, defaults to `True`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. """ vocab_files_names = VOCAB_FILES_NAMES model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = None def __init__( self, - vocab_file, bos_token="", eos_token="", sep_token="", @@ -117,18 +97,46 @@ def __init__( unk_token="", pad_token="", mask_token="", - sp_model_kwargs: Optional[dict[str, Any]] = None, + vocab_file=None, + vocab=None, + add_prefix_space=True, **kwargs, - ) -> None: - # Mask token behave like a normal word, i.e. include the space before it. Will have normalized=False by default this way - mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token + ): + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self.add_prefix_space = add_prefix_space + self.vocab_file = vocab_file - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + if vocab is not None: + self._vocab = vocab + else: + self._vocab = [ + (str(pad_token), 0.0), + (str(unk_token), 0.0), + (str(cls_token), 0.0), + (str(sep_token), 0.0), + (str(mask_token), 0.0), + ] + + self._tokenizer = Tokenizer(Unigram(self._vocab, unk_id=3, byte_fallback=False)) + + self._tokenizer.normalizer = normalizers.Sequence( + [ + normalizers.Replace("\n", " "), + normalizers.Replace("\r", " "), + normalizers.Replace("\t", " "), + normalizers.Replace(Regex(r" {2,}"), " "), + normalizers.NFC(), + normalizers.Strip(left=False, right=True), + ] + ) + prepend_scheme = "always" if add_prefix_space else "never" + self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=prepend_scheme) + self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme=prepend_scheme) - self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(str(vocab_file)) + tokenizer_object = self._tokenizer super().__init__( + tokenizer_object=tokenizer_object, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, @@ -136,156 +144,9 @@ def __init__( cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, - sp_model_kwargs=self.sp_model_kwargs, + add_prefix_space=add_prefix_space, **kwargs, ) - def build_inputs_with_special_tokens( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A BARThez sequence has the following format: - - - single sequence: ` X ` - - pair of sequences: ` A B ` - - Args: - token_ids_0 (`list[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + sep + token_ids_1 + sep - - def get_special_tokens_mask( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False - ) -> list[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`list[int]`): - List of IDs. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] - - def create_token_type_ids_from_sequences( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. - - Args: - token_ids_0 (`list[int]`): - List of IDs. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `list[int]`: List of zeros. - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - - @property - def vocab_size(self): - return len(self.sp_model) - - def get_vocab(self): - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) - return vocab - - def _tokenize(self, text: str) -> list[str]: - return self.sp_model.encode(text, out_type=str) - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - return self.sp_model.PieceToId(token) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - return self.sp_model.IdToPiece(index) - - # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (string) in a single string.""" - current_sub_tokens = [] - out_string = "" - prev_is_special = False - for token in tokens: - # make sure that special tokens are not decoded using sentencepiece model - if token in self.all_special_tokens: - if not prev_is_special: - out_string += " " - out_string += self.sp_model.decode(current_sub_tokens) + token - prev_is_special = True - current_sub_tokens = [] - else: - current_sub_tokens.append(token) - prev_is_special = False - out_string += self.sp_model.decode(current_sub_tokens) - return out_string.strip() - - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - return state - - def __setstate__(self, d): - self.__dict__ = d - - # for backward compatibility - if not hasattr(self, "sp_model_kwargs"): - self.sp_model_kwargs = {} - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(self.vocab_file) - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: - if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): - copyfile(self.vocab_file, out_vocab_file) - elif not os.path.isfile(self.vocab_file): - with open(out_vocab_file, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto() - fi.write(content_spiece_model) - - return (out_vocab_file,) - __all__ = ["BarthezTokenizer"] diff --git a/src/transformers/models/barthez/tokenization_barthez_fast.py b/src/transformers/models/barthez/tokenization_barthez_fast.py deleted file mode 100644 index 64050ca8848f..000000000000 --- a/src/transformers/models/barthez/tokenization_barthez_fast.py +++ /dev/null @@ -1,193 +0,0 @@ -# coding=utf-8 -# Copyright 2020 Ecole Polytechnique and the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License -"""Tokenization classes for the BARThez model.""" - -import os -from shutil import copyfile -from typing import Optional - -from ...tokenization_utils import AddedToken -from ...tokenization_utils_fast import PreTrainedTokenizerFast -from ...utils import is_sentencepiece_available, logging - - -if is_sentencepiece_available(): - from .tokenization_barthez import BarthezTokenizer -else: - BarthezTokenizer = None - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"} - - -SPIECE_UNDERLINE = "▁" - - -class BarthezTokenizerFast(PreTrainedTokenizerFast): - """ - Adapted from [`CamembertTokenizer`] and [`BartTokenizer`]. Construct a "fast" BARThez tokenizer. Based on - [SentencePiece](https://github.com/google/sentencepiece). - - This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should - refer to this superclass for more information regarding those methods. - - Args: - vocab_file (`str`): - [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that - contains the vocabulary necessary to instantiate a tokenizer. - bos_token (`str`, *optional*, defaults to `""`): - The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. - - - - When building a sequence using special tokens, this is not the token that is used for the beginning of - sequence. The token used is the `cls_token`. - - - - eos_token (`str`, *optional*, defaults to `""`): - The end of sequence token. - - - - When building a sequence using special tokens, this is not the token that is used for the end of sequence. - The token used is the `sep_token`. - - - - sep_token (`str`, *optional*, defaults to `""`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for - sequence classification or for a text and a question for question answering. It is also used as the last - token of a sequence built with special tokens. - cls_token (`str`, *optional*, defaults to `""`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. - unk_token (`str`, *optional*, defaults to `""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - pad_token (`str`, *optional*, defaults to `""`): - The token used for padding, for example when batching sequences of different lengths. - mask_token (`str`, *optional*, defaults to `""`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - additional_special_tokens (`list[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): - Additional special tokens used by the tokenizer. - """ - - vocab_files_names = VOCAB_FILES_NAMES - model_input_names = ["input_ids", "attention_mask"] - slow_tokenizer_class = BarthezTokenizer - - def __init__( - self, - vocab_file=None, - tokenizer_file=None, - bos_token="", - eos_token="", - sep_token="", - cls_token="", - unk_token="", - pad_token="", - mask_token="", - **kwargs, - ): - # Mask token behave like a normal word, i.e. include the space before it - mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - - super().__init__( - vocab_file, - tokenizer_file=tokenizer_file, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - cls_token=cls_token, - pad_token=pad_token, - mask_token=mask_token, - **kwargs, - ) - - self.vocab_file = vocab_file - - def build_inputs_with_special_tokens( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A BARThez sequence has the following format: - - - single sequence: ` X ` - - pair of sequences: ` A B ` - - Args: - token_ids_0 (`list[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + sep + token_ids_1 + sep - - def create_token_type_ids_from_sequences( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. - - Args: - token_ids_0 (`list[int]`): - List of IDs. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `list[int]`: List of zeros. - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: - if not self.can_save_slow_tokenizer: - raise ValueError( - "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " - "tokenizer." - ) - - if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): - copyfile(self.vocab_file, out_vocab_file) - - return (out_vocab_file,) - - -__all__ = ["BarthezTokenizerFast"] diff --git a/src/transformers/models/bartpho/tokenization_bartpho.py b/src/transformers/models/bartpho/tokenization_bartpho.py index 9b16357a83ec..a557c1352aa5 100644 --- a/src/transformers/models/bartpho/tokenization_bartpho.py +++ b/src/transformers/models/bartpho/tokenization_bartpho.py @@ -18,22 +18,19 @@ from shutil import copyfile from typing import Any, Optional -import sentencepiece as spm - -from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...tokenization_python import AddedToken +from ...tokenization_utils_sentencepiece import SentencePieceBackend from ...utils import logging from ...utils.import_utils import requires logger = logging.get_logger(__name__) -SPIECE_UNDERLINE = "▁" - VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "monolingual_vocab_file": "dict.txt"} @requires(backends=("sentencepiece",)) -class BartphoTokenizer(PreTrainedTokenizer): +class BartphoTokenizer(SentencePieceBackend): """ Adapted from [`XLMRobertaTokenizer`]. Based on [SentencePiece](https://github.com/google/sentencepiece). @@ -105,6 +102,7 @@ class BartphoTokenizer(PreTrainedTokenizer): vocab_files_names = VOCAB_FILES_NAMES model_input_names = ["input_ids", "attention_mask"] + is_fast = False def __init__( self, @@ -123,15 +121,9 @@ def __init__( # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - - self.vocab_file = vocab_file self.monolingual_vocab_file = monolingual_vocab_file - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(str(vocab_file)) # Load the reduced vocab - # Keep order of special tokens for backward compatibility self.fairseq_tokens_to_ids = {} cnt = 0 @@ -148,7 +140,13 @@ def __init__( self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} + # Prepare sp_model_kwargs for parent class + if sp_model_kwargs is not None: + kwargs["sp_model_kwargs"] = sp_model_kwargs + + # Call parent init (which will load sp_model) super().__init__( + vocab_file=vocab_file, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, @@ -156,25 +154,9 @@ def __init__( cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, - sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) - - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - state["sp_model_proto"] = self.sp_model.serialized_model_proto() - return state - - def __setstate__(self, d): - self.__dict__ = d - - # for backward compatibility - if not hasattr(self, "sp_model_kwargs"): - self.sp_model_kwargs = {} - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.LoadFromSerializedProto(self.sp_model_proto) + self._align_added_tokens_with_fairseq_vocab() def build_inputs_with_special_tokens( self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None @@ -257,31 +239,55 @@ def create_token_type_ids_from_sequences( @property def vocab_size(self): + """Override to return fairseq vocab size instead of sp_model vocab size""" return len(self.fairseq_ids_to_tokens) def get_vocab(self): - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) + """Override to use fairseq vocabulary""" + vocab = dict(self.fairseq_tokens_to_ids) + if hasattr(self, "_added_tokens_encoder"): + for token, idx in self._added_tokens_encoder.items(): + if token not in vocab: + vocab[token] = idx return vocab - def _tokenize(self, text: str) -> list[str]: - return self.sp_model.encode(text, out_type=str) - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" + """Converts a token (str) in an id using the fairseq vocab.""" if token in self.fairseq_tokens_to_ids: return self.fairseq_tokens_to_ids[token] else: return self.unk_token_id + def _convert_token_to_id_with_added_voc(self, token): + """Override to use fairseq vocab instead of sp_model vocab.""" + if token is None: + return None + + if token in self._added_tokens_encoder: + return self._added_tokens_encoder[token] + return self._convert_token_to_id(token) + def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" + """Converts an index (integer) in a token (str) using the fairseq vocab.""" return self.fairseq_ids_to_tokens[index] - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (strings for sub-words) in a single string.""" - out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() - return out_string + def _align_added_tokens_with_fairseq_vocab(self): + """ + The slow tokenizer base class populates `_added_tokens_*` using SentencePiece ids. Remap those entries so that + every token present in the reduced fairseq dictionary uses the same ids everywhere, otherwise conversions and + special-token setters observe two different vocabularies. + """ + if not hasattr(self, "_added_tokens_decoder") or not hasattr(self, "_added_tokens_encoder"): + return + + remapped_decoder: dict[int, AddedToken] = {} + for original_id, token_obj in self._added_tokens_decoder.items(): + token = token_obj.content + new_id = self.fairseq_tokens_to_ids.get(token, original_id) + remapped_decoder[new_id] = token_obj + + self._added_tokens_decoder = remapped_decoder + self._added_tokens_encoder = {token.content: idx for idx, token in remapped_decoder.items()} def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: if not os.path.isdir(save_directory): diff --git a/src/transformers/models/bert/__init__.py b/src/transformers/models/bert/__init__.py index 04aeefe38a81..54a46312ad91 100644 --- a/src/transformers/models/bert/__init__.py +++ b/src/transformers/models/bert/__init__.py @@ -21,7 +21,6 @@ from .configuration_bert import * from .modeling_bert import * from .tokenization_bert import * - from .tokenization_bert_fast import * else: import sys diff --git a/src/transformers/models/bert/tokenization_bert.py b/src/transformers/models/bert/tokenization_bert.py index 23cda58bfe72..b1edde12d5c2 100644 --- a/src/transformers/models/bert/tokenization_bert.py +++ b/src/transformers/models/bert/tokenization_bert.py @@ -15,17 +15,18 @@ """Tokenization classes for Bert.""" import collections -import os -import unicodedata from typing import Optional -from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors +from tokenizers.models import WordPiece + +from ...tokenization_utils_tokenizers import TokenizersBackend from ...utils import logging logger = logging.get_logger(__name__) -VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} def load_vocab(vocab_file): @@ -39,32 +40,18 @@ def load_vocab(vocab_file): return vocab -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a piece of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class BertTokenizer(PreTrainedTokenizer): +class BertTokenizer(TokenizersBackend): r""" - Construct a BERT tokenizer. Based on WordPiece. + Construct a BERT tokenizer (backed by HuggingFace's tokenizers library). Based on WordPiece. - This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. Args: - vocab_file (`str`): + vocab_file (`str`, *optional*): File containing the vocabulary. - do_lower_case (`bool`, *optional*, defaults to `True`): + do_lower_case (`bool`, *optional*, defaults to `False`): Whether or not to lowercase the input when tokenizing. - do_basic_tokenize (`bool`, *optional*, defaults to `True`): - Whether or not to do basic tokenization before WordPiece. - never_split (`Iterable`, *optional*): - Collection of tokens which will never be split during tokenization. Only has an effect when - `do_basic_tokenize=True` unk_token (`str`, *optional*, defaults to `"[UNK]"`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. @@ -82,57 +69,64 @@ class BertTokenizer(PreTrainedTokenizer): modeling. This is the token which the model will try to predict. tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): Whether or not to tokenize Chinese characters. - - This should likely be deactivated for Japanese (see this - [issue](https://github.com/huggingface/transformers/issues/328)). strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the value for `lowercase` (as in the original BERT). - clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): - Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like - extra spaces. + vocab (`dict`, *optional*): + Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file. """ vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "token_type_ids", "attention_mask"] + slow_tokenizer_class = None def __init__( self, - vocab_file, - do_lower_case=True, - do_basic_tokenize=True, - never_split=None, - unk_token="[UNK]", - sep_token="[SEP]", - pad_token="[PAD]", - cls_token="[CLS]", - mask_token="[MASK]", - tokenize_chinese_chars=True, - strip_accents=None, - clean_up_tokenization_spaces=True, + vocab_file: Optional[str] = None, + do_lower_case: bool = False, + unk_token: str = "[UNK]", + sep_token: str = "[SEP]", + pad_token: str = "[PAD]", + cls_token: str = "[CLS]", + mask_token: str = "[MASK]", + tokenize_chinese_chars: bool = True, + strip_accents: Optional[bool] = None, + vocab: Optional[dict] = None, **kwargs, ): - if not os.path.isfile(vocab_file): - raise ValueError( - f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" - " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" - ) - self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) - self.do_basic_tokenize = do_basic_tokenize - if do_basic_tokenize: - self.basic_tokenizer = BasicTokenizer( - do_lower_case=do_lower_case, - never_split=never_split, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, + self.do_lower_case = do_lower_case + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + + if vocab is not None: + self._vocab = ( + {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab ) + else: + self._vocab = { + str(pad_token): 0, + str(unk_token): 1, + str(cls_token): 2, + str(sep_token): 3, + str(mask_token): 4, + } + + self._tokenizer = Tokenizer(WordPiece(self._vocab, unk_token=str(unk_token))) + + self._tokenizer.normalizer = normalizers.BertNormalizer( + clean_text=True, + handle_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + lowercase=do_lower_case, + ) + self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer() + self._tokenizer.decoder = decoders.WordPiece(prefix="##") - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + tokenizer_object = self._tokenizer super().__init__( + tokenizer_object=tokenizer_object, do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, unk_token=unk_token, sep_token=sep_token, pad_token=pad_token, @@ -140,339 +134,22 @@ def __init__( mask_token=mask_token, tokenize_chinese_chars=tokenize_chinese_chars, strip_accents=strip_accents, - clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs, ) - @property - def do_lower_case(self): - return self.basic_tokenizer.do_lower_case - - @property - def vocab_size(self): - return len(self.vocab) - - def get_vocab(self): - return dict(self.vocab, **self.added_tokens_encoder) - - def _tokenize(self, text, split_special_tokens=False): - split_tokens = [] - if self.do_basic_tokenize: - for token in self.basic_tokenizer.tokenize( - text, never_split=self.all_special_tokens if not split_special_tokens else None - ): - # If the token is part of the never_split set - if token in self.basic_tokenizer.never_split: - split_tokens.append(token) - else: - split_tokens += self.wordpiece_tokenizer.tokenize(token) - else: - split_tokens = self.wordpiece_tokenizer.tokenize(text) - return split_tokens - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - return self.vocab.get(token, self.vocab.get(self.unk_token)) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - return self.ids_to_tokens.get(index, self.unk_token) - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (string) in a single string.""" - out_string = " ".join(tokens).replace(" ##", "").strip() - return out_string - - def build_inputs_with_special_tokens( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A BERT sequence has the following format: - - - single sequence: `[CLS] X [SEP]` - - pair of sequences: `[CLS] A [SEP] B [SEP]` - - Args: - token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + token_ids_1 + sep - - def get_special_tokens_mask( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False - ) -> list[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - if token_ids_1 is not None: - return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: - index = 0 - if os.path.isdir(save_directory): - vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - else: - vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory - with open(vocab_file, "w", encoding="utf-8") as writer: - for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): - if index != token_index: - logger.warning( - f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." - " Please check that the vocabulary is not corrupted!" - ) - index = token_index - writer.write(token + "\n") - index += 1 - return (vocab_file,) - - -class BasicTokenizer: - """ - Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). - - Args: - do_lower_case (`bool`, *optional*, defaults to `True`): - Whether or not to lowercase the input when tokenizing. - never_split (`Iterable`, *optional*): - Collection of tokens which will never be split during tokenization. Only has an effect when - `do_basic_tokenize=True` - tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): - Whether or not to tokenize Chinese characters. - - This should likely be deactivated for Japanese (see this - [issue](https://github.com/huggingface/transformers/issues/328)). - strip_accents (`bool`, *optional*): - Whether or not to strip all accents. If this option is not specified, then it will be determined by the - value for `lowercase` (as in the original BERT). - do_split_on_punc (`bool`, *optional*, defaults to `True`): - In some instances we want to skip the basic punctuation splitting so that later tokenization can capture - the full context of the words, such as contractions. - """ - - def __init__( - self, - do_lower_case=True, - never_split=None, - tokenize_chinese_chars=True, - strip_accents=None, - do_split_on_punc=True, - ): - if never_split is None: - never_split = [] - self.do_lower_case = do_lower_case - self.never_split = set(never_split) - self.tokenize_chinese_chars = tokenize_chinese_chars - self.strip_accents = strip_accents - self.do_split_on_punc = do_split_on_punc - - def tokenize(self, text, never_split=None): - """ - Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. - - Args: - never_split (`List[str]`, *optional*) - Kept for backward compatibility purposes. Now implemented directly at the base class level (see - [`PreTrainedTokenizer.tokenize`]) List of token not to split. - """ - # union() returns a new set by concatenating the two sets. - never_split = self.never_split.union(set(never_split)) if never_split else self.never_split - text = self._clean_text(text) + cls_token_id = self.cls_token_id if self.cls_token_id is not None else 2 + sep_token_id = self.sep_token_id if self.sep_token_id is not None else 3 - # This was added on November 1st, 2018 for the multilingual and Chinese - # models. This is also applied to the English models now, but it doesn't - # matter since the English models were not trained on any Chinese data - # and generally don't have any Chinese data in them (there are Chinese - # characters in the vocabulary because Wikipedia does have some Chinese - # words in the English Wikipedia.). - if self.tokenize_chinese_chars: - text = self._tokenize_chinese_chars(text) - # prevents treating the same character with different unicode codepoints as different characters - unicode_normalized_text = unicodedata.normalize("NFC", text) - orig_tokens = whitespace_tokenize(unicode_normalized_text) - split_tokens = [] - for token in orig_tokens: - if token not in never_split: - if self.do_lower_case: - token = token.lower() - if self.strip_accents is not False: - token = self._run_strip_accents(token) - elif self.strip_accents: - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token, never_split)) - - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, text, never_split=None): - """Splits punctuation on a piece of text.""" - if not self.do_split_on_punc or (never_split is not None and text in never_split): - return [text] - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return ["".join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(" ") - output.append(char) - output.append(" ") - else: - output.append(char) - return "".join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like the all of the other languages. - if ( - (cp >= 0x4E00 and cp <= 0x9FFF) - or (cp >= 0x3400 and cp <= 0x4DBF) - or (cp >= 0x20000 and cp <= 0x2A6DF) - or (cp >= 0x2A700 and cp <= 0x2B73F) - or (cp >= 0x2B740 and cp <= 0x2B81F) - or (cp >= 0x2B820 and cp <= 0x2CEAF) - or (cp >= 0xF900 and cp <= 0xFAFF) - or (cp >= 0x2F800 and cp <= 0x2FA1F) - ): - return True - - return False - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xFFFD or _is_control(char): - continue - if _is_whitespace(char): - output.append(" ") - else: - output.append(char) - return "".join(output) - - -class WordpieceTokenizer: - """Runs WordPiece tokenization.""" - - def __init__(self, vocab, unk_token, max_input_chars_per_word=100): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """ - Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform - tokenization using the given vocabulary. - - For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`. - - Args: - text: A single token or whitespace separated tokens. This should have - already been passed through *BasicTokenizer*. - - Returns: - A list of wordpiece tokens. - """ - - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - output_tokens.append(self.unk_token) - continue - - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - while start < end: - substr = "".join(chars[start:end]) - if start > 0: - substr = "##" + substr - if substr in self.vocab: - cur_substr = substr - break - end -= 1 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - start = end + self._tokenizer.post_processor = processors.TemplateProcessing( + single=f"{str(self.cls_token)}:0 $A:0 {str(self.sep_token)}:0", + pair=f"{str(self.cls_token)}:0 $A:0 {str(self.sep_token)}:0 $B:1 {str(self.sep_token)}:1", + special_tokens=[ + (str(self.cls_token), cls_token_id), + (str(self.sep_token), sep_token_id), + ], + ) - if is_bad: - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens +__all__ = ["BertTokenizer"] -__all__ = ["BasicTokenizer", "BertTokenizer", "WordpieceTokenizer"] +BertTokenizerFast = BertTokenizer diff --git a/src/transformers/models/bert/tokenization_bert_fast.py b/src/transformers/models/bert/tokenization_bert_fast.py deleted file mode 100644 index 2cdc6129881b..000000000000 --- a/src/transformers/models/bert/tokenization_bert_fast.py +++ /dev/null @@ -1,146 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Fast Tokenization classes for Bert.""" - -import json -from typing import Optional - -from tokenizers import normalizers - -from ...tokenization_utils_fast import PreTrainedTokenizerFast -from ...utils import logging -from .tokenization_bert import BertTokenizer - - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} - - -class BertTokenizerFast(PreTrainedTokenizerFast): - r""" - Construct a "fast" BERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece. - - This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should - refer to this superclass for more information regarding those methods. - - Args: - vocab_file (`str`): - File containing the vocabulary. - do_lower_case (`bool`, *optional*, defaults to `True`): - Whether or not to lowercase the input when tokenizing. - unk_token (`str`, *optional*, defaults to `"[UNK]"`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - sep_token (`str`, *optional*, defaults to `"[SEP]"`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for - sequence classification or for a text and a question for question answering. It is also used as the last - token of a sequence built with special tokens. - pad_token (`str`, *optional*, defaults to `"[PAD]"`): - The token used for padding, for example when batching sequences of different lengths. - cls_token (`str`, *optional*, defaults to `"[CLS]"`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. - mask_token (`str`, *optional*, defaults to `"[MASK]"`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - clean_text (`bool`, *optional*, defaults to `True`): - Whether or not to clean the text before tokenization by removing any control characters and replacing all - whitespaces by the classic one. - tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): - Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this - issue](https://github.com/huggingface/transformers/issues/328)). - strip_accents (`bool`, *optional*): - Whether or not to strip all accents. If this option is not specified, then it will be determined by the - value for `lowercase` (as in the original BERT). - wordpieces_prefix (`str`, *optional*, defaults to `"##"`): - The prefix for subwords. - """ - - vocab_files_names = VOCAB_FILES_NAMES - slow_tokenizer_class = BertTokenizer - - def __init__( - self, - vocab_file=None, - tokenizer_file=None, - do_lower_case=True, - unk_token="[UNK]", - sep_token="[SEP]", - pad_token="[PAD]", - cls_token="[CLS]", - mask_token="[MASK]", - tokenize_chinese_chars=True, - strip_accents=None, - **kwargs, - ): - super().__init__( - vocab_file, - tokenizer_file=tokenizer_file, - do_lower_case=do_lower_case, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) - - normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__()) - if ( - normalizer_state.get("lowercase", do_lower_case) != do_lower_case - or normalizer_state.get("strip_accents", strip_accents) != strip_accents - or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars - ): - normalizer_class = getattr(normalizers, normalizer_state.pop("type")) - normalizer_state["lowercase"] = do_lower_case - normalizer_state["strip_accents"] = strip_accents - normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars - self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state) - - self.do_lower_case = do_lower_case - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A BERT sequence has the following format: - - - single sequence: `[CLS] X [SEP]` - - pair of sequences: `[CLS] A [SEP] B [SEP]` - - Args: - token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - - if token_ids_1 is not None: - output += token_ids_1 + [self.sep_token_id] - - return output - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: - files = self._tokenizer.model.save(save_directory, name=filename_prefix) - return tuple(files) - - -__all__ = ["BertTokenizerFast"] diff --git a/src/transformers/models/electra/tokenization_electra.py b/src/transformers/models/bert/tokenization_bert_legacy.py similarity index 95% rename from src/transformers/models/electra/tokenization_electra.py rename to src/transformers/models/bert/tokenization_bert_legacy.py index d8971dd6f403..ebb733fd2da1 100644 --- a/src/transformers/models/electra/tokenization_electra.py +++ b/src/transformers/models/bert/tokenization_bert_legacy.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,13 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Tokenization classes for Bert.""" import collections import os import unicodedata from typing import Optional -from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from ...tokenization_python import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace from ...utils import logging @@ -27,7 +28,6 @@ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} -# Copied from transformers.models.bert.tokenization_bert.load_vocab def load_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = collections.OrderedDict() @@ -39,7 +39,6 @@ def load_vocab(vocab_file): return vocab -# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize def whitespace_tokenize(text): """Runs basic whitespace cleaning and splitting on a piece of text.""" text = text.strip() @@ -49,10 +48,9 @@ def whitespace_tokenize(text): return tokens -# Copied from transformers.models.bert.tokenization_bert.BertTokenizer with Bert->Electra,BERT->Electra -class ElectraTokenizer(PreTrainedTokenizer): +class BertTokenizerLegacy(PreTrainedTokenizer): r""" - Construct a Electra tokenizer. Based on WordPiece. + Construct a BERT tokenizer. Based on WordPiece. This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. @@ -89,7 +87,7 @@ class ElectraTokenizer(PreTrainedTokenizer): [issue](https://github.com/huggingface/transformers/issues/328)). strip_accents (`bool`, *optional*): Whether or not to strip all accents. If this option is not specified, then it will be determined by the - value for `lowercase` (as in the original Electra). + value for `lowercase` (as in the original BERT). clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra spaces. @@ -116,7 +114,7 @@ def __init__( if not os.path.isfile(vocab_file): raise ValueError( f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" - " model use `tokenizer = ElectraTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" ) self.vocab = load_vocab(vocab_file) self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) @@ -190,7 +188,7 @@ def build_inputs_with_special_tokens( ) -> list[int]: """ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A Electra sequence has the following format: + adding special tokens. A BERT sequence has the following format: - single sequence: `[CLS] X [SEP]` - pair of sequences: `[CLS] A [SEP] B [SEP]` @@ -259,7 +257,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = return (vocab_file,) -# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer class BasicTokenizer: """ Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). @@ -421,7 +418,6 @@ def _clean_text(self, text): return "".join(output) -# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer class WordpieceTokenizer: """Runs WordPiece tokenization.""" @@ -479,4 +475,4 @@ def tokenize(self, text): return output_tokens -__all__ = ["ElectraTokenizer"] +__all__ = ["BasicTokenizer", "BertTokenizerLegacy", "WordpieceTokenizer"] diff --git a/src/transformers/models/bert_generation/tokenization_bert_generation.py b/src/transformers/models/bert_generation/tokenization_bert_generation.py index baeba9e4b388..eb04da9761ee 100644 --- a/src/transformers/models/bert_generation/tokenization_bert_generation.py +++ b/src/transformers/models/bert_generation/tokenization_bert_generation.py @@ -14,13 +14,9 @@ # limitations under the License. """Tokenization class for model BertGeneration.""" -import os -from shutil import copyfile from typing import Any, Optional -import sentencepiece as spm - -from ...tokenization_utils import PreTrainedTokenizer +from ...tokenization_utils_sentencepiece import SentencePieceBackend from ...utils import logging from ...utils.import_utils import requires @@ -31,7 +27,7 @@ @requires(backends=("sentencepiece",)) -class BertGenerationTokenizer(PreTrainedTokenizer): +class BertGenerationTokenizer(SentencePieceBackend): """ Construct a BertGeneration tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece). @@ -75,6 +71,7 @@ class BertGenerationTokenizer(PreTrainedTokenizer): vocab_files_names = VOCAB_FILES_NAMES prefix_tokens: list[int] = [] model_input_names = ["input_ids", "attention_mask"] + is_fast = False def __init__( self, @@ -89,89 +86,18 @@ def __init__( ) -> None: self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - self.vocab_file = vocab_file - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(vocab_file) - - # Add extra_ids to the special token list + # Call parent init (which will load sp_model) super().__init__( + vocab_file=vocab_file, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, sep_token=sep_token, sp_model_kwargs=self.sp_model_kwargs, + special_tokens_pattern="none", **kwargs, ) - @property - def vocab_size(self): - return self.sp_model.get_piece_size() - - def get_vocab(self): - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) - return vocab - - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - return state - - def __setstate__(self, d): - self.__dict__ = d - - # for backward compatibility - if not hasattr(self, "sp_model_kwargs"): - self.sp_model_kwargs = {} - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(self.vocab_file) - - def _tokenize(self, text: str) -> list[str]: - """Take as input a string and return a list of strings (tokens) for words/sub-words""" - return self.sp_model.encode(text, out_type=str) - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - return self.sp_model.piece_to_id(token) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - token = self.sp_model.IdToPiece(index) - return token - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (string) in a single string.""" - current_sub_tokens = [] - out_string = "" - for token in tokens: - # make sure that special tokens are not decoded using sentencepiece model - if token in self.all_special_tokens: - out_string += self.sp_model.decode(current_sub_tokens) + token - current_sub_tokens = [] - else: - current_sub_tokens.append(token) - out_string += self.sp_model.decode(current_sub_tokens) - return out_string.strip() - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: - if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): - copyfile(self.vocab_file, out_vocab_file) - elif not os.path.isfile(self.vocab_file): - with open(out_vocab_file, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto() - fi.write(content_spiece_model) - - return (out_vocab_file,) - __all__ = ["BertGenerationTokenizer"] diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py index cacacd87574a..8c816afbbcc1 100644 --- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py +++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py @@ -20,7 +20,7 @@ import unicodedata from typing import Any, Optional -from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from ...tokenization_python import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace from ...utils import is_sentencepiece_available, is_sudachi_projection_available, logging @@ -36,7 +36,6 @@ SPIECE_UNDERLINE = "▁" -# Copied from transformers.models.bert.tokenization_bert.load_vocab def load_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = collections.OrderedDict() @@ -48,7 +47,6 @@ def load_vocab(vocab_file): return vocab -# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize def whitespace_tokenize(text): """Runs basic whitespace cleaning and splitting on a piece of text.""" text = text.strip() @@ -181,6 +179,9 @@ def __init__( mecab_kwargs=mecab_kwargs, sudachi_kwargs=sudachi_kwargs, jumanpp_kwargs=jumanpp_kwargs, + token_type_ids_pattern="bert_style", + token_type_ids_include_special_tokens=True, + special_tokens_pattern="cls_sep", **kwargs, ) @@ -233,7 +234,13 @@ def get_vocab(self): vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab - return dict(self.vocab, **self.added_tokens_encoder) + # base vocab + vocab = dict(self.vocab) + # + added_tokens_encoder (only for tokens not in base vocab) + for token, index in self.added_tokens_encoder.items(): + if token not in self.vocab: + vocab[token] = index + return vocab def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" @@ -254,61 +261,6 @@ def convert_tokens_to_string(self, tokens): out_string = " ".join(tokens).replace(" ##", "").strip() return out_string - # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens - def build_inputs_with_special_tokens( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A BERT sequence has the following format: - - - single sequence: `[CLS] X [SEP]` - - pair of sequences: `[CLS] A [SEP] B [SEP]` - - Args: - token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + token_ids_1 + sep - - # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask - def get_special_tokens_mask( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False - ) -> list[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - if token_ids_1 is not None: - return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: if os.path.isdir(save_directory): if self.subword_tokenizer_type == "sentencepiece": @@ -660,7 +612,6 @@ def tokenize(self, text): return output_tokens -# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer class BasicTokenizer: """ Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). @@ -822,7 +773,6 @@ def _clean_text(self, text): return "".join(output) -# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer class WordpieceTokenizer: """Runs WordPiece tokenization.""" diff --git a/src/transformers/models/bertweet/tokenization_bertweet.py b/src/transformers/models/bertweet/tokenization_bertweet.py index 3ce1a3182bf9..821b06c56b93 100644 --- a/src/transformers/models/bertweet/tokenization_bertweet.py +++ b/src/transformers/models/bertweet/tokenization_bertweet.py @@ -18,12 +18,11 @@ import html import os import re -from shutil import copyfile from typing import Optional import regex -from ...tokenization_utils import PreTrainedTokenizer +from ...tokenization_python import PreTrainedTokenizer from ...utils import logging @@ -161,87 +160,13 @@ def __init__( unk_token=unk_token, pad_token=pad_token, mask_token=mask_token, + # Configure patterns instead of overriding methods + token_type_ids_pattern="all_zeros", # BERTweet doesn't use token type IDs + token_type_ids_include_special_tokens=True, + special_tokens_pattern="cls_double_sep", # X Y **kwargs, ) - def build_inputs_with_special_tokens( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A BERTweet sequence has the following format: - - - single sequence: ` X ` - - pair of sequences: ` A B ` - - Args: - token_ids_0 (`list[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + sep + token_ids_1 + sep - - def get_special_tokens_mask( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False - ) -> list[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`list[int]`): - List of IDs. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] - - def create_token_type_ids_from_sequences( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. BERTweet does - not make use of token type ids, therefore a list of zeros is returned. - - Args: - token_ids_0 (`list[int]`): - List of IDs. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `list[int]`: List of zeros. - """ - - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - @property def vocab_size(self): return len(self.encoder) @@ -370,35 +295,42 @@ def convert_tokens_to_string(self, tokens): out_string = " ".join(tokens).replace("@@ ", "").strip() return out_string - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: - if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - out_merge_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): - copyfile(self.vocab_file, out_vocab_file) - elif not os.path.isfile(self.vocab_file): - with open(out_vocab_file, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto() - fi.write(content_spiece_model) - - if os.path.abspath(self.merges_file) != os.path.abspath(out_merge_file): - copyfile(self.merges_file, out_merge_file) - - return out_vocab_file, out_merge_file - # def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): # filtered_tokens = ' '.join(self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)) # tokens_generated_so_far = re.sub('(@@ )', '', string=filtered_tokens) # tokens_generated_so_far = re.sub('(@@ ?$)', '', string=tokens_generated_so_far) # return ''.join(tokens_generated_so_far) + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str, ...]: + """ + Save the vocabulary and merges files to a directory. + """ + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return () + + vocab_files_names = getattr(self, "vocab_files_names", {}) + prefix = f"{filename_prefix}-" if filename_prefix else "" + + # Save vocabulary in the format expected by add_from_file: + # Exclude special tokens (IDs 0-3) as they are added in __init__ before add_from_file + vocab_file = os.path.join(save_directory, prefix + vocab_files_names.get("vocab_file", "vocab.txt")) + with open(vocab_file, "w", encoding="utf-8") as f: + for token, token_id in sorted(self.encoder.items(), key=lambda kv: kv[1]): + # Only save tokens with ID >= 4, as IDs 0-3 are reserved for special tokens + if token_id >= 4: + f.write(f"{token} {token_id}\n") + + # Save BPE merges + merge_file = os.path.join(save_directory, prefix + vocab_files_names.get("merges_file", "bpe.codes")) + with open(merge_file, "w", encoding="utf-8") as writer: + writer.writelines( + " ".join(bpe_tokens) + "\n" + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]) + ) + + return (vocab_file, merge_file) + def add_from_file(self, f): """ Loads a pre-existing dictionary from a text file and adds its symbols to this instance. @@ -622,7 +554,6 @@ def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8") remove_illegal (bool): If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are kept "as is". - Returns: A unicode string with the entities removed. See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py diff --git a/src/transformers/models/big_bird/__init__.py b/src/transformers/models/big_bird/__init__.py index e9bc0f08af3e..52bcbffcff97 100644 --- a/src/transformers/models/big_bird/__init__.py +++ b/src/transformers/models/big_bird/__init__.py @@ -21,7 +21,6 @@ from .configuration_big_bird import * from .modeling_big_bird import * from .tokenization_big_bird import * - from .tokenization_big_bird_fast import * else: import sys diff --git a/src/transformers/models/big_bird/tokenization_big_bird.py b/src/transformers/models/big_bird/tokenization_big_bird.py index 56680972a63e..37bc4d6ea013 100644 --- a/src/transformers/models/big_bird/tokenization_big_bird.py +++ b/src/transformers/models/big_bird/tokenization_big_bird.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2021 Google Research and The HuggingFace Inc. team. All rights reserved. +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,44 +12,49 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Tokenization classes for BigBird.""" +"""Tokenization classes for Big Bird model.""" -import os -import re -from shutil import copyfile -from typing import Any, Optional +from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors +from tokenizers.models import Unigram -import sentencepiece as spm - -from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...tokenization_python import AddedToken +from ...tokenization_utils_tokenizers import TokenizersBackend from ...utils import logging -from ...utils.import_utils import requires logger = logging.get_logger(__name__) +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} -VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} +SPIECE_UNDERLINE = "▁" -@requires(backends=("sentencepiece",)) -class BigBirdTokenizer(PreTrainedTokenizer): - """ - Construct a BigBird tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece). - This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to - this superclass for more information regarding those methods. +class BigBirdTokenizer(TokenizersBackend): + """ + Construct a "fast" BigBird tokenizer (backed by HuggingFace's *tokenizers* library). Based on + [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This + tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods Args: - vocab_file (`str`): - [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that - contains the vocabulary necessary to instantiate a tokenizer. + vocab (`dict`, *optional*): + Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file. unk_token (`str`, *optional*, defaults to `""`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. bos_token (`str`, *optional*, defaults to `""`): - The begin of sequence token. + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + eos_token (`str`, *optional*, defaults to `""`): - The end of sequence token. + The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token + that is used for the end of sequence. The token used is the `sep_token`. pad_token (`str`, *optional*, defaults to `""`): The token used for padding, for example when batching sequences of different lengths. sep_token (`str`, *optional*, defaults to `"[SEP]"`): @@ -62,21 +67,14 @@ class BigBirdTokenizer(PreTrainedTokenizer): cls_token (`str`, *optional*, defaults to `"[CLS]"`): The classifier token which is used when doing sequence classification (classification of the whole sequence instead of per-token classification). It is the first token of the sequence when built with special tokens. - sp_model_kwargs (`dict`, *optional*): - Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for - SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, - to set: - - - `enable_sampling`: Enable subword regularization. - - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. - - - `nbest_size = {0,1}`: No sampling is performed. - - `nbest_size > 1`: samples from the nbest_size results. - - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) - using forward-filtering-and-backward-sampling algorithm. - - - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for - BPE-dropout. + add_prefix_space (`bool`, *optional*, defaults to `True`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. + vocab_file (`str`, *optional*): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + tokenizer_file (`str`, *optional*): + Path to a tokenizers JSON file containing the serialization of a tokenizer. """ vocab_files_names = VOCAB_FILES_NAMES @@ -85,7 +83,7 @@ class BigBirdTokenizer(PreTrainedTokenizer): def __init__( self, - vocab_file, + vocab=None, unk_token="", bos_token="", eos_token="", @@ -93,211 +91,88 @@ def __init__( sep_token="[SEP]", mask_token="[MASK]", cls_token="[CLS]", - sp_model_kwargs: Optional[dict[str, Any]] = None, + add_prefix_space=True, + vocab_file=None, + tokenizer_file=None, **kwargs, - ) -> None: + ): bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token - - # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - + self.add_prefix_space = add_prefix_space self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(vocab_file) + # Convert vocab to list of (token, score) tuples + if vocab is None: + vocab_scores = [(str(pad_token), 0.0), (str(eos_token), 0.0), (str(bos_token), 0.0)] + elif isinstance(vocab, dict): + vocab_scores = [(str(token), float(score)) for token, score in vocab.items()] + elif isinstance(vocab, list) and len(vocab) > 0: + if isinstance(vocab[0], (tuple, list)): + vocab_scores = [(str(token), float(score)) for token, score in vocab] + else: + vocab_scores = [(str(token), 0.0) for token in vocab] + else: + vocab_scores = [(str(pad_token), 0.0), (str(eos_token), 0.0), (str(bos_token), 0.0)] + + # Find unk_id in vocab + unk_token_content = str(unk_token) + unk_id = next((idx for idx, (token, _) in enumerate(vocab_scores) if token == unk_token_content), None) + if unk_id is None: + unk_id = min(len(vocab_scores), 100) + if len(vocab_scores) > 100: + vocab_scores.insert(100, (unk_token_content, 0.0)) + else: + vocab_scores.append((unk_token_content, 0.0)) + + # Ensure cls_token and sep_token are in vocab + cls_token_str = str(cls_token) + sep_token_str = str(sep_token) + cls_token_id = next((idx for idx, (token, _) in enumerate(vocab_scores) if token == cls_token_str), None) + sep_token_id = next((idx for idx, (token, _) in enumerate(vocab_scores) if token == sep_token_str), None) + + if cls_token_id is None: + cls_token_id = len(vocab_scores) + vocab_scores.append((cls_token_str, 0.0)) + if sep_token_id is None: + sep_token_id = len(vocab_scores) + vocab_scores.append((sep_token_str, 0.0)) + + self._tokenizer = Tokenizer(Unigram(vocab_scores, unk_id=unk_id, byte_fallback=False)) + self._tokenizer.normalizer = normalizers.Sequence( + [normalizers.Strip(left=False, right=True), normalizers.Replace(Regex(r" {2,}"), SPIECE_UNDERLINE)] + ) + + prepend_scheme = "always" if add_prefix_space else "never" + self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace( + replacement="▁", prepend_scheme=prepend_scheme, split=True + ) + self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme=prepend_scheme, split=True) super().__init__( + tokenizer_object=self._tokenizer, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, - sep_token=sep_token, mask_token=mask_token, cls_token=cls_token, - sp_model_kwargs=self.sp_model_kwargs, + sep_token=sep_token, **kwargs, ) - @property - def vocab_size(self): - return self.sp_model.get_piece_size() - - def get_vocab(self): - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) - return vocab - - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - return state - - def __setstate__(self, d): - self.__dict__ = d - - # for backward compatibility - if not hasattr(self, "sp_model_kwargs"): - self.sp_model_kwargs = {} + self.init_kwargs["add_prefix_space"] = add_prefix_space - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(self.vocab_file) - - def _tokenize(self, text: str) -> list[str]: - """Take as input a string and return a list of strings (tokens) for words/sub-words""" - return self.sp_model.encode(text, out_type=str) - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - return self.sp_model.piece_to_id(token) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - token = self.sp_model.IdToPiece(index) - return token - - # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (string) in a single string.""" - current_sub_tokens = [] - out_string = "" - prev_is_special = False - for token in tokens: - # make sure that special tokens are not decoded using sentencepiece model - if token in self.all_special_tokens: - if not prev_is_special: - out_string += " " - out_string += self.sp_model.decode(current_sub_tokens) + token - prev_is_special = True - current_sub_tokens = [] - else: - current_sub_tokens.append(token) - prev_is_special = False - out_string += self.sp_model.decode(current_sub_tokens) - return out_string.strip() - - def _decode( - self, - token_ids: list[int], - skip_special_tokens: bool = False, - clean_up_tokenization_spaces: Optional[bool] = None, - spaces_between_special_tokens: bool = True, - **kwargs, - ) -> str: - self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) - - filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) - - # To avoid mixing byte-level and unicode for byte-level BPT - # we need to build string separately for added tokens and byte-level tokens - # cf. https://github.com/huggingface/transformers/issues/1133 - sub_texts = [] - current_sub_text = [] - for token in filtered_tokens: - if skip_special_tokens and token in self.all_special_ids: - continue - if token in self.added_tokens_encoder: - if current_sub_text: - sub_texts.append(self.convert_tokens_to_string(current_sub_text)) - current_sub_text = [] - sub_texts.append(token) - else: - current_sub_text.append(token) - if current_sub_text: - sub_texts.append(self.convert_tokens_to_string(current_sub_text)) - - # Mimic the behavior of the Rust tokenizer: - # No space before [MASK] and [SEP] - if spaces_between_special_tokens: - text = re.sub(r" (\[(MASK|SEP)\])", r"\1", " ".join(sub_texts)) - else: - text = "".join(sub_texts) - - clean_up_tokenization_spaces = ( - clean_up_tokenization_spaces - if clean_up_tokenization_spaces is not None - else self.clean_up_tokenization_spaces + self._tokenizer.post_processor = processors.TemplateProcessing( + single=f"{cls_token_str}:0 $A:0 {sep_token_str}:0", + pair=f"{cls_token_str}:0 $A:0 {sep_token_str}:0 $B:1 {sep_token_str}:1", + special_tokens=[(cls_token_str, cls_token_id), (sep_token_str, sep_token_id)], ) - if clean_up_tokenization_spaces: - clean_text = self.clean_up_tokenization(text) - return clean_text - else: - return text - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: - if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): - copyfile(self.vocab_file, out_vocab_file) - elif not os.path.isfile(self.vocab_file): - with open(out_vocab_file, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto() - fi.write(content_spiece_model) - - return (out_vocab_file,) - - def build_inputs_with_special_tokens( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A Big Bird sequence has the following format: - - - single sequence: `[CLS] X [SEP]` - - pair of sequences: `[CLS] A [SEP] B [SEP]` - - Args: - token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + token_ids_1 + sep - - def get_special_tokens_mask( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False - ) -> list[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] __all__ = ["BigBirdTokenizer"] diff --git a/src/transformers/models/big_bird/tokenization_big_bird_fast.py b/src/transformers/models/big_bird/tokenization_big_bird_fast.py deleted file mode 100644 index 6148585a40b1..000000000000 --- a/src/transformers/models/big_bird/tokenization_big_bird_fast.py +++ /dev/null @@ -1,198 +0,0 @@ -# coding=utf-8 -# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tokenization classes for Big Bird model.""" - -import os -from shutil import copyfile -from typing import Optional - -from ...tokenization_utils import AddedToken -from ...tokenization_utils_fast import PreTrainedTokenizerFast -from ...utils import is_sentencepiece_available, logging - - -if is_sentencepiece_available(): - from .tokenization_big_bird import BigBirdTokenizer -else: - BigBirdTokenizer = None - -logger = logging.get_logger(__name__) -VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} - - -SPIECE_UNDERLINE = "▁" - - -class BigBirdTokenizerFast(PreTrainedTokenizerFast): - """ - Construct a "fast" BigBird tokenizer (backed by HuggingFace's *tokenizers* library). Based on - [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This - tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to - this superclass for more information regarding those methods - - Args: - vocab_file (`str`): - [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that - contains the vocabulary necessary to instantiate a tokenizer. - bos_token (`str`, *optional*, defaults to `""`): - The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. - - - - When building a sequence using special tokens, this is not the token that is used for the beginning of - sequence. The token used is the `cls_token`. - - - - eos_token (`str`, *optional*, defaults to `""`): - The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token - that is used for the end of sequence. The token used is the `sep_token`. - unk_token (`str`, *optional*, defaults to `""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - sep_token (`str`, *optional*, defaults to `"[SEP]"`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for - sequence classification or for a text and a question for question answering. It is also used as the last - token of a sequence built with special tokens. - pad_token (`str`, *optional*, defaults to `""`): - The token used for padding, for example when batching sequences of different lengths. - cls_token (`str`, *optional*, defaults to `"[CLS]"`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. - mask_token (`str`, *optional*, defaults to `"[MASK]"`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - """ - - vocab_files_names = VOCAB_FILES_NAMES - slow_tokenizer_class = BigBirdTokenizer - model_input_names = ["input_ids", "attention_mask"] - prefix_tokens: list[int] = [] - - def __init__( - self, - vocab_file=None, - tokenizer_file=None, - unk_token="", - bos_token="", - eos_token="", - pad_token="", - sep_token="[SEP]", - mask_token="[MASK]", - cls_token="[CLS]", - **kwargs, - ): - bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token - eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token - unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token - pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token - cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token - sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token - - # Mask token behave like a normal word, i.e. include the space before it - mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - - super().__init__( - vocab_file, - tokenizer_file=tokenizer_file, - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - **kwargs, - ) - - self.vocab_file = vocab_file - - def build_inputs_with_special_tokens( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. An BigBird sequence has the following format: - - - single sequence: `[CLS] X [SEP]` - - pair of sequences: `[CLS] A [SEP] B [SEP]` - - Args: - token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - if token_ids_1 is None: - return cls + token_ids_0 + sep - return cls + token_ids_0 + sep + token_ids_1 + sep - - def get_special_tokens_mask( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False - ) -> list[int]: - """ - Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`List[int]`): - List of ids. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Set to True if the token list is already formatted with special tokens for the model - - Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return [1 if x in [self.sep_token_id, self.cls_token_id] else 0 for x in token_ids_0] - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: - if not self.can_save_slow_tokenizer: - raise ValueError( - "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " - "tokenizer." - ) - - if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): - copyfile(self.vocab_file, out_vocab_file) - - return (out_vocab_file,) - - -__all__ = ["BigBirdTokenizerFast"] diff --git a/src/transformers/models/biogpt/tokenization_biogpt.py b/src/transformers/models/biogpt/tokenization_biogpt.py index f84403ca7ddc..28108dba6624 100644 --- a/src/transformers/models/biogpt/tokenization_biogpt.py +++ b/src/transformers/models/biogpt/tokenization_biogpt.py @@ -18,7 +18,7 @@ import os from typing import Optional -from ...tokenization_utils import PreTrainedTokenizer +from ...tokenization_python import PreTrainedTokenizer from ...utils import logging diff --git a/src/transformers/models/blenderbot/__init__.py b/src/transformers/models/blenderbot/__init__.py index 6e728fd0914a..69beb03a4583 100644 --- a/src/transformers/models/blenderbot/__init__.py +++ b/src/transformers/models/blenderbot/__init__.py @@ -21,7 +21,6 @@ from .configuration_blenderbot import * from .modeling_blenderbot import * from .tokenization_blenderbot import * - from .tokenization_blenderbot_fast import * else: import sys diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py index 76719fa25494..ddec6a48872a 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py @@ -14,14 +14,11 @@ # limitations under the License. """Tokenization class for Blenderbot.""" -import json -import os -from functools import lru_cache -from typing import Optional +from tokenizers import Tokenizer, decoders, pre_tokenizers, processors +from tokenizers.models import BPE -import regex as re - -from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...tokenization_utils_base import AddedToken +from ...tokenization_utils_tokenizers import TokenizersBackend from ...utils import logging @@ -35,61 +32,20 @@ } -@lru_cache -# Copied from transformers.models.roberta.tokenization_roberta.bytes_to_unicode -def bytes_to_unicode(): - """ - Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control - characters the bpe code barfs on. - - The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab - if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for - decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup - tables between utf-8 bytes and unicode strings. - """ - bs = ( - list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) - ) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8 + n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - - -# Copied from transformers.models.roberta.tokenization_roberta.get_pairs -def get_pairs(word): +class BlenderbotTokenizer(TokenizersBackend): """ - Return set of symbol pairs in a word. - - Word is represented as tuple of symbols (symbols being variable-length strings). - """ - pairs = set() - prev_char = word[0] - for char in word[1:]: - pairs.add((prev_char, char)) - prev_char = char - return pairs - - -class BlenderbotTokenizer(PreTrainedTokenizer): - """ - Constructs a Blenderbot tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding. + Construct a "fast" Blenderbot tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2 + tokenizer, using byte-level Byte-Pair-Encoding. This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will be encoded differently whether it is at the beginning of the sentence (without space) or not: ```python - >>> from transformers import BlenderbotTokenizer + >>> from transformers import BlenderbotTokenizerFast - >>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B") - >>> tokenizer.add_prefix_space = False + >>> tokenizer = BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B") >>> tokenizer("Hello world")["input_ids"] - [47, 921, 86, 1085, 2] + [6950, 1085, 2] >>> tokenizer(" Hello world")["input_ids"] [6950, 1085, 2] @@ -100,21 +56,14 @@ class BlenderbotTokenizer(PreTrainedTokenizer): - When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one). + When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. - This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to - this superclass for more information regarding those methods. + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. Args: - vocab_file (`str`): - Path to the vocabulary file. - merges_file (`str`): - Path to the merges file. - errors (`str`, *optional*, defaults to `"replace"`): - Paradigm to follow when decoding bytes to UTF-8. See - [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. bos_token (`str`, *optional*, defaults to `""`): The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. @@ -150,20 +99,20 @@ class BlenderbotTokenizer(PreTrainedTokenizer): mask_token (`str`, *optional*, defaults to `""`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. - add_prefix_space (`bool`, *optional*, defaults to `False`): + add_prefix_space (`bool`, *optional*, defaults to `True`): Whether or not to add an initial space to the input. This allows to treat the leading word just as any other word. (Blenderbot tokenizer detect beginning of words by the preceding space). + vocab (`dict`, *optional*): + Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file. + merges (`list`, *optional*): + Custom merges list. If not provided, merges are loaded from merges_file. """ vocab_files_names = VOCAB_FILES_NAMES model_input_names = ["input_ids", "attention_mask"] - # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.__init__ with Roberta->Blenderbot, RoBERTa->Blenderbot def __init__( self, - vocab_file, - merges_file, - errors="replace", bos_token="", eos_token="", sep_token="", @@ -171,240 +120,68 @@ def __init__( unk_token="", pad_token="", mask_token="", - add_prefix_space=False, + add_prefix_space=True, + vocab=None, + merges=None, **kwargs, ): - bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token - pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token - eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token - unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token - sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token - cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token - - # Mask token behave like a normal word, i.e. include the space before it + self.add_prefix_space = add_prefix_space mask_token = ( AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False) if isinstance(mask_token, str) else mask_token ) - # these special tokens are not part of the vocab.json, let's add them in the correct order - - with open(vocab_file, encoding="utf-8") as vocab_handle: - self.encoder = json.load(vocab_handle) - self.decoder = {v: k for k, v in self.encoder.items()} - self.errors = errors # how to handle errors in decoding - self.byte_encoder = bytes_to_unicode() - self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} - with open(merges_file, encoding="utf-8") as merges_handle: - bpe_merges = merges_handle.read().split("\n")[1:-1] - bpe_merges = [tuple(merge.split()) for merge in bpe_merges] - self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) - self.cache = {} - self.add_prefix_space = add_prefix_space + if vocab is not None and merges is not None: + self._vocab = ( + {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab + ) + self._merges = merges + else: + # Initialize with minimal vocab + self._vocab = { + str(bos_token): 0, + str(pad_token): 1, + str(eos_token): 2, + str(unk_token): 3, + str(mask_token): 4, + } + self._merges = [] + + self._tokenizer = Tokenizer( + BPE( + vocab=self._vocab, + merges=self._merges, + dropout=None, + continuing_subword_prefix="", + end_of_word_suffix="", + fuse_unk=False, + ) + ) - # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions - self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + self._tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space) + self._tokenizer.decoder = decoders.ByteLevel() + self._tokenizer.post_processor = processors.RobertaProcessing( + sep=(str(eos_token), self._vocab.get(str(eos_token), 2)), + cls=(str(bos_token), self._vocab.get(str(bos_token), 0)), + add_prefix_space=add_prefix_space, + trim_offsets=True, + ) + + tokenizer_object = self._tokenizer super().__init__( - errors=errors, + tokenizer_object=tokenizer_object, bos_token=bos_token, eos_token=eos_token, - unk_token=unk_token, sep_token=sep_token, cls_token=cls_token, + unk_token=unk_token, pad_token=pad_token, mask_token=mask_token, add_prefix_space=add_prefix_space, **kwargs, ) - @property - # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.vocab_size with Roberta->Blenderbot, RoBERTa->Blenderbot - def vocab_size(self): - return len(self.encoder) - - # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_vocab with Roberta->Blenderbot, RoBERTa->Blenderbot - def get_vocab(self): - vocab = dict(self.encoder).copy() - vocab.update(self.added_tokens_encoder) - return vocab - - # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.bpe with Roberta->Blenderbot, RoBERTa->Blenderbot - def bpe(self, token): - if token in self.cache: - return self.cache[token] - word = tuple(token) - pairs = get_pairs(word) - - if not pairs: - return token - - while True: - bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) - if bigram not in self.bpe_ranks: - break - first, second = bigram - new_word = [] - i = 0 - while i < len(word): - try: - j = word.index(first, i) - except ValueError: - new_word.extend(word[i:]) - break - else: - new_word.extend(word[i:j]) - i = j - - if word[i] == first and i < len(word) - 1 and word[i + 1] == second: - new_word.append(first + second) - i += 2 - else: - new_word.append(word[i]) - i += 1 - new_word = tuple(new_word) - word = new_word - if len(word) == 1: - break - else: - pairs = get_pairs(word) - word = " ".join(word) - self.cache[token] = word - return word - - # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._tokenize with Roberta->Blenderbot, RoBERTa->Blenderbot - def _tokenize(self, text): - """Tokenize a string.""" - bpe_tokens = [] - for token in re.findall(self.pat, text): - token = "".join( - self.byte_encoder[b] for b in token.encode("utf-8") - ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) - bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) - return bpe_tokens - - # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._convert_token_to_id with Roberta->Blenderbot, RoBERTa->Blenderbot - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - return self.encoder.get(token, self.encoder.get(self.unk_token)) - - # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._convert_id_to_token with Roberta->Blenderbot, RoBERTa->Blenderbot - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - return self.decoder.get(index) - - # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.convert_tokens_to_string with Roberta->Blenderbot, RoBERTa->Blenderbot - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (string) in a single string.""" - text = "".join(tokens) - text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) - return text - - # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.save_vocabulary with Roberta->Blenderbot, RoBERTa->Blenderbot - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: - if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") - return - vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - merge_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] - ) - - with open(vocab_file, "w", encoding="utf-8") as f: - f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") - - index = 0 - with open(merge_file, "w", encoding="utf-8") as writer: - writer.write("#version: 0.2\n") - for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): - if index != token_index: - logger.warning( - f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." - " Please check that the tokenizer is not corrupted!" - ) - index = token_index - writer.write(" ".join(bpe_tokens) + "\n") - index += 1 - - return vocab_file, merge_file - - # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_special_tokens_mask with Roberta->Blenderbot, RoBERTa->Blenderbot - def get_special_tokens_mask( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False - ) -> list[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`list[int]`): - List of IDs. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] - - # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.create_token_type_ids_from_sequences with Roberta->Blenderbot, RoBERTa->Blenderbot - def create_token_type_ids_from_sequences( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. Blenderbot does not - make use of token type ids, therefore a list of zeros is returned. - - Args: - token_ids_0 (`list[int]`): - List of IDs. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `list[int]`: List of zeros. - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - - # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.prepare_for_tokenization with Roberta->Blenderbot, RoBERTa->Blenderbot - def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): - add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) - if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): - text = " " + text - return (text, kwargs) - - def build_inputs_with_special_tokens(self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None): - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A Blenderbot sequence has the following format: - - single sequence: ` X ` - - Args: - token_ids_0 (`list[int]`): - List of IDs to which the special tokens will be added - token_ids_1 (`list[int]`, *optional*): - Will be ignored - Returns: - `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - return token_ids_0 + [self.eos_token_id] - __all__ = ["BlenderbotTokenizer"] diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py deleted file mode 100644 index 0b84200e02d5..000000000000 --- a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py +++ /dev/null @@ -1,284 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Fast Tokenization class for Blenderbot.""" - -import json -from typing import Optional - -from tokenizers import processors - -from ...tokenization_utils_base import AddedToken, BatchEncoding -from ...tokenization_utils_fast import PreTrainedTokenizerFast -from ...utils import logging -from .tokenization_blenderbot import BlenderbotTokenizer - - -logger = logging.get_logger(__name__) - - -VOCAB_FILES_NAMES = { - "vocab_file": "vocab.json", - "merges_file": "merges.txt", - "tokenizer_config_file": "tokenizer_config.json", -} - - -class BlenderbotTokenizerFast(PreTrainedTokenizerFast): - """ - Construct a "fast" Blenderbot tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2 - tokenizer, using byte-level Byte-Pair-Encoding. - - This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will - be encoded differently whether it is at the beginning of the sentence (without space) or not: - - ```python - >>> from transformers import BlenderbotTokenizerFast - - >>> tokenizer = BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B") - >>> tokenizer("Hello world")["input_ids"] - [6950, 1085, 2] - - >>> tokenizer(" Hello world")["input_ids"] - [6950, 1085, 2] - ``` - - You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you - call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. - - - - When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. - - - - This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should - refer to this superclass for more information regarding those methods. - - Args: - vocab_file (`str`): - Path to the vocabulary file. - merges_file (`str`): - Path to the merges file. - errors (`str`, *optional*, defaults to `"replace"`): - Paradigm to follow when decoding bytes to UTF-8. See - [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. - bos_token (`str`, *optional*, defaults to `""`): - The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. - - - - When building a sequence using special tokens, this is not the token that is used for the beginning of - sequence. The token used is the `cls_token`. - - - - eos_token (`str`, *optional*, defaults to `""`): - The end of sequence token. - - - - When building a sequence using special tokens, this is not the token that is used for the end of sequence. - The token used is the `sep_token`. - - - - sep_token (`str`, *optional*, defaults to `""`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for - sequence classification or for a text and a question for question answering. It is also used as the last - token of a sequence built with special tokens. - cls_token (`str`, *optional*, defaults to `""`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. - unk_token (`str`, *optional*, defaults to `""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - pad_token (`str`, *optional*, defaults to `""`): - The token used for padding, for example when batching sequences of different lengths. - mask_token (`str`, *optional*, defaults to `""`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - add_prefix_space (`bool`, *optional*, defaults to `False`): - Whether or not to add an initial space to the input. This allows to treat the leading word just as any - other word. (Blenderbot tokenizer detect beginning of words by the preceding space). - trim_offsets (`bool`, *optional*, defaults to `True`): - Whether the post processing step should trim offsets to avoid including whitespaces. - """ - - vocab_files_names = VOCAB_FILES_NAMES - model_input_names = ["input_ids", "attention_mask"] - slow_tokenizer_class = BlenderbotTokenizer - - # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast.__init__ with Roberta->Blenderbot, RoBERTa->Blenderbot - def __init__( - self, - vocab_file=None, - merges_file=None, - tokenizer_file=None, - errors="replace", - bos_token="", - eos_token="", - sep_token="", - cls_token="", - unk_token="", - pad_token="", - mask_token="", - add_prefix_space=False, - trim_offsets=True, - **kwargs, - ): - mask_token = ( - AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False) - if isinstance(mask_token, str) - else mask_token - ) - super().__init__( - vocab_file, - merges_file, - tokenizer_file=tokenizer_file, - errors=errors, - bos_token=bos_token, - eos_token=eos_token, - sep_token=sep_token, - cls_token=cls_token, - unk_token=unk_token, - pad_token=pad_token, - mask_token=mask_token, - add_prefix_space=add_prefix_space, - trim_offsets=trim_offsets, - **kwargs, - ) - - tokenizer_component = "post_processor" - tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None) - if tokenizer_component_instance: - state = json.loads(tokenizer_component_instance.__getstate__()) - - # The lists 'sep' and 'cls' must be cased in tuples for the object `post_processor_class` - if "sep" in state: - state["sep"] = tuple(state["sep"]) - if "cls" in state: - state["cls"] = tuple(state["cls"]) - - changes_to_apply = False - - if state.get("add_prefix_space", add_prefix_space) != add_prefix_space: - state["add_prefix_space"] = add_prefix_space - changes_to_apply = True - - if state.get("trim_offsets", trim_offsets) != trim_offsets: - state["trim_offsets"] = trim_offsets - changes_to_apply = True - - if changes_to_apply: - component_class = getattr(processors, state.pop("type")) - new_value = component_class(**state) - setattr(self.backend_tokenizer, tokenizer_component, new_value) - - @property - # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast.mask_token with Roberta->Blenderbot, RoBERTa->Blenderbot - def mask_token(self) -> str: - """ - `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not - having been set. - - Blenderbot tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily - comprise the space before the **. - """ - if self._mask_token is None: - if self.verbose: - logger.error("Using mask_token, but it is not set yet.") - return None - return str(self._mask_token) - - @mask_token.setter - def mask_token(self, value): - """ - Overriding the default behavior of the mask token to have it eat the space before it. - - This is needed to preserve backward compatibility with all the previously used models based on Roberta. - """ - # Mask token behave like a normal word, i.e. include the space before it - # So we set lstrip to True - value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value - self._mask_token = value - - # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast._batch_encode_plus with Roberta->Blenderbot, RoBERTa->Blenderbot - def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: - is_split_into_words = kwargs.get("is_split_into_words", False) - assert self.add_prefix_space or not is_split_into_words, ( - f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " - "to use it with pretokenized inputs." - ) - - return super()._batch_encode_plus(*args, **kwargs) - - # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast._encode_plus with Roberta->Blenderbot, RoBERTa->Blenderbot - def _encode_plus(self, *args, **kwargs) -> BatchEncoding: - is_split_into_words = kwargs.get("is_split_into_words", False) - - assert self.add_prefix_space or not is_split_into_words, ( - f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " - "to use it with pretokenized inputs." - ) - - return super()._encode_plus(*args, **kwargs) - - # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast.save_vocabulary with Roberta->Blenderbot, RoBERTa->Blenderbot - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: - files = self._tokenizer.model.save(save_directory, name=filename_prefix) - return tuple(files) - - # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast.create_token_type_ids_from_sequences with Roberta->Blenderbot, RoBERTa->Blenderbot - def create_token_type_ids_from_sequences( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. Blenderbot does not - make use of token type ids, therefore a list of zeros is returned. - - Args: - token_ids_0 (`list[int]`): - List of IDs. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `list[int]`: List of zeros. - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - - def build_inputs_with_special_tokens(self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None): - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A Blenderbot sequence has the following format: - - single sequence: ` X ` - - Args: - token_ids_0 (`list[int]`): - List of IDs to which the special tokens will be added - token_ids_1 (`list[int]`, *optional*): - Will be ignored - Returns: - `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - return token_ids_0 + [self.eos_token_id] - - -__all__ = ["BlenderbotTokenizerFast"] diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py index adb54025ce23..c075e7727e43 100644 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py @@ -20,7 +20,7 @@ import regex as re -from ...tokenization_utils import PreTrainedTokenizer +from ...tokenization_python import PreTrainedTokenizer from ...utils import logging @@ -96,7 +96,9 @@ def __init__( merges = [tuple(merge.split()) for merge in merges] self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {} + super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs) + self.special_tokens_pattern = None @property def vocab_size(self) -> int: diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py deleted file mode 100644 index 7d905dbbc5b2..000000000000 --- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py +++ /dev/null @@ -1,103 +0,0 @@ -# coding=utf-8 -# Copyright 2021, The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Fast tokenization class for BlenderbotSmall.""" - -from typing import Optional - -from tokenizers import ByteLevelBPETokenizer - -from ...tokenization_utils_fast import PreTrainedTokenizerFast -from ...utils import logging -from .tokenization_blenderbot_small import BlenderbotSmallTokenizer - - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = { - "vocab_file": "vocab.json", - "merges_file": "merges.txt", - "tokenizer_config_file": "tokenizer_config.json", -} - - -class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast): - """ - Construct a "fast" BlenderbotSmall tokenizer (backed by HuggingFace's *tokenizers* library). - - Args: - vocab_file (`str`): - Path to the vocabulary file. - """ - - vocab_files_names = VOCAB_FILES_NAMES - slow_tokenizer_class = BlenderbotSmallTokenizer - - def __init__( - self, - vocab_file=None, - merges_file=None, - unk_token="<|endoftext|>", - bos_token="<|endoftext|>", - eos_token="<|endoftext|>", - add_prefix_space=False, - trim_offsets=True, - **kwargs, - ): - super().__init__( - ByteLevelBPETokenizer( - vocab=vocab_file, - merges=merges_file, - add_prefix_space=add_prefix_space, - trim_offsets=trim_offsets, - ), - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - **kwargs, - ) - self.add_prefix_space = add_prefix_space - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id] - if token_ids_1 is None: - return output - - return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id] - - def create_token_type_ids_from_sequences( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. BlenderbotSmall - does not make use of token type ids, therefore a list of zeros is returned. - - Args: - token_ids_0 (`list[int]`): - List of IDs. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `list[int]`: List of zeros. - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - - -__all__ = ["BlenderbotSmallTokenizerFast"] diff --git a/src/transformers/models/bloom/__init__.py b/src/transformers/models/bloom/__init__.py index 4a938fd80b25..ba39d13cedcb 100644 --- a/src/transformers/models/bloom/__init__.py +++ b/src/transformers/models/bloom/__init__.py @@ -20,7 +20,7 @@ if TYPE_CHECKING: from .configuration_bloom import * from .modeling_bloom import * - from .tokenization_bloom_fast import * + from .tokenization_bloom import * else: import sys diff --git a/src/transformers/models/bloom/tokenization_bloom_fast.py b/src/transformers/models/bloom/tokenization_bloom_fast.py deleted file mode 100644 index 04ed9d701da4..000000000000 --- a/src/transformers/models/bloom/tokenization_bloom_fast.py +++ /dev/null @@ -1,146 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tokenization classes for Bloom.""" - -from typing import Optional - -from ...tokenization_utils_base import BatchEncoding -from ...tokenization_utils_fast import PreTrainedTokenizerFast -from ...utils import logging - - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"} - - -class BloomTokenizerFast(PreTrainedTokenizerFast): - """ - Construct a "fast" Bloom tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level - Byte-Pair-Encoding. - - This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will - be encoded differently whether it is at the beginning of the sentence (without space) or not: - - ```python - >>> from transformers import BloomTokenizerFast - - >>> tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom") - >>> tokenizer("Hello world")["input_ids"] - [59414, 8876] - - >>> tokenizer(" Hello world")["input_ids"] - [86153, 8876] - ``` - - You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since - the model was not pretrained this way, it might yield a decrease in performance. - - - - When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. - - - - This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should - refer to this superclass for more information regarding those methods. - - Args: - vocab_file (`str`): - Path to the vocabulary file. - merges_file (`str`): - Path to the merges file. - errors (`str`, *optional*, defaults to `"replace"`): - Paradigm to follow when decoding bytes to UTF-8. See - [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. - unk_token (`str`, *optional*, defaults to `<|endoftext|>`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - bos_token (`str`, *optional*, defaults to `<|endoftext|>`): - The beginning of sequence token. - eos_token (`str`, *optional*, defaults to `<|endoftext|>`): - The end of sequence token. - add_prefix_space (`bool`, *optional*, defaults to `False`): - Whether or not to add an initial space to the input. This allows to treat the leading word just as any - other word. (Bloom tokenizer detect beginning of words by the preceding space). - trim_offsets (`bool`, *optional*, defaults to `True`): - Whether or not the post-processing step should trim offsets to avoid including whitespaces. - """ - - vocab_files_names = VOCAB_FILES_NAMES - model_input_names = ["input_ids", "attention_mask"] - slow_tokenizer_class = None - # No `max_model_input_sizes` as BLOOM uses ALiBi positional embeddings - - def __init__( - self, - vocab_file=None, - merges_file=None, - tokenizer_file=None, - unk_token="", - bos_token="", - eos_token="", - pad_token="", - add_prefix_space=False, - clean_up_tokenization_spaces=False, - **kwargs, - ): - super().__init__( - vocab_file=vocab_file, - merges_file=merges_file, - tokenizer_file=tokenizer_file, - unk_token=unk_token, - bos_token=bos_token, - eos_token=eos_token, - pad_token=pad_token, - add_prefix_space=add_prefix_space, - clean_up_tokenization_spaces=clean_up_tokenization_spaces, - **kwargs, - ) - # This is a `tokenizers.pre_tokenizers.Sequence` - for pre_tokenizer in self.backend_tokenizer.pre_tokenizer: - if hasattr(pre_tokenizer, "add_prefix_space"): - pre_tokenizer.add_prefix_space = add_prefix_space - self.backend_tokenizer.decoder.add_prefix_space = add_prefix_space - - self.add_prefix_space = add_prefix_space - - def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: - is_split_into_words = kwargs.get("is_split_into_words", False) - if not (self.add_prefix_space or not is_split_into_words): - raise Exception( - f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with" - " pretokenized inputs." - ) - - return super()._batch_encode_plus(*args, **kwargs) - - def _encode_plus(self, *args, **kwargs) -> BatchEncoding: - is_split_into_words = kwargs.get("is_split_into_words", False) - - if not (self.add_prefix_space or not is_split_into_words): - raise Exception( - f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with" - " pretokenized inputs." - ) - - return super()._encode_plus(*args, **kwargs) - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: - files = self._tokenizer.model.save(save_directory, name=filename_prefix) - return tuple(files) - - -__all__ = ["BloomTokenizerFast"] diff --git a/src/transformers/models/byt5/tokenization_byt5.py b/src/transformers/models/byt5/tokenization_byt5.py index 2a9804db1014..68a187b22fb5 100644 --- a/src/transformers/models/byt5/tokenization_byt5.py +++ b/src/transformers/models/byt5/tokenization_byt5.py @@ -17,7 +17,7 @@ import warnings from typing import Optional -from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...tokenization_python import AddedToken, PreTrainedTokenizer from ...utils import logging diff --git a/src/transformers/models/camembert/__init__.py b/src/transformers/models/camembert/__init__.py index a3a9c395eb5b..070e8311ae6f 100644 --- a/src/transformers/models/camembert/__init__.py +++ b/src/transformers/models/camembert/__init__.py @@ -21,7 +21,6 @@ from .configuration_camembert import * from .modeling_camembert import * from .tokenization_camembert import * - from .tokenization_camembert_fast import * else: import sys diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py index cd6e399f208d..fb0a2af1b7f7 100644 --- a/src/transformers/models/camembert/tokenization_camembert.py +++ b/src/transformers/models/camembert/tokenization_camembert.py @@ -14,38 +14,32 @@ # limitations under the License """Tokenization classes for Camembert model.""" -import os -from shutil import copyfile -from typing import Any, Optional +from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors +from tokenizers.models import Unigram -import sentencepiece as spm - -from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...tokenization_python import AddedToken +from ...tokenization_utils_tokenizers import TokenizersBackend from ...utils import logging -from ...utils.import_utils import requires logger = logging.get_logger(__name__) -VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"} SPIECE_UNDERLINE = "▁" -@requires(backends=("sentencepiece",)) -class CamembertTokenizer(PreTrainedTokenizer): +class CamembertTokenizer(TokenizersBackend): """ - Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Construct a CamemBERT tokenizer. Based on - [SentencePiece](https://github.com/google/sentencepiece). + Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from + [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on + [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). - This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to - this superclass for more information regarding those methods. + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. Args: - vocab_file (`str`): - [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that - contains the vocabulary necessary to instantiate a tokenizer. bos_token (`str`, *optional*, defaults to `""`): The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. @@ -81,35 +75,24 @@ class CamembertTokenizer(PreTrainedTokenizer): mask_token (`str`, *optional*, defaults to `""`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. - additional_special_tokens (`list[str]`, *optional*, defaults to `['NOTUSED', 'NOTUSED', 'NOTUSED']`): + additional_special_tokens (`list[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. - sp_model_kwargs (`dict`, *optional*): - Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for - SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, - to set: - - - `enable_sampling`: Enable subword regularization. - - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. - - - `nbest_size = {0,1}`: No sampling is performed. - - `nbest_size > 1`: samples from the nbest_size results. - - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) - using forward-filtering-and-backward-sampling algorithm. - - - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for - BPE-dropout. - - Attributes: - sp_model (`SentencePieceProcessor`): - The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). + add_prefix_space (`bool`, *optional*, defaults to `True`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. + vocab_file (`str`, *optional*): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + vocab (`dict`, *optional*): + Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file. """ vocab_files_names = VOCAB_FILES_NAMES model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = None def __init__( self, - vocab_file, bos_token="", eos_token="", sep_token="", @@ -117,207 +100,74 @@ def __init__( unk_token="", pad_token="", mask_token="", - additional_special_tokens=["NOTUSED", "NOTUSED", "NOTUSED"], - sp_model_kwargs: Optional[dict[str, Any]] = None, + additional_special_tokens=None, + add_prefix_space=True, + vocab_file=None, + vocab=None, **kwargs, - ) -> None: - # Mask token behave like a normal word, i.e. include the space before it - mask_token = ( - AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False, special=True) - if isinstance(mask_token, str) - else mask_token - ) - - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(str(vocab_file)) + ): self.vocab_file = vocab_file + self.add_prefix_space = add_prefix_space + + mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token + + if additional_special_tokens is None: + additional_special_tokens = ["NOTUSED", "NOTUSED", "NOTUSED"] + + if vocab is not None and isinstance(vocab, list): + self._vocab = list(vocab) + unk_index = next(i for i, (tok, _) in enumerate(self._vocab) if tok == str(unk_token)) + self._tokenizer = Tokenizer(Unigram(self._vocab, unk_id=unk_index, byte_fallback=False)) + else: + self._vocab = [ + ("NOTUSED", 0.0), + (str(pad_token), 0.0), + ("NOTUSED", 0.0), + (str(unk_token), 0.0), + ("NOTUSED", -100), + (str(mask_token), 0.0), + ] + self._tokenizer = Tokenizer(Unigram(self._vocab, unk_id=3, byte_fallback=False)) + + self._tokenizer.normalizer = normalizers.Sequence( + [ + normalizers.Replace("\n", " "), + normalizers.Replace("\r", " "), + normalizers.Replace("\t", " "), + normalizers.Strip(left=False, right=True), + normalizers.Replace(Regex(" {2,}"), "▁"), + ] + ) - # HACK: These tokens were added by the author for an obscure reason as they were already part of the - # sentencepiece vocabulary (this is the case for and and ). - # In this case it is recommended to properly set the tokens by hand. - self._added_tokens_decoder = { - 0: AddedToken("NOTUSED", special=True), - 1: AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token, - 2: AddedToken("NOTUSED", special=True), - 3: AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token, - 4: AddedToken("NOTUSED", special=True), - } - - self.fairseq_offset = 4 # 3 tokens are newly added, but the offset starts from 4 + prepend_scheme = "always" if add_prefix_space else "never" + self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=prepend_scheme) + self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme=prepend_scheme) - # legacy: camemebert is a particular case were we have to make sure `"NOTUSED"` is here - if "added_tokens_decoder" in kwargs: - # this is the only class that requires this unfortunately..... - # the reason is that the fast version has a whole. - kwargs["added_tokens_decoder"].update(self._added_tokens_decoder) + tokenizer_object = self._tokenizer super().__init__( + tokenizer_object=tokenizer_object, bos_token=bos_token, eos_token=eos_token, - unk_token=unk_token, sep_token=sep_token, cls_token=cls_token, + unk_token=unk_token, pad_token=pad_token, mask_token=mask_token, additional_special_tokens=additional_special_tokens, - sp_model_kwargs=self.sp_model_kwargs, + add_prefix_space=add_prefix_space, **kwargs, ) - @property - def vocab_size(self): - # The length of the vocabulary without added tokens is len(self.sp_model) but the added tokens are added at the beginning. - return len(self.sp_model) - - def get_vocab(self): - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.fairseq_offset)} - vocab.update(self.added_tokens_encoder) - return vocab - - def _tokenize(self, text: str) -> list[str]: - return self.sp_model.encode(text, out_type=str) - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - # specific to camembert, both 3 and 4 point to the unk token. - if self.sp_model.PieceToId(token) == 0: - # Convert sentence piece unk token to fairseq unk token index - return self.unk_token_id - return self.fairseq_offset + self.sp_model.PieceToId(token) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - return self.sp_model.IdToPiece(index - self.fairseq_offset) - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (string) in a single string.""" - # TODO decode outputs do not match between fast and slow - current_sub_tokens = [] - out_string = "" - prev_is_special = False - for token in tokens: - # make sure that special tokens are not decoded using sentencepiece model - if token in self.all_special_tokens: - if not prev_is_special: - out_string += " " - out_string += self.sp_model.decode(current_sub_tokens) + token - prev_is_special = True - current_sub_tokens = [] - else: - current_sub_tokens.append(token) - prev_is_special = False - out_string += self.sp_model.decode(current_sub_tokens) - return out_string.strip() - - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - return state - - def __setstate__(self, d): - self.__dict__ = d - - # for backward compatibility - if not hasattr(self, "sp_model_kwargs"): - self.sp_model_kwargs = {} - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(self.vocab_file) - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: - if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + # always adds BOS/EOS with " " separator for pairs + self._tokenizer.post_processor = processors.TemplateProcessing( + single=f"{self.bos_token} $A {self.eos_token}", + pair=f"{self.bos_token} $A {self.eos_token} {self.eos_token} $B {self.eos_token}", + special_tokens=[ + (self.bos_token, self.bos_token_id), + (self.eos_token, self.eos_token_id), + ], ) - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): - copyfile(self.vocab_file, out_vocab_file) - elif not os.path.isfile(self.vocab_file): - with open(out_vocab_file, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto() - fi.write(content_spiece_model) - - return (out_vocab_file,) - - def build_inputs_with_special_tokens( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. An CamemBERT sequence has the following format: - - - single sequence: ` X ` - - pair of sequences: ` A B ` - - Args: - token_ids_0 (`list[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + sep + token_ids_1 + sep - - def get_special_tokens_mask( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False - ) -> list[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`list[int]`): - List of IDs. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] - - def create_token_type_ids_from_sequences( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. CamemBERT, like - RoBERTa, does not make use of token type ids, therefore a list of zeros is returned. - - Args: - token_ids_0 (`list[int]`): - List of IDs. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `list[int]`: List of zeros. - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - __all__ = ["CamembertTokenizer"] diff --git a/src/transformers/models/camembert/tokenization_camembert_fast.py b/src/transformers/models/camembert/tokenization_camembert_fast.py deleted file mode 100644 index 423058ed959a..000000000000 --- a/src/transformers/models/camembert/tokenization_camembert_fast.py +++ /dev/null @@ -1,197 +0,0 @@ -# coding=utf-8 -# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License -"""Fast tokenization classes for Camembert model.""" - -import os -from shutil import copyfile -from typing import Optional - -from ...tokenization_utils import AddedToken -from ...tokenization_utils_fast import PreTrainedTokenizerFast -from ...utils import is_sentencepiece_available, logging - - -if is_sentencepiece_available(): - from .tokenization_camembert import CamembertTokenizer -else: - CamembertTokenizer = None - - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"} - - -SPIECE_UNDERLINE = "▁" - - -class CamembertTokenizerFast(PreTrainedTokenizerFast): - """ - Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from - [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on - [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models). - - This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should - refer to this superclass for more information regarding those methods. - - Args: - vocab_file (`str`): - [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that - contains the vocabulary necessary to instantiate a tokenizer. - bos_token (`str`, *optional*, defaults to `""`): - The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. - - - - When building a sequence using special tokens, this is not the token that is used for the beginning of - sequence. The token used is the `cls_token`. - - - - eos_token (`str`, *optional*, defaults to `""`): - The end of sequence token. - - - - When building a sequence using special tokens, this is not the token that is used for the end of sequence. - The token used is the `sep_token`. - - - - sep_token (`str`, *optional*, defaults to `""`): - The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for - sequence classification or for a text and a question for question answering. It is also used as the last - token of a sequence built with special tokens. - cls_token (`str`, *optional*, defaults to `""`): - The classifier token which is used when doing sequence classification (classification of the whole sequence - instead of per-token classification). It is the first token of the sequence when built with special tokens. - unk_token (`str`, *optional*, defaults to `""`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - pad_token (`str`, *optional*, defaults to `""`): - The token used for padding, for example when batching sequences of different lengths. - mask_token (`str`, *optional*, defaults to `""`): - The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. - additional_special_tokens (`list[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): - Additional special tokens used by the tokenizer. - """ - - vocab_files_names = VOCAB_FILES_NAMES - model_input_names = ["input_ids", "attention_mask"] - slow_tokenizer_class = CamembertTokenizer - - def __init__( - self, - vocab_file=None, - tokenizer_file=None, - bos_token="", - eos_token="", - sep_token="", - cls_token="", - unk_token="", - pad_token="", - mask_token="", - additional_special_tokens=["NOTUSED", "NOTUSED", "NOTUSED"], - **kwargs, - ): - # Mask token behave like a normal word, i.e. include the space before it. Will have normalized = False - mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token - super().__init__( - vocab_file, - tokenizer_file=tokenizer_file, - bos_token=bos_token, - eos_token=eos_token, - sep_token=sep_token, - cls_token=cls_token, - unk_token=unk_token, - pad_token=pad_token, - mask_token=mask_token, - additional_special_tokens=additional_special_tokens, - **kwargs, - ) - - self.vocab_file = vocab_file - - def build_inputs_with_special_tokens( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. An CamemBERT sequence has the following format: - - - single sequence: ` X ` - - pair of sequences: ` A B ` - - Args: - token_ids_0 (`list[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + sep + token_ids_1 + sep - - def create_token_type_ids_from_sequences( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Create a mask from the two sequences passed to be used in a sequence-pair classification task. CamemBERT, like - RoBERTa, does not make use of token type ids, therefore a list of zeros is returned. - - Args: - token_ids_0 (`list[int]`): - List of IDs. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `list[int]`: List of zeros. - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: - if not self.can_save_slow_tokenizer: - raise ValueError( - "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " - "tokenizer." - ) - - if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") - return - out_vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): - copyfile(self.vocab_file, out_vocab_file) - - return (out_vocab_file,) - - -__all__ = ["CamembertTokenizerFast"] diff --git a/src/transformers/models/canine/tokenization_canine.py b/src/transformers/models/canine/tokenization_canine.py index f6b2a8bfd96e..090041a7f84f 100644 --- a/src/transformers/models/canine/tokenization_canine.py +++ b/src/transformers/models/canine/tokenization_canine.py @@ -14,9 +14,7 @@ # limitations under the License. """Tokenization classes for CANINE.""" -from typing import Optional - -from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...tokenization_python import AddedToken, PreTrainedTokenizer from ...utils import logging @@ -112,6 +110,9 @@ def __init__( mask_token=mask_token, add_prefix_space=add_prefix_space, model_max_length=model_max_length, + token_type_ids_pattern="all_zeros", + token_type_ids_include_special_tokens=True, + special_tokens_pattern="cls_sep", **kwargs, ) @@ -150,64 +151,5 @@ def _convert_id_to_token(self, index: int) -> str: def convert_tokens_to_string(self, tokens): return "".join(tokens) - def build_inputs_with_special_tokens( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A CANINE sequence has the following format: - - - single sequence: `[CLS] X [SEP]` - - pair of sequences: `[CLS] A [SEP] B [SEP]` - - Args: - token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - result = cls + token_ids_0 + sep - if token_ids_1 is not None: - result += token_ids_1 + sep - return result - - def get_special_tokens_mask( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False - ) -> list[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - result = [1] + ([0] * len(token_ids_0)) + [1] - if token_ids_1 is not None: - result += ([0] * len(token_ids_1)) + [1] - return result - - # CanineTokenizer has no vocab file - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None): - return () - __all__ = ["CanineTokenizer"] diff --git a/src/transformers/models/clip/__init__.py b/src/transformers/models/clip/__init__.py index 36fb3521a93e..b899e69bc8f2 100644 --- a/src/transformers/models/clip/__init__.py +++ b/src/transformers/models/clip/__init__.py @@ -25,7 +25,6 @@ from .modeling_clip import * from .processing_clip import * from .tokenization_clip import * - from .tokenization_clip_fast import * else: import sys diff --git a/src/transformers/models/clip/tokenization_clip.py b/src/transformers/models/clip/tokenization_clip.py index 625d26dc6960..dc6995b2aa95 100644 --- a/src/transformers/models/clip/tokenization_clip.py +++ b/src/transformers/models/clip/tokenization_clip.py @@ -14,258 +14,29 @@ # limitations under the License. """Tokenization classes for CLIP.""" -import json -import os -import unicodedata -from functools import lru_cache from typing import Optional -import regex as re +from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors +from tokenizers.models import BPE -from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from ...tokenization_utils_tokenizers import TokenizersBackend from ...utils import logging logger = logging.get_logger(__name__) -VOCAB_FILES_NAMES = { - "vocab_file": "vocab.json", - "merges_file": "merges.txt", -} +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} -@lru_cache -def bytes_to_unicode(): +class CLIPTokenizer(TokenizersBackend): """ - Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control - characters the bpe code barfs on. + Construct a CLIP tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level + Byte-Pair-Encoding. - The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab - if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for - decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup - tables between utf-8 bytes and unicode strings. - """ - bs = ( - list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) - ) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8 + n) - n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) - - -def get_pairs(word): - """ - Return set of symbol pairs in a word. - - Word is represented as tuple of symbols (symbols being variable-length strings). - """ - pairs = set() - prev_char = word[0] - for char in word[1:]: - pairs.add((prev_char, char)) - prev_char = char - return pairs - - -def whitespace_clean(text): - text = re.sub(r"\s+", " ", text) - text = text.strip() - return text - - -# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a piece of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer -class BasicTokenizer: - """ - Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. Args: - do_lower_case (`bool`, *optional*, defaults to `True`): - Whether or not to lowercase the input when tokenizing. - never_split (`Iterable`, *optional*): - Collection of tokens which will never be split during tokenization. Only has an effect when - `do_basic_tokenize=True` - tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): - Whether or not to tokenize Chinese characters. - - This should likely be deactivated for Japanese (see this - [issue](https://github.com/huggingface/transformers/issues/328)). - strip_accents (`bool`, *optional*): - Whether or not to strip all accents. If this option is not specified, then it will be determined by the - value for `lowercase` (as in the original BERT). - do_split_on_punc (`bool`, *optional*, defaults to `True`): - In some instances we want to skip the basic punctuation splitting so that later tokenization can capture - the full context of the words, such as contractions. - """ - - def __init__( - self, - do_lower_case=True, - never_split=None, - tokenize_chinese_chars=True, - strip_accents=None, - do_split_on_punc=True, - ): - if never_split is None: - never_split = [] - self.do_lower_case = do_lower_case - self.never_split = set(never_split) - self.tokenize_chinese_chars = tokenize_chinese_chars - self.strip_accents = strip_accents - self.do_split_on_punc = do_split_on_punc - - def tokenize(self, text, never_split=None): - """ - Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. - - Args: - never_split (`List[str]`, *optional*) - Kept for backward compatibility purposes. Now implemented directly at the base class level (see - [`PreTrainedTokenizer.tokenize`]) List of token not to split. - """ - # union() returns a new set by concatenating the two sets. - never_split = self.never_split.union(set(never_split)) if never_split else self.never_split - text = self._clean_text(text) - - # This was added on November 1st, 2018 for the multilingual and Chinese - # models. This is also applied to the English models now, but it doesn't - # matter since the English models were not trained on any Chinese data - # and generally don't have any Chinese data in them (there are Chinese - # characters in the vocabulary because Wikipedia does have some Chinese - # words in the English Wikipedia.). - if self.tokenize_chinese_chars: - text = self._tokenize_chinese_chars(text) - # prevents treating the same character with different unicode codepoints as different characters - unicode_normalized_text = unicodedata.normalize("NFC", text) - orig_tokens = whitespace_tokenize(unicode_normalized_text) - split_tokens = [] - for token in orig_tokens: - if token not in never_split: - if self.do_lower_case: - token = token.lower() - if self.strip_accents is not False: - token = self._run_strip_accents(token) - elif self.strip_accents: - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token, never_split)) - - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, text, never_split=None): - """Splits punctuation on a piece of text.""" - if not self.do_split_on_punc or (never_split is not None and text in never_split): - return [text] - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return ["".join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(" ") - output.append(char) - output.append(" ") - else: - output.append(char) - return "".join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like the all of the other languages. - if ( - (cp >= 0x4E00 and cp <= 0x9FFF) - or (cp >= 0x3400 and cp <= 0x4DBF) - or (cp >= 0x20000 and cp <= 0x2A6DF) - or (cp >= 0x2A700 and cp <= 0x2B73F) - or (cp >= 0x2B740 and cp <= 0x2B81F) - or (cp >= 0x2B820 and cp <= 0x2CEAF) - or (cp >= 0xF900 and cp <= 0xFAFF) - or (cp >= 0x2F800 and cp <= 0x2FA1F) - ): - return True - - return False - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xFFFD or _is_control(char): - continue - if _is_whitespace(char): - output.append(" ") - else: - output.append(char) - return "".join(output) - - -class CLIPTokenizer(PreTrainedTokenizer): - """ - Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding. - - This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to - this superclass for more information regarding those methods. - - Args: - vocab_file (`str`): - Path to the vocabulary file. - merges_file (`str`): - Path to the merges file. - errors (`str`, *optional*, defaults to `"replace"`): - Paradigm to follow when decoding bytes to UTF-8. See - [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. @@ -275,53 +46,94 @@ class CLIPTokenizer(PreTrainedTokenizer): The end of sequence token. pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): The token used for padding, for example when batching sequences of different lengths. + vocab (`dict`, *optional*): + Vocabulary dict to use for the tokenizer. + merges (`list`, *optional*): + Merges list to use for the BPE tokenizer. + vocab_file (`str`, *optional*): + Path to the vocabulary file. + merges_file (`str`, *optional*): + Path to the merges file. """ vocab_files_names = VOCAB_FILES_NAMES model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = None def __init__( self, - vocab_file, - merges_file, - errors="replace", - unk_token="<|endoftext|>", - bos_token="<|startoftext|>", - eos_token="<|endoftext|>", - pad_token="<|endoftext|>", # hack to enable padding + unk_token: str = "<|endoftext|>", + bos_token: str = "<|startoftext|>", + eos_token: str = "<|endoftext|>", + pad_token: str = "<|endoftext|>", + vocab: Optional[dict] = None, + merges: Optional[list] = None, + vocab_file: Optional[str] = None, + merges_file: Optional[str] = None, **kwargs, ): - bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token - eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token - unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token - try: - import ftfy + self.vocab_file = vocab_file + self.merges_file = merges_file + + if vocab is not None: + _vocab = {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab + else: + _vocab = { + str(bos_token): 0, + str(eos_token): 1, + str(pad_token): 2, + } + + if merges is not None: + _merges = [tuple(merge) if isinstance(merge, list) else merge for merge in merges] + else: + _merges = [] + + self._tokenizer = Tokenizer( + BPE( + vocab=_vocab, + merges=_merges, + dropout=None, + continuing_subword_prefix="", + end_of_word_suffix="", + fuse_unk=False, + unk_token=str(unk_token), + ) + ) + + self._tokenizer.normalizer = normalizers.Sequence( + [normalizers.NFC(), normalizers.Replace(Regex(r"\s+"), " "), normalizers.Lowercase()] + ) + + self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence( + [ + pre_tokenizers.Split( + Regex( + r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""" + ), + behavior="removed", + invert=True, + ), + pre_tokenizers.ByteLevel(add_prefix_space=False), + ] + ) - self.fix_text = ftfy.fix_text - except ImportError: - logger.info("ftfy or spacy is not installed using custom BasicTokenizer instead of ftfy.") - self.nlp = BasicTokenizer(strip_accents=False, do_split_on_punc=False) - self.fix_text = None + self._tokenizer.decoder = decoders.ByteLevel() - with open(vocab_file, encoding="utf-8") as vocab_handle: - self.encoder = json.load(vocab_handle) - self.decoder = {v: k for k, v in self.encoder.items()} - self.errors = errors # how to handle errors in decoding - self.byte_encoder = bytes_to_unicode() - self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} - with open(merges_file, encoding="utf-8") as merges_handle: - bpe_merges = merges_handle.read().strip().split("\n")[1 : 49152 - 256 - 2 + 1] - bpe_merges = [tuple(merge.split()) for merge in bpe_merges] - self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) - self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"} + bos_token_id = _vocab.get(str(bos_token), 0) + eos_token_id = _vocab.get(str(eos_token), 1) - self.pat = re.compile( - r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", - re.IGNORECASE, + self._tokenizer.post_processor = processors.RobertaProcessing( + sep=(str(eos_token), eos_token_id), + cls=(str(bos_token), bos_token_id), + add_prefix_space=False, + trim_offsets=False, ) + tokenizer_object = self._tokenizer + super().__init__( - errors=errors, + tokenizer_object=tokenizer_object, unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, @@ -329,191 +141,27 @@ def __init__( **kwargs, ) - @property - def vocab_size(self): - return len(self.encoder) - - def get_vocab(self): - return dict(self.encoder, **self.added_tokens_encoder) - - def build_inputs_with_special_tokens( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A CLIP sequence has the following format: - - - single sequence: `<|startoftext|> X <|endoftext|>` - - Pairs of sequences are not the expected use case, but they will be handled without a separator. - - Args: - token_ids_0 (`list[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - bos_token = [self.bos_token_id] - eos_token = [self.eos_token_id] - - if token_ids_1 is None: - return bos_token + token_ids_0 + eos_token - return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token - - def get_special_tokens_mask( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False - ) -> list[int]: - """ - Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`list[int]`): - List of IDs. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. + if hasattr(self, "_post_init"): + self._post_init() - Returns: - `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ + def _post_init(self): + super()._post_init() + self._wrap_decode_method_backend_tokenizer() - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] + [1] + ([0] * len(token_ids_1)) + [1] - - def create_token_type_ids_from_sequences( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of - zeros is returned. - - Args: - token_ids_0 (`list[int]`): - List of IDs. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `list[int]`: List of zeros. - """ - bos_token = [self.bos_token_id] - eos_token = [self.eos_token_id] - - if token_ids_1 is None: - return len(bos_token + token_ids_0 + eos_token) * [0] - return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0] - - def bpe(self, token): - if token in self.cache: - return self.cache[token] - word = tuple(token[:-1]) + (token[-1] + "",) - pairs = get_pairs(word) - - if not pairs: - return token + "" - - while True: - bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) - if bigram not in self.bpe_ranks: - break - first, second = bigram - new_word = [] - i = 0 - while i < len(word): - try: - j = word.index(first, i) - except ValueError: - new_word.extend(word[i:]) - break - else: - new_word.extend(word[i:j]) - i = j - - if word[i] == first and i < len(word) - 1 and word[i + 1] == second: - new_word.append(first + second) - i += 2 - else: - new_word.append(word[i]) - i += 1 - new_word = tuple(new_word) - word = new_word - if len(word) == 1: - break - else: - pairs = get_pairs(word) - word = " ".join(word) - self.cache[token] = word - return word - - def _tokenize(self, text): - """Tokenize a string.""" - bpe_tokens = [] - if self.fix_text is None: - text = " ".join(self.nlp.tokenize(text)) - else: - text = whitespace_clean(self.fix_text(text)).lower() - - for token in re.findall(self.pat, text): - token = "".join( - self.byte_encoder[b] for b in token.encode("utf-8") - ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) - bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) - return bpe_tokens - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - return self.encoder.get(token, self.encoder.get(self.unk_token)) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - return self.decoder.get(index) - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (string) in a single string.""" - text = "".join(tokens) - byte_array = bytearray([self.byte_decoder[c] for c in text]) - text = byte_array.decode("utf-8", errors=self.errors).replace("", " ").strip() - return text - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: - if not os.path.isdir(save_directory): - logger.error(f"Vocabulary path ({save_directory}) should be a directory") - return - vocab_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] - ) - merge_file = os.path.join( - save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] - ) + # Very ugly hack to enable padding to have a correct decoding see https://github.com/huggingface/tokenizers/issues/872 + def _wrap_decode_method_backend_tokenizer(self): + orig_decode_method = self.backend_tokenizer.decode - with open(vocab_file, "w", encoding="utf-8") as f: - f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + ## define this as a local variable to avoid circular reference + ## See: https://github.com/huggingface/transformers/issues/30930 + end_of_word_suffix = self.backend_tokenizer.model.end_of_word_suffix - index = 0 - with open(merge_file, "w", encoding="utf-8") as writer: - writer.write("#version: 0.2\n") - for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): - if index != token_index: - logger.warning( - f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." - " Please check that the tokenizer is not corrupted!" - ) - index = token_index - writer.write(" ".join(bpe_tokens) + "\n") - index += 1 + def new_decode_method(*args, **kwargs): + text = orig_decode_method(*args, **kwargs) + text = text.replace(end_of_word_suffix, " ").strip() + return text - return vocab_file, merge_file + self.backend_tokenizer.decode = new_decode_method __all__ = ["CLIPTokenizer"] diff --git a/src/transformers/models/clip/tokenization_clip_fast.py b/src/transformers/models/clip/tokenization_clip_fast.py deleted file mode 100644 index c859d4572df7..000000000000 --- a/src/transformers/models/clip/tokenization_clip_fast.py +++ /dev/null @@ -1,164 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The Open AI Team Authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tokenization classes for OpenAI GPT.""" - -from typing import Optional - -from tokenizers import pre_tokenizers - -from ...tokenization_utils_fast import PreTrainedTokenizerFast -from ...utils import logging -from .tokenization_clip import CLIPTokenizer - - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} - - -class CLIPTokenizerFast(PreTrainedTokenizerFast): - """ - Construct a "fast" CLIP tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level - Byte-Pair-Encoding. - - This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should - refer to this superclass for more information regarding those methods. - - Args: - vocab_file (`str`, *optional*): - Path to the vocabulary file. - merges_file (`str`, *optional*): - Path to the merges file. - tokenizer_file (`str`, *optional*): - The path to a tokenizer file to use instead of the vocab file. - unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): - The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this - token instead. - bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`): - The beginning of sequence token. - eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): - The end of sequence token. - pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`): - The token used for padding, for example when batching sequences of different lengths. - """ - - vocab_files_names = VOCAB_FILES_NAMES - model_input_names = ["input_ids", "attention_mask"] - slow_tokenizer_class = CLIPTokenizer - - def __init__( - self, - vocab_file=None, - merges_file=None, - tokenizer_file=None, - unk_token="<|endoftext|>", - bos_token="<|startoftext|>", - eos_token="<|endoftext|>", - pad_token="<|endoftext|>", # hack to enable padding - **kwargs, - ): - super().__init__( - vocab_file, - merges_file, - tokenizer_file=tokenizer_file, - unk_token=unk_token, - bos_token=bos_token, - eos_token=eos_token, - pad_token=pad_token, - **kwargs, - ) - - if not isinstance(self.backend_tokenizer.pre_tokenizer, pre_tokenizers.Sequence): - raise TypeError( - "The `backend_tokenizer` provided does not match the expected format. The CLIP tokenizer has been" - " heavily modified from transformers version 4.17.0. You need to convert the tokenizer you are using" - " to be compatible with this version.The easiest way to do so is" - ' `CLIPTokenizerFast.from_pretrained("path_to_local_folder_or_hub_repo, from_slow=True)`. If you want' - " to use your existing tokenizer, you will have to revert to a version prior to 4.17.0 of" - " transformers." - ) - self._wrap_decode_method_backend_tokenizer() - - # Very ugly hack to enable padding to have a correct decoding see https://github.com/huggingface/tokenizers/issues/872 - def _wrap_decode_method_backend_tokenizer(self): - orig_decode_method = self.backend_tokenizer.decode - - ## define this as a local variable to avoid circular reference - ## See: https://github.com/huggingface/transformers/issues/30930 - end_of_word_suffix = self.backend_tokenizer.model.end_of_word_suffix - - def new_decode_method(*args, **kwargs): - text = orig_decode_method(*args, **kwargs) - text = text.replace(end_of_word_suffix, " ").strip() - return text - - self.backend_tokenizer.decode = new_decode_method - - def build_inputs_with_special_tokens( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A CLIP sequence has the following format: - - - single sequence: `<|startoftext|> X <|endoftext|>` - - Pairs of sequences are not the expected use case, but they will be handled without a separator. - - Args: - token_ids_0 (`list[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - bos_token = [self.bos_token_id] - eos_token = [self.eos_token_id] - - if token_ids_1 is None: - return bos_token + token_ids_0 + eos_token - return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token - - def create_token_type_ids_from_sequences( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None - ) -> list[int]: - """ - Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of - zeros is returned. - - Args: - token_ids_0 (`list[int]`): - List of IDs. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `list[int]`: List of zeros. - """ - bos_token = [self.bos_token_id] - eos_token = [self.eos_token_id] - - if token_ids_1 is None: - return len(bos_token + token_ids_0 + eos_token) * [0] - return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0] - - def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: - files = self._tokenizer.model.save(save_directory, name=filename_prefix) - return tuple(files) - - -__all__ = ["CLIPTokenizerFast"] diff --git a/src/transformers/models/clvp/tokenization_clvp.py b/src/transformers/models/clvp/tokenization_clvp.py index 4b0b285561c5..af7bb334c758 100644 --- a/src/transformers/models/clvp/tokenization_clvp.py +++ b/src/transformers/models/clvp/tokenization_clvp.py @@ -21,7 +21,7 @@ import regex as re -from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...tokenization_python import AddedToken, PreTrainedTokenizer from ...utils import logging from .number_normalizer import EnglishNormalizer @@ -35,7 +35,6 @@ @lru_cache -# Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode def bytes_to_unicode(): """ Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control @@ -60,7 +59,6 @@ def bytes_to_unicode(): return dict(zip(bs, cs)) -# Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs def get_pairs(word): """ Return set of symbol pairs in a word. @@ -160,6 +158,16 @@ def __init__( self.add_eos_token = add_eos_token self._normalizer = None + # Set special_tokens_pattern based on add_bos_token and add_eos_token flags + if add_bos_token and add_eos_token: + kwargs["special_tokens_pattern"] = "bos_eos" + elif add_bos_token: + kwargs["special_tokens_pattern"] = "bos" + elif add_eos_token: + kwargs["special_tokens_pattern"] = "eos" + else: + kwargs["special_tokens_pattern"] = "none" + with open(vocab_file, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) self.decoder = {v: k for k, v in self.encoder.items()} @@ -201,7 +209,6 @@ def normalizer(self): def get_vocab(self): return dict(self.encoder, **self.added_tokens_encoder) - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe def bpe(self, token): if token in self.cache: return self.cache[token] @@ -244,7 +251,6 @@ def bpe(self, token): self.cache[token] = word return word - # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): bos_token_id = [self.bos_token_id] if self.add_bos_token else [] eos_token_id = [self.eos_token_id] if self.add_eos_token else [] @@ -256,39 +262,6 @@ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): return output - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_special_tokens_mask - def get_special_tokens_mask( - self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False - ) -> list[int]: - """ - Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods. - - Args: - token_ids_0 (`list[int]`): - List of IDs. - token_ids_1 (`list[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to `False`): - Whether or not the token list is already formatted with special tokens for the model. - - Returns: - `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - if not self.add_bos_token: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=False - ) - - if token_ids_1 is None: - return [1] + ([0] * len(token_ids_0)) - return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) - def _tokenize(self, text): """Tokenize a string.""" bpe_tokens = [] @@ -306,17 +279,14 @@ def _tokenize(self, text): return bpe_tokens - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" return self.encoder.get(token, self.encoder.get(self.unk_token)) - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token def _convert_id_to_token(self, index): """Converts an index (integer) in a token (str) using the vocab.""" return self.decoder.get(index) - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string def convert_tokens_to_string(self, tokens): """Converts a sequence of tokens (string) in a single string.""" text = "".join(tokens) @@ -324,7 +294,7 @@ def convert_tokens_to_string(self, tokens): return text def clean_up_tokenization(self, text): - text = "".join(text) + text = "".join(text) if isinstance(text, list) else text vocab_tokens = list(self.encoder.keys()) + list(self.added_tokens_encoder.keys()) text = text.replace("[SPACE]", " ") if "[SPACE]" in vocab_tokens else text @@ -333,7 +303,6 @@ def clean_up_tokenization(self, text): text = text.replace(self.unk_token, "").replace(" ", " ").replace(" ", " ") return text - # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: if not os.path.isdir(save_directory): logger.error(f"Vocabulary path ({save_directory}) should be a directory") diff --git a/src/transformers/models/code_llama/__init__.py b/src/transformers/models/code_llama/__init__.py index b65c4bddb4b0..f37e07904132 100644 --- a/src/transformers/models/code_llama/__init__.py +++ b/src/transformers/models/code_llama/__init__.py @@ -19,7 +19,6 @@ if TYPE_CHECKING: from .tokenization_code_llama import * - from .tokenization_code_llama_fast import * else: import sys diff --git a/src/transformers/models/code_llama/tokenization_code_llama.py b/src/transformers/models/code_llama/tokenization_code_llama.py index 94d1b4d65985..c2826741e82a 100644 --- a/src/transformers/models/code_llama/tokenization_code_llama.py +++ b/src/transformers/models/code_llama/tokenization_code_llama.py @@ -1,6 +1,5 @@ # coding=utf-8 -# Copyright 2023 MetaAI and the HuggingFace Inc. team. All rights reserved. -# +# Copyright 2023 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,23 +13,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Tokenization classes for Code LLaMA.""" - -import os -from shutil import copyfile -from typing import Any, Optional -import sentencepiece as spm +from tokenizers import AddedToken, Tokenizer, decoders, normalizers, pre_tokenizers, processors +from tokenizers.models import BPE -from ...convert_slow_tokenizer import import_protobuf -from ...tokenization_utils import AddedToken, PreTrainedTokenizer -from ...utils import logging, requires_backends -from ...utils.import_utils import requires +from ...tokenization_utils_base import _get_prepend_scheme, generate_merges +from ...tokenization_utils_tokenizers import TokenizersBackend +from ...utils import logging logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} +VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"} SPIECE_UNDERLINE = "▁" @@ -47,19 +40,35 @@ # fmt: on -@requires(backends=("sentencepiece",)) -class CodeLlamaTokenizer(PreTrainedTokenizer): +class CodeLlamaTokenizer(TokenizersBackend): """ - Construct a CodeLlama tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as - there is no padding token in the original model. + Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding. + + This uses notably ByteFallback and no normalization. + + ```python + >>> from transformers import CodeLlamaTokenizer + + >>> tokenizer = CodeLlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer") + >>> tokenizer.encode("Hello this is a test") + [1, 15043, 445, 338, 263, 1243] + ``` - The default configuration match that of - [codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/meta-llama/CodeLlama-7b-Instruct-hf/blob/main/tokenizer_config.json) + If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or + call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the + values of the first token and final token of an encoded sequence will not be correct). For more details, checkout + [post-processors] (https://huggingface.co/docs/tokenizers/api/post-processors) documentation. + + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. The default configuration match that of + [meta-llama/CodeLlama-7b-Instruct-hf](https://huggingface.co/meta-llama/CodeLlama-7b-Instruct-hf/blob/main/tokenizer_config.json) which supports prompt infilling. Args: - vocab_file (`str`): - Path to the vocabulary file. + clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`): + Whether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra + spaces. unk_token (`str`, *optional*, defaults to `""`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. @@ -67,14 +76,6 @@ class CodeLlamaTokenizer(PreTrainedTokenizer): The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. eos_token (`str`, *optional*, defaults to `""`): The end of sequence token. - - - - When building a sequence using special tokens, this is not the token that is used for the end of sequence. - The token used is the `sep_token`. - - - prefix_token (`str`, *optional*, defaults to `"▁
"`):
             Prefix token used for infilling.
         middle_token (`str`, *optional*, defaults to `"▁"`):
@@ -85,41 +86,33 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
             End of text token used for infilling.
         fill_token (`str`, *optional*, defaults to `""`):
             The token used to split the input between the prefix and suffix.
-        suffix_first (`bool`, *optional*, defaults to `False`):
-            Whether the input prompt and suffix should be formatted with the suffix first.
-        sp_model_kwargs (`dict`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
-            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
-            to set:
-
-            - `enable_sampling`: Enable subword regularization.
-            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
-
-              - `nbest_size = {0,1}`: No sampling is performed.
-              - `nbest_size > 1`: samples from the nbest_size results.
-              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
-                using forward-filtering-and-backward-sampling algorithm.
-
-            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
-              BPE-dropout.
+        additional_special_tokens (`list[str]`, *optional*):
+            Additional special tokens used by the tokenizer.
         add_bos_token (`bool`, *optional*, defaults to `True`):
             Whether to add a beginning of sequence token at the start of sequences.
         add_eos_token (`bool`, *optional*, defaults to `False`):
             Whether to add an end of sequence token at the end of sequences.
-        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
-            Whether or not to clean up the tokenization spaces.
-        additional_special_tokens (`list[str]`, *optional*):
-            Additional special tokens used by the tokenizer.
         use_default_system_prompt (`bool`, *optional*, defaults to `False`):
             Whether or not the default system prompt for Llama should be used.
+        add_prefix_space (`bool`, *optional*):
+            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
+            other word.
+        vocab (`dict`, *optional*):
+            Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
+        merges (`list`, *optional*):
+            Custom merges list. If not provided, merges are loaded from merges_file.
+        vocab_file (`str`, *optional*):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
+    padding_side = "left"
     model_input_names = ["input_ids", "attention_mask"]
 
     def __init__(
         self,
-        vocab_file,
+        clean_up_tokenization_spaces=False,
         unk_token="",
         bos_token="",
         eos_token="",
@@ -128,73 +121,96 @@ def __init__(
         suffix_token="▁",
         eot_token="▁",
         fill_token="",
-        suffix_first=False,
-        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        additional_special_tokens=None,
         add_bos_token=True,
         add_eos_token=False,
-        clean_up_tokenization_spaces=False,
-        additional_special_tokens=None,
         use_default_system_prompt=False,
+        add_prefix_space=None,
+        vocab=None,
+        merges=None,
+        vocab_file=None,
         **kwargs,
     ):
-        requires_backends(self, "protobuf")
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
-        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
-
+        self.add_prefix_space = add_prefix_space if add_prefix_space is not None else True
         self.use_default_system_prompt = use_default_system_prompt
-        # mark tokens special to skip them
+
         additional_special_tokens = additional_special_tokens or []
-        for token in [prefix_token, middle_token, suffix_token, eot_token]:
+        for token in [prefix_token, middle_token, suffix_token, eot_token, fill_token]:
             additional_special_tokens += [token] if token is not None else []
 
-        self.vocab_file = vocab_file
-        self.add_bos_token = add_bos_token
-        self.add_eos_token = add_eos_token
-        self._prefix_token = prefix_token
-        self._middle_token = middle_token
-        self._suffix_token = suffix_token
-        self._eot_token = eot_token
-        self.fill_token = fill_token
-        self.suffix_first = suffix_first
-        self.sp_model = self.get_spm_processor()
+        if vocab is not None:
+            self._vocab = (
+                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
+            )
+        else:
+            self._vocab = {
+                str(unk_token): 0,
+                str(bos_token): 1,
+                str(eos_token): 2,
+            }
+
+        filtered_vocab = {
+            t: i for t, i in self._vocab.items() if t not in {str(eos_token), str(bos_token), str(unk_token)}
+        }
+        self._merges = merges if merges is not None else generate_merges(filtered_vocab)
+        self._tokenizer = Tokenizer(
+            BPE(
+                vocab=self._vocab,
+                merges=self._merges,
+                fuse_unk=True,
+                byte_fallback=True,
+                dropout=None,
+                unk_token=str(unk_token),
+            )
+        )
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
+            replacement="▁", prepend_scheme=_get_prepend_scheme(self.add_prefix_space, self), split=False
+        )
+
+        self._tokenizer.decoder = decoders.Sequence(
+            [decoders.Replace("▁", " "), decoders.ByteFallback(), decoders.Fuse(), decoders.Strip(content=" ", left=1)]
+        )
 
         super().__init__(
+            tokenizer_object=self._tokenizer,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            unk_token=unk_token,
             bos_token=bos_token,
             eos_token=eos_token,
-            unk_token=unk_token,
             add_bos_token=add_bos_token,
             add_eos_token=add_eos_token,
+            use_default_system_prompt=use_default_system_prompt,
+            add_prefix_space=add_prefix_space,
             prefix_token=prefix_token,
             middle_token=middle_token,
             suffix_token=suffix_token,
             eot_token=eot_token,
             fill_token=fill_token,
-            sp_model_kwargs=self.sp_model_kwargs,
-            suffix_first=suffix_first,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             additional_special_tokens=additional_special_tokens,
-            use_default_system_prompt=use_default_system_prompt,
             **kwargs,
         )
 
-    @property
-    def unk_token_length(self):
-        return len(self.sp_model.encode(str(self.unk_token)))
-
-    def get_spm_processor(self):
-        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        with open(self.vocab_file, "rb") as f:
-            sp_model = f.read()
-            model_pb2 = import_protobuf()
-            model = model_pb2.ModelProto.FromString(sp_model)
-            normalizer_spec = model_pb2.NormalizerSpec()
-            normalizer_spec.add_dummy_prefix = False
-            model.normalizer_spec.MergeFrom(normalizer_spec)
-            sp_model = model.SerializeToString()
-            tokenizer.LoadFromSerializedProto(sp_model)
-        return tokenizer
+        self._add_bos_token = add_bos_token
+        self._add_eos_token = add_eos_token
+        self.vocab_file = vocab_file
+
+        self._prefix_token = prefix_token
+        self._middle_token = middle_token
+        self._suffix_token = suffix_token
+        self._eot_token = eot_token
+        self.fill_token = fill_token
+
+        self._post_init()
+
+    def _post_init(self):
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="first", split=False)
+        self._tokenizer.normalizer = None
+
+        # This matches LlamaTokenizer's behavior and is needed when loading from vocab/merges
+        self.add_tokens([AddedToken(token, special=True) for token in self.all_special_tokens])
+
+        self.update_post_processor()
+        super()._post_init()
 
     @property
     def prefix_token(self):
@@ -226,10 +242,6 @@ def suffix_id(self):
             return None
         return self.convert_tokens_to_ids(self.suffix_token)
 
-    @property
-    def eot_token(self):
-        return self._eot_token
-
     @property
     def eot_id(self):
         if self._eot_token is None:
@@ -237,218 +249,136 @@ def eot_id(self):
         return self.convert_tokens_to_ids(self.eot_token)
 
     @property
-    def vocab_size(self):
-        """Returns vocab size"""
-        return self.sp_model.get_piece_size()
+    def eot_token(self):
+        return self._eot_token
+
+    def set_infilling_processor(self, reset, suffix_first=False, add_special_tokens=True):
+        """
+        Updates the normalizer to make sure the prompt format for `infilling` is respected. The infilling format is the
+        following: if suffix_first
+            " 
 {suf}  {pre}"
+        else:
+            " 
 {pre} {suf} "
 
-    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_vocab
-    def get_vocab(self):
-        """Returns vocab as a dict"""
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
+        If `reset` is set to `True`, the `normalizer` and `post_processor` are reset to their "normal" behaviour, which
+        is to add a prefix space for the normalizer, and add a `bos_token` to the input text for the `post_processor`.
+        """
+        if reset:
+            self._tokenizer.normalizer = normalizers.Sequence(
+                [
+                    normalizers.Prepend(prepend="▁"),
+                    normalizers.Replace(pattern=" ", content="▁"),
+                ]
+            )
+            self.update_post_processor()
+            return
 
-    def tokenize(self, prefix, suffix=None, suffix_first=False, **kwargs) -> list[int]:
-        # add a prefix space to `prefix`
-        if self.fill_token is not None and self.fill_token in prefix and suffix is None:
-            prefix, suffix = prefix.split(self.fill_token)
+        self._tokenizer.normalizer = normalizers.Replace(pattern=" ", content="▁")
+        pair = [self.bos_token] if self.add_bos_token and add_special_tokens else []
+        special_tokens = [(self.bos_token, self.bos_token_id)] if self.add_bos_token and add_special_tokens else []
+        if suffix_first:
+            # format as " 
 {suf}  {pre}"
+            pair += [self.prefix_token, self.suffix_token, "$B", self.middle_token, "$A"]
+            special_tokens += [
+                (self.prefix_token, self.prefix_id),
+                (self.suffix_token, self.suffix_id),
+                (self.middle_token, self.middle_id),
+            ]
+        else:
+            # format as " 
 {pre} {suf} "
+            pair += [self.prefix_token, "$A", self.suffix_token, "$B", self.middle_token]
+            special_tokens += [
+                (self.prefix_token, self.prefix_id),
+                (self.suffix_token, self.suffix_id),
+                (self.middle_token, self.middle_id),
+            ]
+
+        if self.add_eos_token and add_special_tokens:
+            pair += [self.eos_token]
+            special_tokens += [(self.eos_token, self.eos_token_id)]
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single="$A", pair=pair, special_tokens=special_tokens
+        )
 
-        if len(prefix) > 0:
-            prefix = SPIECE_UNDERLINE + prefix.replace(SPIECE_UNDERLINE, " ")
+    def tokenize(self, text, suffix=None, suffix_first=False, **kwargs):
+        # Handle fill_token splitting
+        if self.fill_token is not None and self.fill_token in text and suffix is None:
+            text, suffix = text.split(self.fill_token)
 
+        # If no suffix, use standard tokenization
         if suffix is None or len(suffix) < 1:
-            tokens = super().tokenize(prefix, **kwargs)
-            if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
-                tokens = tokens[1:]
-            return tokens
-
-        prefix_tokens = self._tokenize(prefix)  # prefix has an extra `SPIECE_UNDERLINE`
+            return super().tokenize(text, **kwargs)
 
+        # Check that infilling tokens are available
         if None in (self.prefix_id, self.middle_id, self.suffix_id):
             raise ValueError(
                 "The input either includes a `prefix` and a `suffix` used for the infilling task,"
                 f"  or can be split on the {self.fill_token} token, creating a suffix and prefix,"
                 " but the model does not support `infilling`."
             )
-        suffix_tokens = self._tokenize(suffix)  # make sure CodeLlama sp model does not mess up
-
-        suffix_first = suffix_first if suffix_first is not None else self.suffix_first
-        if suffix_first:
-            # format as " 
 {suf}  {pre}"
-            return [self.prefix_token, self.suffix_token] + suffix_tokens + [self.middle_token] + prefix_tokens
-        else:
-            # format as " 
 {pre} {suf} "
-            return [self.prefix_token] + prefix_tokens + [self.suffix_token] + suffix_tokens + [self.middle_token]
 
-    def _tokenize(self, text, **kwargs):
-        """
-        Returns a tokenized string.
+        # Temporarily set infilling processor
+        self.set_infilling_processor(False, suffix_first=suffix_first, add_special_tokens=False)
 
-        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
-        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
-        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
-        `unk_token`. Here is an example with `unk_token = ""` and `unk_token_length = 4`.
-        `self.tokenizer.sp_model.encode(" Hey", out_type = str)[4:]`.
-        """
-        tokens = self.sp_model.encode(text, out_type=str)
-        if not text.startswith((SPIECE_UNDERLINE, " ")):
-            return tokens
-        # 1. Encode string + prefix ex: " Hey"
-        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
-        # 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
-        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
-
-    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_token_to_id
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.sp_model.piece_to_id(token)
-
-    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_id_to_token
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
-        return token
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        # since we manually add the prefix space, we have to remove it when decoding
-        if tokens[0].startswith(SPIECE_UNDERLINE):
-            tokens[0] = tokens[0][1:]
-
-        current_sub_tokens = []
-        out_string = ""
-        for _, token in enumerate(tokens):
-            # make sure that special tokens are not decoded using sentencepiece model
-            if token in self.all_special_tokens:
-                out_string += self.sp_model.decode(current_sub_tokens) + token
-                current_sub_tokens = []
-            else:
-                current_sub_tokens.append(token)
-        out_string += self.sp_model.decode(current_sub_tokens)
-        return out_string
-
-    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.save_vocabulary
-    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> tuple[str]:
-        """
-        Save the vocabulary and special tokens file to a directory.
+        # Remove text_pair and pair from kwargs if present to avoid conflict
+        kwargs.pop("text_pair", None)
+        kwargs.pop("pair", None)
 
-        Args:
-            save_directory (`str`):
-                The directory in which to save the vocabulary.
+        # Tokenize with infilling format
+        # The processor will handle the special token arrangement
+        # Use pair=suffix (not text_pair) since base class tokenize expects 'pair' parameter
+        result = super().tokenize(" " + text, pair=suffix, **kwargs)
 
-        Returns:
-            `Tuple(str)`: Paths to the files saved.
-        """
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
+        # Reset processor
+        self.set_infilling_processor(True)
 
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
+        return result
 
-        return (out_vocab_file,)
+    def _encode_plus(self, text, text_pair=None, suffix=None, suffix_first=False, add_special_tokens=True, **kwargs):
+        is_infilling = False
 
-    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
-        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+        if suffix is not None:
+            text_pair = suffix
+            is_infilling = True
+        elif "suffix" in kwargs:
+            text_pair = kwargs.pop("suffix")
+            is_infilling = True
 
-        output = bos_token_id + token_ids_0 + eos_token_id
+        if isinstance(text, str) and self.fill_token is not None and self.fill_token in text and text_pair is None:
+            text, text_pair = text.split(self.fill_token)
+            is_infilling = True
 
-        if token_ids_1 is not None:
-            output = output + bos_token_id + token_ids_1 + eos_token_id
+        if not is_infilling:
+            return super()._encode_plus(text, text_pair=text_pair, add_special_tokens=add_special_tokens, **kwargs)
 
-        return output
+        if (
+            text_pair is None
+            or (isinstance(text_pair, str) and len(text_pair) < 1)
+            or (isinstance(text_pair, list) and len(text_pair) == 0)
+        ):
+            return super()._encode_plus(text, text_pair=text_pair, add_special_tokens=add_special_tokens, **kwargs)
 
-    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_special_tokens_mask
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+        if None in (self.prefix_id, self.middle_id, self.suffix_id):
+            raise ValueError(
+                "The input includes a `prefix` and a `suffix` used for the infilling task,"
+                " the `prefix_id, middle_id, suffix_id` must all be initialized. Current"
+                f" values : {self.prefix_id, self.middle_id, self.suffix_id}"
             )
 
-        bos_token_id = [1] if self.add_bos_token else []
-        eos_token_id = [1] if self.add_eos_token else []
-
-        if token_ids_1 is None:
-            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
-        return (
-            bos_token_id
-            + ([0] * len(token_ids_0))
-            + eos_token_id
-            + bos_token_id
-            + ([0] * len(token_ids_1))
-            + eos_token_id
-        )
-
-    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.create_token_type_ids_from_sequences
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
-        sequence pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence |
-        ```
-
-        if token_ids_1 is None, only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of ids.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
-        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
-
-        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
-
-        if token_ids_1 is not None:
-            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
+        self.set_infilling_processor(False, suffix_first=suffix_first, add_special_tokens=add_special_tokens)
+        kwargs.pop("text_pair", None)
 
-        return output
+        if isinstance(text, str):
+            text = " " + text
+        elif isinstance(text, list):
+            text = [" " + t if isinstance(t, str) else t for t in text]
 
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
-        return state
+        result = super()._encode_plus(text, text_pair=text_pair, add_special_tokens=True, **kwargs)
+        self.set_infilling_processor(True)
+        return result
 
-    def __setstate__(self, d):
-        self.__dict__ = d
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
 
+__all__ = ["CodeLlamaTokenizer", "CodeLlamaTokenizerFast"]
 
-__all__ = ["CodeLlamaTokenizer"]
+# Backward alias
+CodeLlamaTokenizerFast = CodeLlamaTokenizer
diff --git a/src/transformers/models/code_llama/tokenization_code_llama_fast.py b/src/transformers/models/code_llama/tokenization_code_llama_fast.py
deleted file mode 100644
index b3978587e7f0..000000000000
--- a/src/transformers/models/code_llama/tokenization_code_llama_fast.py
+++ /dev/null
@@ -1,374 +0,0 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-from shutil import copyfile
-from typing import Optional
-
-from tokenizers import normalizers, processors
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import is_sentencepiece_available, logging
-
-
-if is_sentencepiece_available():
-    from .tokenization_code_llama import CodeLlamaTokenizer
-else:
-    CodeLlamaTokenizer = None
-
-logger = logging.get_logger(__name__)
-VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}
-
-SPIECE_UNDERLINE = "▁"
-
-
-B_INST, E_INST = "[INST]", "[/INST]"
-B_SYS, E_SYS = "<>\n", "\n<>\n\n"
-
-# fmt: off
-DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \
-answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
- that your responses are socially unbiased and positive in nature.
-
-If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
-correct. If you don't know the answer to a question, please don't share false information."""
-# fmt: on
-
-
-class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding.
-
-    This uses notably ByteFallback and no normalization.
-
-    ```python
-    >>> from transformers import CodeLlamaTokenizerFast
-
-    >>> tokenizer = CodeLlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
-    >>> tokenizer.encode("Hello this is a test")
-    [1, 15043, 445, 338, 263, 1243]
-    ```
-
-    If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
-    call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
-    values of the first token and final token of an encoded sequence will not be correct). For more details, checkout
-    [post-processors] (https://huggingface.co/docs/tokenizers/api/post-processors) documentation.
-
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods. The default configuration match that of
-    [meta-llama/CodeLlama-7b-Instruct-hf](https://huggingface.co/meta-llama/CodeLlama-7b-Instruct-hf/blob/main/tokenizer_config.json)
-    which supports prompt infilling.
-
-    Args:
-        vocab_file (`str`, *optional*):
-            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
-            contains the vocabulary necessary to instantiate a tokenizer.
-        tokenizer_file (`str`, *optional*):
-            [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
-            contains everything needed to load the tokenizer.
-        clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`):
-            Whether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra
-            spaces.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        bos_token (`str`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-        prefix_token (`str`, *optional*, defaults to `"▁
"`):
-            Prefix token used for infilling.
-        middle_token (`str`, *optional*, defaults to `"▁"`):
-            Middle token used for infilling.
-        suffix_token (`str`, *optional*, defaults to `"▁"`):
-            Suffix token used for infilling.
-        eot_token (`str`, *optional*, defaults to `"▁"`):
-            End of text token used for infilling.
-        fill_token (`str`, *optional*, defaults to `""`):
-            The token used to split the input between the prefix and suffix.
-        additional_special_tokens (`list[str]`, *optional*):
-            Additional special tokens used by the tokenizer.
-        add_bos_token (`bool`, *optional*, defaults to `True`):
-            Whether to add a beginning of sequence token at the start of sequences.
-        add_eos_token (`bool`, *optional*, defaults to `False`):
-            Whether to add an end of sequence token at the end of sequences.
-        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
-            Whether or not the default system prompt for Llama should be used.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = CodeLlamaTokenizer
-    padding_side = "left"
-    model_input_names = ["input_ids", "attention_mask"]
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        clean_up_tokenization_spaces=False,
-        unk_token="",
-        bos_token="",
-        eos_token="",
-        prefix_token="▁
",
-        middle_token="▁",
-        suffix_token="▁",
-        eot_token="▁",
-        fill_token="",
-        additional_special_tokens=None,
-        add_bos_token=True,
-        add_eos_token=False,
-        use_default_system_prompt=False,
-        **kwargs,
-    ):
-        # mark tokens special to skip them
-        additional_special_tokens = additional_special_tokens or []
-        for token in [prefix_token, middle_token, suffix_token, eot_token]:
-            additional_special_tokens += [token] if token is not None else []
-        self.use_default_system_prompt = use_default_system_prompt
-
-        super().__init__(
-            vocab_file=vocab_file,
-            tokenizer_file=tokenizer_file,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            additional_special_tokens=additional_special_tokens,
-            unk_token=unk_token,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            add_bos_token=add_bos_token,
-            add_eos_token=add_eos_token,
-            prefix_token=prefix_token,
-            middle_token=middle_token,
-            suffix_token=suffix_token,
-            eot_token=eot_token,
-            fill_token=fill_token,
-            use_default_system_prompt=use_default_system_prompt,
-            **kwargs,
-        )
-        self._add_bos_token = add_bos_token
-        self._add_eos_token = add_eos_token
-        self.update_post_processor()
-
-        self.vocab_file = vocab_file
-
-        self._prefix_token = prefix_token
-        self._middle_token = middle_token
-        self._suffix_token = suffix_token
-        self._eot_token = eot_token
-        self.fill_token = fill_token
-
-    # Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.update_post_processor
-    def update_post_processor(self):
-        """
-        Updates the underlying post processor with the current `bos_token` and `eos_token`.
-        """
-        bos = self.bos_token
-        bos_token_id = self.bos_token_id
-        if bos is None and self.add_bos_token:
-            raise ValueError("add_bos_token = True but bos_token = None")
-
-        eos = self.eos_token
-        eos_token_id = self.eos_token_id
-        if eos is None and self.add_eos_token:
-            raise ValueError("add_eos_token = True but eos_token = None")
-
-        single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
-        pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"
-
-        special_tokens = []
-        if self.add_bos_token:
-            special_tokens.append((bos, bos_token_id))
-        if self.add_eos_token:
-            special_tokens.append((eos, eos_token_id))
-        self._tokenizer.post_processor = processors.TemplateProcessing(
-            single=single, pair=pair, special_tokens=special_tokens
-        )
-
-    @property
-    def prefix_token(self):
-        return self._prefix_token
-
-    @property
-    def prefix_id(self):
-        if self._prefix_token is None:
-            return None
-        return self.convert_tokens_to_ids(self.prefix_token)
-
-    @property
-    def middle_token(self):
-        return self._middle_token
-
-    @property
-    def middle_id(self):
-        if self._middle_token is None:
-            return None
-        return self.convert_tokens_to_ids(self.middle_token)
-
-    @property
-    def suffix_token(self):
-        return self._suffix_token
-
-    @property
-    def suffix_id(self):
-        if self._suffix_token is None:
-            return None
-        return self.convert_tokens_to_ids(self.suffix_token)
-
-    @property
-    def eot_id(self):
-        if self._eot_token is None:
-            return None
-        return self.convert_tokens_to_ids(self.eot_token)
-
-    @property
-    def eot_token(self):
-        return self._eot_token
-
-    @property
-    def add_eos_token(self):
-        return self._add_eos_token
-
-    @property
-    def add_bos_token(self):
-        return self._add_bos_token
-
-    @add_eos_token.setter
-    def add_eos_token(self, value):
-        self._add_eos_token = value
-        self.update_post_processor()
-
-    @add_bos_token.setter
-    def add_bos_token(self, value):
-        self._add_bos_token = value
-        self.update_post_processor()
-
-    def set_infilling_processor(self, reset, suffix_first=False, add_special_tokens=True):
-        """
-        Updates the normalizer to make sure the prompt format for `infilling` is respected. The infilling format is the
-        following: if suffix_first
-            " 
 {suf}  {pre}"
-        else:
-            " 
 {pre} {suf} "
-
-        If `reset` is set to `True`, the `normalizer` and `post_processor` are reset to their "normal" behaviour, which
-        is to add a prefix space for the normalizer, and add a `bos_token` to the input text for the `post_processor`.
-        """
-        if reset:
-            self._tokenizer.normalizer = normalizers.Sequence(
-                [
-                    normalizers.Prepend(prepend="▁"),
-                    normalizers.Replace(pattern=" ", content="▁"),
-                ]
-            )
-            self.update_post_processor()
-            return
-
-        self._tokenizer.normalizer = normalizers.Replace(pattern=" ", content="▁")
-        pair = [self.bos_token] if self.add_bos_token and add_special_tokens else []
-        special_tokens = [(self.bos_token, self.bos_token_id)] if self.add_bos_token and add_special_tokens else []
-        if suffix_first:
-            # format as " 
 {suf}  {pre}"
-            pair += [self.prefix_token, self.suffix_token, "$B", self.middle_token, "$A"]
-            special_tokens += [
-                (self.prefix_token, self.prefix_id),
-                (self.suffix_token, self.suffix_id),
-                (self.middle_token, self.middle_id),
-            ]
-        else:
-            # format as " 
 {pre} {suf} "
-            pair += [self.prefix_token, "$A", self.suffix_token, "$B", self.middle_token]
-            special_tokens += [
-                (self.prefix_token, self.prefix_id),
-                (self.suffix_token, self.suffix_id),
-                (self.middle_token, self.middle_id),
-            ]
-
-        if self.add_eos_token and add_special_tokens:
-            pair += [self.eos_token]
-            special_tokens += [(self.eos_token, self.eos_token_id)]
-        self._tokenizer.post_processor = processors.TemplateProcessing(
-            single="$A", pair=pair, special_tokens=special_tokens
-        )
-
-    def encode_plus(self, text, text_pair=None, suffix_first=False, add_special_tokens=True, **kwargs):
-        # hack to make sure the input is pre-process but outside rust
-        text_pair = kwargs.pop("suffix", text_pair)
-        if self.fill_token is not None and self.fill_token in text and text_pair is None:
-            text, text_pair = text.split(self.fill_token)
-
-        if text_pair is None or len(text_pair) < 1:
-            return super().encode_plus(text, text_pair, add_special_tokens=add_special_tokens, **kwargs)
-
-        if None in (self.prefix_id, self.middle_id, self.suffix_id):
-            raise ValueError(
-                "Then input includes a `prefix` and a `suffix` used for the infilling task,"
-                " the `prefix_id, middle_id, suffix_id` must all be initialized. Current"
-                f" values : {self.prefix_id, self.middle_id, self.suffix_id}"
-            )
-
-        self.set_infilling_processor(False, suffix_first=suffix_first, add_special_tokens=add_special_tokens)
-        tokens = super().encode_plus(" " + text, text_pair=text_pair, add_special_tokens=True, **kwargs)
-        self.set_infilling_processor(True)
-        return tokens
-
-    # Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not self.can_save_slow_tokenizer:
-            raise ValueError(
-                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
-                "tokenizer."
-            )
-
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. The special tokens depend on calling set_lang.
-
-        An NLLB sequence has the following format, where `X` represents the sequence:
-
-        - `input_ids` (for encoder) `X [eos, src_lang_code]`
-        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
-
-        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
-        separator.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return self.bos_token_id + token_ids_0 + self.eos_token_id
-        return self.bos_token_id + token_ids_0 + token_ids_1 + self.eos_token_id
-
-
-__all__ = ["CodeLlamaTokenizerFast"]
diff --git a/src/transformers/models/codegen/__init__.py b/src/transformers/models/codegen/__init__.py
index ea2d9af11150..a06b0c4883f5 100644
--- a/src/transformers/models/codegen/__init__.py
+++ b/src/transformers/models/codegen/__init__.py
@@ -18,10 +18,10 @@
 
 
 if TYPE_CHECKING:
+    from ..gpt2.tokenization_gpt2 import GPT2Tokenizer as CodeGenTokenizerFast
     from .configuration_codegen import *
     from .modeling_codegen import *
     from .tokenization_codegen import *
-    from .tokenization_codegen_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/codegen/tokenization_codegen.py b/src/transformers/models/codegen/tokenization_codegen.py
index 4d08c6acd5bb..8bcb18d3414c 100644
--- a/src/transformers/models/codegen/tokenization_codegen.py
+++ b/src/transformers/models/codegen/tokenization_codegen.py
@@ -12,75 +12,33 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization classes for CodeGen"""
+"""Tokenization classes for CodeGen."""
 
-import json
-import os
-from functools import lru_cache
+import re
 from typing import TYPE_CHECKING, Optional, Union
 
 import numpy as np
-import regex as re
+from tokenizers import Tokenizer, decoders, pre_tokenizers, processors
+from tokenizers.models import BPE
 
-from ...utils import logging, to_py_obj
+from ...tokenization_utils_tokenizers import TokenizersBackend
+from ...utils import is_torch_available, logging
 
 
 if TYPE_CHECKING:
-    import torch
-
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+    if is_torch_available():
+        import torch
 
 
 logger = logging.get_logger(__name__)
 
-VOCAB_FILES_NAMES = {
-    "vocab_file": "vocab.json",
-    "merges_file": "merges.txt",
-}
-
-
-@lru_cache
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
-    characters the bpe code barfs on.
-
-    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
-    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
-    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
-    tables between utf-8 bytes and unicode strings.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-def get_pairs(word):
-    """
-    Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
 
 
-class CodeGenTokenizer(PreTrainedTokenizer):
+class CodeGenTokenizer(TokenizersBackend):
     """
-    Construct a CodeGen tokenizer. Based on byte-level Byte-Pair-Encoding.
+    Construct a CodeGen tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
+    Byte-Pair-Encoding.
 
     This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
     be encoded differently whether it is at the beginning of the sentence (without space) or not:
@@ -96,26 +54,19 @@ class CodeGenTokenizer(PreTrainedTokenizer):
     [18435, 995]
     ```
 
-    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
-    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
+    the model was not pretrained this way, it might yield a decrease in performance.
 
     
 
-    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
+    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
 
     
 
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        merges_file (`str`):
-            Path to the merges file.
-        errors (`str`, *optional*, defaults to `"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See
-            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
         unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
@@ -129,19 +80,21 @@ class CodeGenTokenizer(PreTrainedTokenizer):
             Whether or not to add an initial space to the input. This allows to treat the leading word just as any
             other word. (CodeGen tokenizer detect beginning of words by the preceding space).
         add_bos_token (`bool`, *optional*, defaults to `False`):
-            Whether to add a beginning of sequence token at the start of sequences.
+            Whether or not to add an initial beginning of sentence token to the input.
         return_token_type_ids (`bool`, *optional*, defaults to `False`):
             Whether to return token type IDs.
+        vocab (`dict`, *optional*):
+            Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
+        merges (`list`, *optional*):
+            Custom merges list. If not provided, merges are loaded from merges_file.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
 
     def __init__(
         self,
-        vocab_file,
-        merges_file,
-        errors="replace",
         unk_token="<|endoftext|>",
         bos_token="<|endoftext|>",
         eos_token="<|endoftext|>",
@@ -149,34 +102,53 @@ def __init__(
         add_prefix_space=False,
         add_bos_token=False,
         return_token_type_ids=False,
+        vocab: Optional[dict] = None,
+        merges: Optional[list] = None,
         **kwargs,
     ):
-        bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
-        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
-        self.add_bos_token = add_bos_token
         self.return_token_type_ids = return_token_type_ids
         if self.return_token_type_ids:
             self.model_input_names.append("token_type_ids")
 
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            bpe_merges = merges_handle.read().split("\n")[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        self.cache = {}
         self.add_prefix_space = add_prefix_space
 
-        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
-        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+        if vocab is not None:
+            self._vocab = (
+                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
+            )
+        else:
+            self._vocab = {}
+
+        if merges is not None:
+            self._merges = merges
+        else:
+            self._merges = []
+
+        self._tokenizer = Tokenizer(
+            BPE(
+                vocab=self._vocab,
+                merges=self._merges,
+                dropout=None,
+                continuing_subword_prefix="",
+                end_of_word_suffix="",
+                fuse_unk=False,
+            )
+        )
+
+        self._tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
+        self._tokenizer.decoder = decoders.ByteLevel()
+        self._tokenizer.post_processor = processors.ByteLevel(
+            add_prefix_space=True, use_regex=True, trim_offsets=False
+        )
+
+        tokenizer_object = self._tokenizer
+
+        # Set these before calling super().__init__() so the base class _post_init() can use them
+        self._add_bos_token = add_bos_token
+        self._add_eos_token = False
+
         super().__init__(
-            errors=errors,
+            tokenizer_object=tokenizer_object,
             unk_token=unk_token,
             bos_token=bos_token,
             eos_token=eos_token,
@@ -187,126 +159,14 @@ def __init__(
             **kwargs,
         )
 
-    @property
-    def vocab_size(self):
-        return len(self.encoder)
-
-    def get_vocab(self):
-        return dict(self.encoder, **self.added_tokens_encoder)
-
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                except ValueError:
-                    new_word.extend(word[i:])
-                    break
-                else:
-                    new_word.extend(word[i:j])
-                    i = j
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        self.cache[token] = word
-        return word
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        if self.add_bos_token:
-            bos_token_ids = [self.bos_token_id]
-        else:
-            bos_token_ids = []
-
-        output = bos_token_ids + token_ids_0
-
-        if token_ids_1 is None:
-            return output
-
-        return output + bos_token_ids + token_ids_1
-
-    def _tokenize(self, text):
-        """Tokenize a string."""
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            token = "".join(
-                self.byte_encoder[b] for b in token.encode("utf-8")
-            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
-            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
-        return bpe_tokens
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        text = "".join(tokens)
-        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
-        return text
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-        merge_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
-        )
+        self._post_init()
 
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
-
-        return vocab_file, merge_file
-
-    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
-        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
-        if is_split_into_words or add_prefix_space:
-            text = " " + text
-        return (text, kwargs)
+    def _post_init(self):
+        self._tokenizer.post_processor = processors.ByteLevel(
+            add_prefix_space=True, use_regex=True, trim_offsets=False
+        )
+        # Ensure base class post-init runs to register special/extra tokens, etc.
+        super()._post_init()
 
     def decode(
         self,
@@ -341,9 +201,7 @@ def decode(
             `str`: The decoded sentence.
         """
 
-        token_ids = to_py_obj(token_ids)
-
-        decoded_text = super()._decode(
+        decoded_text = super().decode(
             token_ids=token_ids,
             skip_special_tokens=skip_special_tokens,
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
diff --git a/src/transformers/models/codegen/tokenization_codegen_fast.py b/src/transformers/models/codegen/tokenization_codegen_fast.py
deleted file mode 100644
index 72c8d66c829a..000000000000
--- a/src/transformers/models/codegen/tokenization_codegen_fast.py
+++ /dev/null
@@ -1,233 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The Salesforce authors, The Open AI Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for OpenAI GPT."""
-
-import re
-from typing import TYPE_CHECKING, Optional, Union
-
-import numpy as np
-
-from ...utils import is_torch_available, logging
-
-
-if TYPE_CHECKING:
-    if is_torch_available():
-        import torch
-
-
-from ...tokenization_utils_base import BatchEncoding
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from .tokenization_codegen import CodeGenTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
-
-
-class CodeGenTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" CodeGen tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
-    Byte-Pair-Encoding.
-
-    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
-    be encoded differently whether it is at the beginning of the sentence (without space) or not:
-
-    ```python
-    >>> from transformers import CodeGenTokenizerFast
-
-    >>> tokenizer = CodeGenTokenizerFast.from_pretrained("Salesforce/codegen-350M-mono")
-    >>> tokenizer("Hello world")["input_ids"]
-    [15496, 995]
-
-    >>> tokenizer(" Hello world")["input_ids"]
-    [18435, 995]
-    ```
-
-    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
-    the model was not pretrained this way, it might yield a decrease in performance.
-
-    
-
-    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
-
-    
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`, *optional*):
-            Path to the vocabulary file.
-        merges_file (`str`, *optional*):
-            Path to the merges file.
-        tokenizer_file (`str`, *optional*):
-            Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
-            contains everything needed to load the tokenizer.
-        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The beginning of sequence token.
-        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The end of sequence token.
-        add_prefix_space (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word. (CodeGen tokenizer detect beginning of words by the preceding space).
-        return_token_type_ids (`bool`, *optional*, defaults to `False`):
-            Whether to return token type IDs.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = CodeGenTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        merges_file=None,
-        tokenizer_file=None,
-        unk_token="<|endoftext|>",
-        bos_token="<|endoftext|>",
-        eos_token="<|endoftext|>",
-        add_prefix_space=False,
-        return_token_type_ids=False,
-        **kwargs,
-    ):
-        self.return_token_type_ids = return_token_type_ids
-        if self.return_token_type_ids:
-            self.model_input_names.append("token_type_ids")
-
-        super().__init__(
-            vocab_file,
-            merges_file,
-            tokenizer_file=tokenizer_file,
-            unk_token=unk_token,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            add_prefix_space=add_prefix_space,
-            return_token_type_ids=return_token_type_ids,
-            **kwargs,
-        )
-
-        if kwargs.pop("add_bos_token", False):
-            model_id = kwargs.pop("name_or_path", "")
-            raise ValueError(
-                "Currently GPT2's fast tokenizer does NOT support adding a BOS token. "
-                "Instead you should use GPT2's slow tokenizer class `CodeGenTokenizer` as follows: \n"
-                f"`CodeGenTokenizer.from_pretrained('{model_id}')`\nor\n"
-                f"`AutoTokenizer.from_pretrained('{model_id}', use_fast=False)`\n"
-                "This issue will be fixed soon, see: https://github.com/huggingface/tokenizers/pull/1005."
-                " so that the fast tokenizer works correctly."
-            )
-
-    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-        assert self.add_prefix_space or not is_split_into_words, (
-            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
-            "to use it with pretokenized inputs."
-        )
-
-        return super()._batch_encode_plus(*args, **kwargs)
-
-    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-
-        assert self.add_prefix_space or not is_split_into_words, (
-            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
-            "to use it with pretokenized inputs."
-        )
-
-        return super()._encode_plus(*args, **kwargs)
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-    def decode(
-        self,
-        token_ids: Union[int, list[int], np.ndarray, "torch.Tensor"],
-        skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: Optional[bool] = None,
-        truncate_before_pattern: Optional[list[str]] = None,
-        **kwargs,
-    ) -> str:
-        """
-        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
-        tokens and clean up tokenization spaces.
-
-        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
-
-        Args:
-            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor]`):
-                List of tokenized input ids. Can be obtained using the `__call__` method.
-            skip_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not to remove special tokens in the decoding.
-            clean_up_tokenization_spaces (`bool`, *optional*):
-                Whether or not to clean up the tokenization spaces. If `None`, will default to
-                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
-            truncate_before_pattern (`List[str]`, *optional*, defaults to `None`):
-                A list of regular expression strings that will be used to truncate the returned string. This can be
-                used to remove extra pieces of code (e.g. truncate if observing a comment symbol "#" at the beginning
-                of a new line). An example pattern could be `["^#", re.escape("<|endoftext|>"), "^'''", "\n\n\n"]`.
-            kwargs (additional keyword arguments, *optional*):
-                Will be passed to the underlying model specific decode method.
-
-        Returns:
-            `str`: The decoded sentence.
-        """
-
-        decoded_text = super().decode(
-            token_ids=token_ids,
-            skip_special_tokens=skip_special_tokens,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            **kwargs,
-        )
-
-        if truncate_before_pattern is not None and len(truncate_before_pattern) > 0:
-            decoded_text = self.truncate(decoded_text, truncate_before_pattern)
-
-        return decoded_text
-
-    def truncate(self, completion, truncate_before_pattern):
-        def find_re(string, pattern, start_pos):
-            m = pattern.search(string, start_pos)
-            return m.start() if m else -1
-
-        terminals = [re.compile(pattern, re.MULTILINE) for pattern in truncate_before_pattern]
-
-        prints = list(re.finditer("^print", completion, re.MULTILINE))
-
-        if len(prints) > 1:
-            completion = completion[: prints[1].start()]
-
-        defs = list(re.finditer("^def", completion, re.MULTILINE))
-
-        if len(defs) > 1:
-            completion = completion[: defs[1].start()]
-
-        start_pos = 0
-
-        terminals_pos = [
-            pos for pos in [find_re(completion, terminal, start_pos) for terminal in terminals] if pos != -1
-        ]
-
-        if len(terminals_pos) > 0:
-            return completion[: min(terminals_pos)]
-        else:
-            return completion
-
-
-__all__ = ["CodeGenTokenizerFast"]
diff --git a/src/transformers/models/cohere/__init__.py b/src/transformers/models/cohere/__init__.py
index ad2d57500c44..98c73f4cd22d 100644
--- a/src/transformers/models/cohere/__init__.py
+++ b/src/transformers/models/cohere/__init__.py
@@ -20,7 +20,7 @@
 if TYPE_CHECKING:
     from .configuration_cohere import *
     from .modeling_cohere import *
-    from .tokenization_cohere_fast import *
+    from .tokenization_cohere import *
 else:
     import sys
 
diff --git a/src/transformers/models/cohere/tokenization_cohere_fast.py b/src/transformers/models/cohere/tokenization_cohere.py
similarity index 59%
rename from src/transformers/models/cohere/tokenization_cohere_fast.py
rename to src/transformers/models/cohere/tokenization_cohere.py
index 99def0e8aa73..0b2e8f09a6dd 100644
--- a/src/transformers/models/cohere/tokenization_cohere_fast.py
+++ b/src/transformers/models/cohere/tokenization_cohere.py
@@ -13,19 +13,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# This file is based on the tokenization_llama_fast.py file in transformers
+# This file is based on the tokenization_llama.py file in transformers
 
-from typing import Literal, Union
+from typing import Literal, Optional, Union
 
-from tokenizers import processors
+from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers
+from tokenizers.models import BPE
 
-from ...tokenization_utils_base import BatchEncoding
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
 
 
 logger = logging.get_logger(__name__)
-VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
 
 PRETRAINED_VOCAB_FILES_MAP = {
     "tokenizer_file": {
@@ -43,7 +43,7 @@
 # fmt: on
 
 
-class CohereTokenizerFast(PreTrainedTokenizerFast):
+class CohereTokenizer(TokenizersBackend):
     """
     Construct a Cohere tokenizer. Based on byte-level Byte-Pair-Encoding.
 
@@ -71,7 +71,7 @@ class CohereTokenizerFast(PreTrainedTokenizerFast):
 
     
 
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
     refer to this superclass for more information regarding those methods.
 
     Args:
@@ -100,6 +100,10 @@ class CohereTokenizerFast(PreTrainedTokenizerFast):
             Whether or not the default system prompt for Cohere tokenizer should be used.
         add_prefix_space (`bool`, *optional*, defaults to `False`):
             Whether or not the tokenizer should automatically add a prefix space
+        vocab (`dict`, *optional*):
+            Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
+        merges (`list`, *optional*):
+            Custom merges list. If not provided, merges are loaded from merges_file.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -111,113 +115,104 @@ class CohereTokenizerFast(PreTrainedTokenizerFast):
 
     def __init__(
         self,
-        vocab_file=None,
-        merges_file=None,
-        tokenizer_file=None,
-        clean_up_tokenization_spaces=False,
-        unk_token="",
-        bos_token="",
-        eos_token="<|END_OF_TURN_TOKEN|>",
-        add_bos_token=True,
-        add_eos_token=False,
-        use_default_system_prompt=False,
-        add_prefix_space=False,
+        errors: str = "replace",
+        unk_token: str = "",
+        bos_token: str = "",
+        eos_token: str = "<|END_OF_TURN_TOKEN|>",
+        pad_token: str = "",
+        cls_token: str = "",
+        sep_token: str = "",
+        mask_token: str = "",
+        add_bos_token: bool = True,
+        add_eos_token: bool = False,
+        use_default_system_prompt: bool = False,
+        add_prefix_space: bool = False,
+        vocab: Optional[dict] = None,
+        merges: Optional[list] = None,
         **kwargs,
     ):
-        super().__init__(
-            vocab_file=vocab_file,
-            merges_file=merges_file,
-            tokenizer_file=tokenizer_file,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            unk_token=unk_token,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            add_bos_token=add_bos_token,
-            add_eos_token=add_eos_token,
-            use_default_system_prompt=use_default_system_prompt,
-            add_prefix_space=add_prefix_space,
-            **kwargs,
-        )
         self._add_bos_token = add_bos_token
         self._add_eos_token = add_eos_token
-        self.update_post_processor()
         self.use_default_system_prompt = use_default_system_prompt
-        self.vocab_file = vocab_file
+        self.add_prefix_space = add_prefix_space
         self.grounded_generation_template = kwargs.pop("grounded_generation_template", None)
         self.tool_use_template = kwargs.pop("tool_use_template", None)
 
-        # This is a `tokenizers.pre_tokenizers.Sequence`
-        for pre_tokenizer in self.backend_tokenizer.pre_tokenizer:
-            if hasattr(pre_tokenizer, "add_prefix_space"):
-                pre_tokenizer.add_prefix_space = add_prefix_space
-        self.backend_tokenizer.decoder.add_prefix_space = add_prefix_space
-
-        self.add_prefix_space = add_prefix_space
-
-    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-        if not (self.add_prefix_space or not is_split_into_words):
-            raise Exception(
-                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
-                " pretokenized inputs."
+        if vocab is not None:
+            self._vocab = (
+                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
             )
+        else:
+            self._vocab = {
+                str(pad_token): 0,
+                str(unk_token): 1,
+                str(cls_token): 2,
+                str(sep_token): 3,
+                str(mask_token): 4,
+                str(bos_token): 5,
+            }
 
-        return super()._batch_encode_plus(*args, **kwargs)
-
-    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-
-        if not (self.add_prefix_space or not is_split_into_words):
-            raise Exception(
-                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
-                " pretokenized inputs."
+        if merges is not None:
+            self._merges = merges
+        else:
+            self._merges = []
+
+        self._tokenizer = Tokenizer(
+            BPE(
+                vocab=self._vocab,
+                merges=self._merges,
+                dropout=None,
+                continuing_subword_prefix="",
+                end_of_word_suffix="",
+                fuse_unk=False,
             )
+        )
 
-        return super()._encode_plus(*args, **kwargs)
-
-    def update_post_processor(self):
-        """
-        Updates the underlying post processor with the current `bos_token` and `eos_token`.
-        """
-        bos = self.bos_token
-        bos_token_id = self.bos_token_id
-        if bos is None and self.add_bos_token:
-            raise ValueError("add_bos_token = True but bos_token = None")
-
-        eos = self.eos_token
-        eos_token_id = self.eos_token_id
-        if eos is None and self.add_eos_token:
-            raise ValueError("add_eos_token = True but eos_token = None")
-
-        single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
-        pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"
-
-        special_tokens = []
-        if self.add_bos_token:
-            special_tokens.append((bos, bos_token_id))
-        if self.add_eos_token:
-            special_tokens.append((eos, eos_token_id))
-        self._tokenizer.post_processor = processors.TemplateProcessing(
-            single=single, pair=pair, special_tokens=special_tokens
+        self._tokenizer.normalizer = normalizers.NFC()
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.Digits(individual_digits=True),
+                pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space, trim_offsets=True),
+            ]
         )
+        self._tokenizer.decoder = decoders.ByteLevel(add_prefix_space=add_prefix_space, trim_offsets=True)
 
-    @property
-    def add_eos_token(self):
-        return self._add_eos_token
+        tokenizer_object = self._tokenizer
 
-    @property
-    def add_bos_token(self):
-        return self._add_bos_token
+        super().__init__(
+            tokenizer_object=tokenizer_object,
+            errors=errors,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            sep_token=sep_token,
+            mask_token=mask_token,
+            add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            use_default_system_prompt=use_default_system_prompt,
+            add_prefix_space=add_prefix_space,
+            **kwargs,
+        )
 
-    @add_eos_token.setter
-    def add_eos_token(self, value):
-        self._add_eos_token = value
-        self.update_post_processor()
+        self._post_init()
+
+    def _post_init(self):
+        """Post-initialization to ensure add_prefix_space is applied correctly."""
+        # Re-apply add_prefix_space setting to pre_tokenizer and decoder
+        # This is needed because when loading from pretrained, the tokenizer.json
+        # has these settings baked in and we need to override them
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.Digits(individual_digits=True),
+                pre_tokenizers.ByteLevel(add_prefix_space=self.add_prefix_space, trim_offsets=True),
+            ]
+        )
+        self._tokenizer.decoder = decoders.ByteLevel(add_prefix_space=self.add_prefix_space, trim_offsets=True)
 
-    @add_bos_token.setter
-    def add_bos_token(self, value):
-        self._add_bos_token = value
-        self.update_post_processor()
+        # Call parent to handle AddedToken properties
+        super()._post_init()
 
     def apply_tool_use_template(
         self,
@@ -285,8 +280,8 @@ def apply_tool_use_template(
         Examples:
 
         ```python
-        >> tokenizer = CohereTokenizerFast.from_pretrained("CohereForAI/c4ai-command-r-v01")
-        >> tools = [
+        tokenizer = CohereTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
+        tools = [
             {
                 "name": "internet_search",
                 "description": "Returns a list of relevant document snippets for a textual query retrieved from the internet",
@@ -294,64 +289,23 @@ def apply_tool_use_template(
                     "query": {
                         "description": "Query to search the internet with",
                         "type": "str",
-                        "required": True
+                        "required": True,
                     }
-                }
+                },
             },
             {
-                "name': "directly_answer",
+                "name": "directly_answer",
                 "description": "Calls a standard (un-augmented) AI chatbot to generate a response given the conversation history",
-                "parameter_definitions": {}
-            }
+                "parameter_definitions": {},
+            },
         ]
-        >> conversation = [
-            {"role": "user", "content": "Whats the biggest penguin in the world?"}
+        conversation = [
+            {"role": "user", "content": "Whats the biggest penguin in the world?"},
         ]
-        >> # render the prompt, ready for user to inspect, or for input into the model:
-        >> prompt = tokenizer.apply_tool_use_template(conversation, tools=tools, tokenize=False, add_generation_prompt=True)
-        >> print(prompt)
-        <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble
-        The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.
-
-        # System Preamble
-        ## Basic Rules
-        You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.
-
-        # User Preamble
-        ## Task and Context
-        You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.
-
-        ## Style Guide
-        Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.
-
-        ## Available Tools
-        Here is a list of tools that you have available to you:
-
-        \\`\\`\\`python
-        def internet_search(query: str) -> list[Dict]:
-            \"\"\"Returns a list of relevant document snippets for a textual query retrieved from the internet
-
-            Args:
-                query (str): Query to search the internet with
-            \"\"\"
-            pass
-        \\`\\`\\`
-
-        \\`\\`\\`python
-        def directly_answer() -> list[Dict]:
-            \"\"\"Calls a standard (un-augmented) AI chatbot to generate a response given the conversation history
-            \"\"\"
-            pass
-        \\`\\`\\`<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Whats the biggest penguin in the world?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Write 'Action:' followed by a json-formatted list of actions that you want to perform in order to produce a good response to the user's last input. You can use any of the supplied tools any number of times, but you should aim to execute the minimum number of necessary actions for the input. You should use the `directly-answer` tool if calling the other tools is unnecessary. The list of actions you want to call should be formatted as a list of json objects, for example:
-        \\`\\`\\`json
-        [
-            {
-                "tool_name": title of the tool in the specification,
-                "parameters": a dict of parameters to input into the tool as they are defined in the specs, or {} if it takes no parameters
-            }
-        ]\\`\\`\\`<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
-        ```
-        >> inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors='pt')
+        # Render the prompt, ready for user to inspect, or for input into the model
+        prompt = tokenizer.apply_tool_use_template(conversation, tools=tools, tokenize=False, add_generation_prompt=True)
+        print(prompt)
+        >> inputs = tokenizer.encode(grounded_generation_prompt, add_special_tokens=False, return_tensors='pt')
         >> outputs = model.generate(inputs, max_new_tokens=128)
         >> print(tokenizer.decode(outputs[0]))
         Action: ```json
@@ -431,7 +385,7 @@ def apply_grounded_generation_template(
         Examples:
 
         ```python
-        >> tokenizer = CohereTokenizerFast.from_pretrained('CohereForAI/c4ai-command-r-v01')
+        >> tokenizer = CohereTokenizer.from_pretrained('CohereForAI/c4ai-command-r-v01')
 
         >> # define documents:
         >> documents = [
@@ -445,38 +399,10 @@ def apply_grounded_generation_template(
         >> # render the prompt, ready for user to inspect, or for input into the model:
         >> grounded_generation_prompt = tokenizer.apply_grounded_generation_template(conversation, documents=documents, tokenize=False, add_generation_prompt=True)
         >> print(grounded_generation_prompt)
-        <|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># Safety Preamble
-        The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral.
-
-        ## Basic Rules
-        You are a powerful conversational AI trained by Cohere to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions.
-
-        # User Preamble
-        ## Task and Context
-        You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging.
-
-        ## Style Guide
-        Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Whats the biggest penguin in the world?<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>
-        Document: 0
-        title: Tall penguins
-        text: Emperor penguins are the tallest.
-
-        Document: 1
-        title: Penguin habitats
-        text: Emperor penguins only live in Antarctica.
-        <|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>Carefully perform the following instructions, in order, starting each with a new line.
-        Firstly, Decide which of the retrieved documents are relevant to the user's last input by writing 'Relevant Documents:' followed by comma-separated list of document numbers. If none are relevant, you should instead write 'None'.
-        Secondly, Decide which of the retrieved documents contain facts that should be cited in a good answer to the user's last input by writing 'Cited Documents:' followed a comma-separated list of document numbers. If you dont want to cite any of them, you should instead write 'None'.
-        Thirdly, Write 'Answer:' followed by a response to the user's last input in high quality natural english. Use the retrieved documents to help you. Do not insert any citations or grounding markup.
-        Finally, Write 'Grounded answer:' followed by a response to the user's last input in high quality natural english. Use the symbols  and  to indicate when a fact comes from a document in the search result, e.g my fact for a fact from document 0.<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'''
-        ```
         >> inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors='pt')
         >> outputs = model.generate(inputs, max_new_tokens=128)
         >> print(tokenizer.decode(outputs[0]))
-        Relevant Documents: 0,1
-        Cited Documents: 0,1
-        Answer: The Emperor Penguin is the tallest or biggest penguin in the world. It is a bird that lives only in Antarctica and grows to a height of around 122 centimetres.
-        Grounded answer: The Emperor Penguin is the tallest or biggest penguin in the world. It is a bird that lives only in Antarctica and grows to a height of around 122 centimetres.
+        ```
         """
         return self.apply_chat_template(
             conversation,
@@ -486,17 +412,5 @@ def apply_grounded_generation_template(
             **kwargs,
         )
 
-    # TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
-        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
-
-        output = bos_token_id + token_ids_0 + eos_token_id
-
-        if token_ids_1 is not None:
-            output = output + bos_token_id + token_ids_1 + eos_token_id
-
-        return output
-
 
-__all__ = ["CohereTokenizerFast"]
+__all__ = ["CohereTokenizer"]
diff --git a/src/transformers/models/colqwen2/configuration_colqwen2.py b/src/transformers/models/colqwen2/configuration_colqwen2.py
index bc5256fa5ff2..b168f502b3de 100644
--- a/src/transformers/models/colqwen2/configuration_colqwen2.py
+++ b/src/transformers/models/colqwen2/configuration_colqwen2.py
@@ -80,6 +80,9 @@ def __init__(
                 f"Invalid type for `vlm_config`. Expected `PreTrainedConfig`, `dict`, or `None`, but got {type(vlm_config)}."
             )
 
+        if not hasattr(vlm_config, "vocab_size"):
+            vlm_config.vocab_size = vlm_config.get_text_config().vocab_size
+
         self.vlm_config = vlm_config
         self.embedding_dim = embedding_dim
         self.initializer_range = initializer_range
diff --git a/src/transformers/models/convbert/__init__.py b/src/transformers/models/convbert/__init__.py
index 20999ba510da..2bd6040cc696 100644
--- a/src/transformers/models/convbert/__init__.py
+++ b/src/transformers/models/convbert/__init__.py
@@ -18,10 +18,10 @@
 
 
 if TYPE_CHECKING:
+    from ..bert.tokenization_bert import BertTokenizer as ConvBertTokenizerFast
     from .configuration_convbert import *
     from .modeling_convbert import *
     from .tokenization_convbert import *
-    from .tokenization_convbert_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/convbert/tokenization_convbert.py b/src/transformers/models/convbert/tokenization_convbert.py
index b354b0eeae3d..cc5d967237f7 100644
--- a/src/transformers/models/convbert/tokenization_convbert.py
+++ b/src/transformers/models/convbert/tokenization_convbert.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
+# Copyright The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,470 +14,18 @@
 # limitations under the License.
 """Tokenization classes for ConvBERT."""
 
-import collections
-import os
-import unicodedata
-from typing import Optional
+from ...models.bert.tokenization_bert import BertTokenizer
 
-from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
-from ...utils import logging
 
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
-
-# Copied from transformers.models.bert.tokenization_bert.load_vocab
-def load_vocab(vocab_file):
-    """Loads a vocabulary file into a dictionary."""
-    vocab = collections.OrderedDict()
-    with open(vocab_file, "r", encoding="utf-8") as reader:
-        tokens = reader.readlines()
-    for index, token in enumerate(tokens):
-        token = token.rstrip("\n")
-        vocab[token] = index
-    return vocab
-
-
-# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
-def whitespace_tokenize(text):
-    """Runs basic whitespace cleaning and splitting on a piece of text."""
-    text = text.strip()
-    if not text:
-        return []
-    tokens = text.split()
-    return tokens
-
-
-# Copied from transformers.models.bert.tokenization_bert.BertTokenizer with bert-base-cased->YituTech/conv-bert-base, ConvBertTokenizer->BertTokenizer, BERT->ConvBERT
-class ConvBertTokenizer(PreTrainedTokenizer):
+class ConvBertTokenizer(BertTokenizer):
     r"""
-    Construct a ConvBERT tokenizer. Based on WordPiece.
-
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
-            Whether or not to do basic tokenization before WordPiece.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
-        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original ConvBERT).
-        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
-            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
-            extra spaces.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-
-    def __init__(
-        self,
-        vocab_file,
-        do_lower_case=True,
-        do_basic_tokenize=True,
-        never_split=None,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        clean_up_tokenization_spaces=True,
-        **kwargs,
-    ):
-        if not os.path.isfile(vocab_file):
-            raise ValueError(
-                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
-                " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
-            )
-        self.vocab = load_vocab(vocab_file)
-        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
-        self.do_basic_tokenize = do_basic_tokenize
-        if do_basic_tokenize:
-            self.basic_tokenizer = BasicTokenizer(
-                do_lower_case=do_lower_case,
-                never_split=never_split,
-                tokenize_chinese_chars=tokenize_chinese_chars,
-                strip_accents=strip_accents,
-            )
-
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
-
-        super().__init__(
-            do_lower_case=do_lower_case,
-            do_basic_tokenize=do_basic_tokenize,
-            never_split=never_split,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            tokenize_chinese_chars=tokenize_chinese_chars,
-            strip_accents=strip_accents,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            **kwargs,
-        )
-
-    @property
-    def do_lower_case(self):
-        return self.basic_tokenizer.do_lower_case
-
-    @property
-    def vocab_size(self):
-        return len(self.vocab)
-
-    def get_vocab(self):
-        return dict(self.vocab, **self.added_tokens_encoder)
-
-    def _tokenize(self, text, split_special_tokens=False):
-        split_tokens = []
-        if self.do_basic_tokenize:
-            for token in self.basic_tokenizer.tokenize(
-                text, never_split=self.all_special_tokens if not split_special_tokens else None
-            ):
-                # If the token is part of the never_split set
-                if token in self.basic_tokenizer.never_split:
-                    split_tokens.append(token)
-                else:
-                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
-        else:
-            split_tokens = self.wordpiece_tokenizer.tokenize(text)
-        return split_tokens
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.vocab.get(token, self.vocab.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.ids_to_tokens.get(index, self.unk_token)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        out_string = " ".join(tokens).replace(" ##", "").strip()
-        return out_string
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A ConvBERT sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
+    Construct a ConvBERT tokenizer (backed by HuggingFace's tokenizers library). Based on WordPiece.
 
-        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        index = 0
-        if os.path.isdir(save_directory):
-            vocab_file = os.path.join(
-                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-            )
-        else:
-            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
-        with open(vocab_file, "w", encoding="utf-8") as writer:
-            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
-                        " Please check that the vocabulary is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(token + "\n")
-                index += 1
-        return (vocab_file,)
-
-
-# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
-class BasicTokenizer:
-    """
-    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
-
-    Args:
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original BERT).
-        do_split_on_punc (`bool`, *optional*, defaults to `True`):
-            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
-            the full context of the words, such as contractions.
+    This tokenizer inherits from [`BertTokenizer`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
     """
 
-    def __init__(
-        self,
-        do_lower_case=True,
-        never_split=None,
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        do_split_on_punc=True,
-    ):
-        if never_split is None:
-            never_split = []
-        self.do_lower_case = do_lower_case
-        self.never_split = set(never_split)
-        self.tokenize_chinese_chars = tokenize_chinese_chars
-        self.strip_accents = strip_accents
-        self.do_split_on_punc = do_split_on_punc
-
-    def tokenize(self, text, never_split=None):
-        """
-        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
-
-        Args:
-            never_split (`List[str]`, *optional*)
-                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
-        """
-        # union() returns a new set by concatenating the two sets.
-        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
-        text = self._clean_text(text)
-
-        # This was added on November 1st, 2018 for the multilingual and Chinese
-        # models. This is also applied to the English models now, but it doesn't
-        # matter since the English models were not trained on any Chinese data
-        # and generally don't have any Chinese data in them (there are Chinese
-        # characters in the vocabulary because Wikipedia does have some Chinese
-        # words in the English Wikipedia.).
-        if self.tokenize_chinese_chars:
-            text = self._tokenize_chinese_chars(text)
-        # prevents treating the same character with different unicode codepoints as different characters
-        unicode_normalized_text = unicodedata.normalize("NFC", text)
-        orig_tokens = whitespace_tokenize(unicode_normalized_text)
-        split_tokens = []
-        for token in orig_tokens:
-            if token not in never_split:
-                if self.do_lower_case:
-                    token = token.lower()
-                    if self.strip_accents is not False:
-                        token = self._run_strip_accents(token)
-                elif self.strip_accents:
-                    token = self._run_strip_accents(token)
-            split_tokens.extend(self._run_split_on_punc(token, never_split))
-
-        output_tokens = whitespace_tokenize(" ".join(split_tokens))
-        return output_tokens
-
-    def _run_strip_accents(self, text):
-        """Strips accents from a piece of text."""
-        text = unicodedata.normalize("NFD", text)
-        output = []
-        for char in text:
-            cat = unicodedata.category(char)
-            if cat == "Mn":
-                continue
-            output.append(char)
-        return "".join(output)
-
-    def _run_split_on_punc(self, text, never_split=None):
-        """Splits punctuation on a piece of text."""
-        if not self.do_split_on_punc or (never_split is not None and text in never_split):
-            return [text]
-        chars = list(text)
-        i = 0
-        start_new_word = True
-        output = []
-        while i < len(chars):
-            char = chars[i]
-            if _is_punctuation(char):
-                output.append([char])
-                start_new_word = True
-            else:
-                if start_new_word:
-                    output.append([])
-                start_new_word = False
-                output[-1].append(char)
-            i += 1
-
-        return ["".join(x) for x in output]
-
-    def _tokenize_chinese_chars(self, text):
-        """Adds whitespace around any CJK character."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if self._is_chinese_char(cp):
-                output.append(" ")
-                output.append(char)
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-    def _is_chinese_char(self, cp):
-        """Checks whether CP is the codepoint of a CJK character."""
-        # This defines a "chinese character" as anything in the CJK Unicode block:
-        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-        #
-        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-        # despite its name. The modern Korean Hangul alphabet is a different block,
-        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-        # space-separated words, so they are not treated specially and handled
-        # like the all of the other languages.
-        if (
-            (cp >= 0x4E00 and cp <= 0x9FFF)
-            or (cp >= 0x3400 and cp <= 0x4DBF)
-            or (cp >= 0x20000 and cp <= 0x2A6DF)
-            or (cp >= 0x2A700 and cp <= 0x2B73F)
-            or (cp >= 0x2B740 and cp <= 0x2B81F)
-            or (cp >= 0x2B820 and cp <= 0x2CEAF)
-            or (cp >= 0xF900 and cp <= 0xFAFF)
-            or (cp >= 0x2F800 and cp <= 0x2FA1F)
-        ):
-            return True
-
-        return False
-
-    def _clean_text(self, text):
-        """Performs invalid character removal and whitespace cleanup on text."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if cp == 0 or cp == 0xFFFD or _is_control(char):
-                continue
-            if _is_whitespace(char):
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-
-# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
-class WordpieceTokenizer:
-    """Runs WordPiece tokenization."""
-
-    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
-        self.vocab = vocab
-        self.unk_token = unk_token
-        self.max_input_chars_per_word = max_input_chars_per_word
-
-    def tokenize(self, text):
-        """
-        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
-        tokenization using the given vocabulary.
-
-        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
-
-        Args:
-            text: A single token or whitespace separated tokens. This should have
-                already been passed through *BasicTokenizer*.
-
-        Returns:
-            A list of wordpiece tokens.
-        """
-
-        output_tokens = []
-        for token in whitespace_tokenize(text):
-            chars = list(token)
-            if len(chars) > self.max_input_chars_per_word:
-                output_tokens.append(self.unk_token)
-                continue
-
-            is_bad = False
-            start = 0
-            sub_tokens = []
-            while start < len(chars):
-                end = len(chars)
-                cur_substr = None
-                while start < end:
-                    substr = "".join(chars[start:end])
-                    if start > 0:
-                        substr = "##" + substr
-                    if substr in self.vocab:
-                        cur_substr = substr
-                        break
-                    end -= 1
-                if cur_substr is None:
-                    is_bad = True
-                    break
-                sub_tokens.append(cur_substr)
-                start = end
-
-            if is_bad:
-                output_tokens.append(self.unk_token)
-            else:
-                output_tokens.extend(sub_tokens)
-        return output_tokens
+    pass
 
 
 __all__ = ["ConvBertTokenizer"]
diff --git a/src/transformers/models/convbert/tokenization_convbert_fast.py b/src/transformers/models/convbert/tokenization_convbert_fast.py
deleted file mode 100644
index e328e7490f4d..000000000000
--- a/src/transformers/models/convbert/tokenization_convbert_fast.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# coding=utf-8
-# Copyright The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for ConvBERT."""
-
-import json
-from typing import Optional
-
-from tokenizers import normalizers
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .tokenization_convbert import ConvBertTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
-
-# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast with bert-base-cased->YituTech/conv-bert-base, Bert->ConvBert, BERT->ConvBERT
-class ConvBertTokenizerFast(PreTrainedTokenizerFast):
-    r"""
-    Construct a "fast" ConvBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        clean_text (`bool`, *optional*, defaults to `True`):
-            Whether or not to clean the text before tokenization by removing any control characters and replacing all
-            whitespaces by the classic one.
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
-            issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original ConvBERT).
-        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
-            The prefix for subwords.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = ConvBertTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        do_lower_case=True,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            do_lower_case=do_lower_case,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            tokenize_chinese_chars=tokenize_chinese_chars,
-            strip_accents=strip_accents,
-            **kwargs,
-        )
-
-        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
-        if (
-            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
-            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
-            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
-        ):
-            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
-            normalizer_state["lowercase"] = do_lower_case
-            normalizer_state["strip_accents"] = strip_accents
-            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
-            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
-
-        self.do_lower_case = do_lower_case
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A ConvBERT sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-
-        if token_ids_1 is not None:
-            output += token_ids_1 + [self.sep_token_id]
-
-        return output
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-
-__all__ = ["ConvBertTokenizerFast"]
diff --git a/src/transformers/models/cpm/__init__.py b/src/transformers/models/cpm/__init__.py
index aaf4524671fd..16c8e646ab1b 100644
--- a/src/transformers/models/cpm/__init__.py
+++ b/src/transformers/models/cpm/__init__.py
@@ -19,7 +19,6 @@
 
 if TYPE_CHECKING:
     from .tokenization_cpm import *
-    from .tokenization_cpm_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/cpm/tokenization_cpm.py b/src/transformers/models/cpm/tokenization_cpm.py
index 5ecfedd0a614..e5560e0efe4d 100644
--- a/src/transformers/models/cpm/tokenization_cpm.py
+++ b/src/transformers/models/cpm/tokenization_cpm.py
@@ -21,7 +21,7 @@
 
 import sentencepiece as spm
 
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_python import AddedToken, PreTrainedTokenizer
 from ...utils import SPIECE_UNDERLINE, logging
 from ...utils.import_utils import requires
 
@@ -157,23 +157,19 @@ def __init__(
         self._pad_token_type_id = 3
 
     @property
-    # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.vocab_size
     def vocab_size(self):
         return len(self.sp_model)
 
-    # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.get_vocab
     def get_vocab(self):
         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
         vocab.update(self.added_tokens_encoder)
         return vocab
 
-    # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.__getstate__
     def __getstate__(self):
         state = self.__dict__.copy()
         state["sp_model"] = None
         return state
 
-    # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.__setstate__
     def __setstate__(self, d):
         self.__dict__ = d
 
@@ -184,7 +180,6 @@ def __setstate__(self, d):
         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
         self.sp_model.Load(self.vocab_file)
 
-    # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.preprocess_text
     def preprocess_text(self, inputs):
         if self.remove_space:
             outputs = " ".join(inputs.strip().split())
@@ -200,7 +195,6 @@ def preprocess_text(self, inputs):
 
         return outputs
 
-    # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer._tokenize
     def _tokenize(self, text: str) -> list[str]:
         """Tokenize a string."""
         text = self.preprocess_text(text)
@@ -221,23 +215,19 @@ def _tokenize(self, text: str) -> list[str]:
 
         return new_pieces
 
-    # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer._convert_token_to_id
     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""
         return self.sp_model.PieceToId(token)
 
-    # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer._convert_id_to_token
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
         return self.sp_model.IdToPiece(index)
 
-    # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.convert_tokens_to_string
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (strings for sub-words) in a single string."""
         out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
         return out_string
 
-    # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.build_inputs_with_special_tokens
     def build_inputs_with_special_tokens(
         self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
     ) -> list[int]:
@@ -263,7 +253,6 @@ def build_inputs_with_special_tokens(
             return token_ids_0 + sep + cls
         return token_ids_0 + sep + token_ids_1 + sep + cls
 
-    # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.get_special_tokens_mask
     def get_special_tokens_mask(
         self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
     ) -> list[int]:
@@ -292,7 +281,6 @@ def get_special_tokens_mask(
             return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
         return ([0] * len(token_ids_0)) + [1, 1]
 
-    # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.create_token_type_ids_from_sequences
     def create_token_type_ids_from_sequences(
         self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
     ) -> list[int]:
@@ -323,7 +311,6 @@ def create_token_type_ids_from_sequences(
             return len(token_ids_0 + sep) * [0] + cls_segment_id
         return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
 
-    # Copied from transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer.save_vocabulary
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
         if not os.path.isdir(save_directory):
             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
diff --git a/src/transformers/models/cpm/tokenization_cpm_fast.py b/src/transformers/models/cpm/tokenization_cpm_fast.py
index 3e828ca9e0b5..4757a520b0d9 100644
--- a/src/transformers/models/cpm/tokenization_cpm_fast.py
+++ b/src/transformers/models/cpm/tokenization_cpm_fast.py
@@ -18,7 +18,7 @@
 from shutil import copyfile
 from typing import Optional
 
-from ...tokenization_utils_fast import AddedToken, PreTrainedTokenizerFast
+from ...tokenization_utils_tokenizers import AddedToken, PreTrainedTokenizerFast
 from ...utils import logging
 
 
@@ -144,7 +144,6 @@ def __init__(
         self.jieba = rjieba
         self.translator = str.maketrans(" \n", "\u2582\u2583")
 
-    # Copied from transformers.models.xlnet.tokenization_xlnet_fast.XLNetTokenizerFast.build_inputs_with_special_tokens
     def build_inputs_with_special_tokens(
         self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
     ) -> list[int]:
@@ -170,7 +169,6 @@ def build_inputs_with_special_tokens(
             return token_ids_0 + sep + cls
         return token_ids_0 + sep + token_ids_1 + sep + cls
 
-    # Copied from transformers.models.xlnet.tokenization_xlnet_fast.XLNetTokenizerFast.create_token_type_ids_from_sequences
     def create_token_type_ids_from_sequences(
         self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
     ) -> list[int]:
@@ -201,7 +199,6 @@ def create_token_type_ids_from_sequences(
             return len(token_ids_0 + sep) * [0] + cls_segment_id
         return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
 
-    # Copied from transformers.models.xlnet.tokenization_xlnet_fast.XLNetTokenizerFast.save_vocabulary
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
         if not self.can_save_slow_tokenizer:
             raise ValueError(
diff --git a/src/transformers/models/cpmant/tokenization_cpmant.py b/src/transformers/models/cpmant/tokenization_cpmant.py
index 38cd9f0c6a25..e620be4c67e0 100644
--- a/src/transformers/models/cpmant/tokenization_cpmant.py
+++ b/src/transformers/models/cpmant/tokenization_cpmant.py
@@ -24,7 +24,7 @@
 if is_rjieba_available():
     import rjieba
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_python import PreTrainedTokenizer
 from ...utils import logging
 
 
@@ -144,8 +144,16 @@ def __init__(
             line_token=line_token,
             space_token=space_token,
             padding_side=padding_side,
+            token_type_ids_pattern="all_zeros",
+            token_type_ids_include_special_tokens=True,
+            special_tokens_pattern="bos",
             **kwargs,
         )
+        for special_token in [space_token, line_token]:
+            token_id = self.added_tokens_encoder.pop(special_token, None)
+            if token_id is not None:
+                self._added_tokens_decoder.pop(token_id, None)
+        self._update_total_vocab_size()
 
     @property
     def bod_token_id(self):
@@ -222,51 +230,5 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
                 index += 1
         return (vocab_file,)
 
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A CPMAnt sequence has the following format:
-
-        - single sequence: `[BOS] Sequence`.
-
-        Args:
-            token_ids_0 (`list[int]`): The first tokenized sequence that special tokens will be added.
-            token_ids_1 (`list[int]`): The optional second tokenized sequence that special tokens will be added.
-
-        Returns:
-            `list[int]`: The model input with special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.bos_token_id] + token_ids_0
-        return [self.bos_token_id] + token_ids_0 + [self.bos_token_id] + token_ids_1
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`): List of IDs.
-            token_ids_1 (`list[int]`, *optional*): Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
-        return [1] + ([0] * len(token_ids_0))
-
 
 __all__ = ["CpmAntTokenizer"]
diff --git a/src/transformers/models/ctrl/tokenization_ctrl.py b/src/transformers/models/ctrl/tokenization_ctrl.py
index 5b7935e6404d..cc1d6d3e5f10 100644
--- a/src/transformers/models/ctrl/tokenization_ctrl.py
+++ b/src/transformers/models/ctrl/tokenization_ctrl.py
@@ -15,12 +15,10 @@
 """Tokenization classes for Salesforce CTRL."""
 
 import json
-import os
-from typing import Optional
 
 import regex as re
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_python import PreTrainedTokenizer
 from ...utils import logging
 
 
@@ -136,7 +134,14 @@ def __init__(self, vocab_file, merges_file, unk_token="", **kwargs):
         merges = [tuple(merge.split()) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
-        super().__init__(unk_token=unk_token, **kwargs)
+        self.add_bpe_version_header = True
+        super().__init__(
+            unk_token=unk_token,
+            token_type_ids_pattern="all_zeros",
+            token_type_ids_include_special_tokens=True,
+            special_tokens_pattern="none",
+            **kwargs,
+        )
 
     @property
     def vocab_size(self):
@@ -212,35 +217,6 @@ def convert_tokens_to_string(self, tokens):
         out_string = " ".join(tokens).replace("@@ ", "").strip()
         return out_string
 
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-        merge_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
-        )
-
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
-
-        return vocab_file, merge_file
-
     # def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
     #     filtered_tokens = ' '.join(self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens))
     #     tokens_generated_so_far = re.sub('(@@ )', '', string=filtered_tokens)
diff --git a/src/transformers/models/deberta/__init__.py b/src/transformers/models/deberta/__init__.py
index ac2dbc3af259..c7bc4d20ac9e 100644
--- a/src/transformers/models/deberta/__init__.py
+++ b/src/transformers/models/deberta/__init__.py
@@ -21,7 +21,6 @@
     from .configuration_deberta import *
     from .modeling_deberta import *
     from .tokenization_deberta import *
-    from .tokenization_deberta_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/deberta/tokenization_deberta.py b/src/transformers/models/deberta/tokenization_deberta.py
index 74e958c8030b..4e554c6e0be9 100644
--- a/src/transformers/models/deberta/tokenization_deberta.py
+++ b/src/transformers/models/deberta/tokenization_deberta.py
@@ -12,66 +12,24 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization class for model DeBERTa."""
+"""Fast Tokenization class for model DeBERTa."""
 
-import json
-import os
-from typing import Optional
+from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, processors
+from tokenizers.models import BPE
 
-import regex as re
-
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
 
 
 logger = logging.get_logger(__name__)
 
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
-
-
-# Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
-    characters the bpe code barfs on.
-
-    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
-    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
-    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
-    tables between utf-8 bytes and unicode strings.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-# Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs
-def get_pairs(word):
-    """
-    Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
 
 
-class DebertaTokenizer(PreTrainedTokenizer):
+class DebertaTokenizer(TokenizersBackend):
     """
-    Construct a DeBERTa tokenizer. Based on byte-level Byte-Pair-Encoding.
+    Construct a "fast" DeBERTa tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
+    Byte-Pair-Encoding.
 
     This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
     be encoded differently whether it is at the beginning of the sentence (without space) or not:
@@ -87,23 +45,25 @@ class DebertaTokenizer(PreTrainedTokenizer):
     [1, 20920, 232, 2]
     ```
 
-    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
-    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
+    the model was not pretrained this way, it might yield a decrease in performance.
 
     
 
-    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
+    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
 
     
 
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
+        vocab_file (`str`, *optional*):
             Path to the vocabulary file.
-        merges_file (`str`):
+        merges_file (`str`, *optional*):
             Path to the merges file.
+        tokenizer_file (`str`, *optional*):
+            The path to a tokenizer file to use instead of the vocab file.
         errors (`str`, *optional*, defaults to `"replace"`):
             Paradigm to follow when decoding bytes to UTF-8. See
             [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
@@ -129,9 +89,6 @@ class DebertaTokenizer(PreTrainedTokenizer):
         add_prefix_space (`bool`, *optional*, defaults to `False`):
             Whether or not to add an initial space to the input. This allows to treat the leading word just as any
             other word. (Deberta tokenizer detect beginning of words by the preceding space).
-        add_bos_token (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an initial <|endoftext|> to the input. This allows to treat the leading word just as
-            any other word.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -139,8 +96,9 @@ class DebertaTokenizer(PreTrainedTokenizer):
 
     def __init__(
         self,
-        vocab_file,
-        merges_file,
+        vocab_file=None,
+        vocab=None,
+        merges=None,
         errors="replace",
         bos_token="[CLS]",
         eos_token="[SEP]",
@@ -150,37 +108,50 @@ def __init__(
         pad_token="[PAD]",
         mask_token="[MASK]",
         add_prefix_space=False,
-        add_bos_token=False,
         **kwargs,
     ):
-        bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
-        sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
-        cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
-        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
-
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-        self.add_bos_token = add_bos_token
-
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            bpe_merges = merges_handle.read().split("\n")[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        self.cache = {}
+        self.vocab_file = vocab_file
         self.add_prefix_space = add_prefix_space
 
-        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
-        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+        if vocab is not None:
+            self._vocab = (
+                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
+            )
+        else:
+            self._vocab = {
+                str(unk_token): 0,
+                str(cls_token): 1,
+                str(sep_token): 2,
+                str(pad_token): 3,
+                str(mask_token): 4,
+            }
+
+        if merges is not None and isinstance(merges, list) and len(merges) > 0:
+            self._merges = [tuple(m) if isinstance(m, list) else m for m in merges]
+        else:
+            self._merges = []
+
+        self._tokenizer = Tokenizer(
+            BPE(
+                vocab=self._vocab,
+                merges=self._merges,
+                dropout=None,
+                unk_token=None,
+                continuing_subword_prefix="",
+                end_of_word_suffix="",
+                fuse_unk=False,
+            )
+        )
+
+        self._tokenizer.normalizer = None
+
+        self._tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
+        self._tokenizer.decoder = decoders.ByteLevel()
+
+        tokenizer_object = self._tokenizer
 
         super().__init__(
+            tokenizer_object=tokenizer_object,
             errors=errors,
             bos_token=bos_token,
             eos_token=eos_token,
@@ -190,177 +161,44 @@ def __init__(
             pad_token=pad_token,
             mask_token=mask_token,
             add_prefix_space=add_prefix_space,
-            add_bos_token=add_bos_token,
             **kwargs,
         )
 
-    @property
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.vocab_size
-    def vocab_size(self):
-        return len(self.encoder)
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab
-    def get_vocab(self):
-        return dict(self.encoder, **self.added_tokens_encoder)
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                except ValueError:
-                    new_word.extend(word[i:])
-                    break
-                else:
-                    new_word.extend(word[i:j])
-                    i = j
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        self.cache[token] = word
-        return word
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A DeBERTa sequence has the following format:
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"{self.cls_token} $A {self.sep_token}",
+            pair=f"{self.cls_token} $A {self.sep_token} {self.sep_token} $B {self.sep_token}",
+            special_tokens=[
+                (self.cls_token, self.cls_token_id),
+                (self.sep_token, self.sep_token_id),
+            ],
+        )
 
-        - single sequence: [CLS] X [SEP]
-        - pair of sequences: [CLS] A [SEP] B [SEP]
+        self._post_init()
 
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
+    @property
+    def mask_token(self) -> str:
+        """
+        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
+        having been set.
 
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        Deberta tokenizer has a special mask token to be used in the fill-mask pipeline. The mask token will greedily
+        comprise the space before the *[MASK]*.
         """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
+        if self._mask_token is None:
+            if self.verbose:
+                logger.error("Using mask_token, but it is not set yet.")
+            return None
+        return str(self._mask_token)
+
+    @mask_token.setter
+    def mask_token(self, value):
         """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        Overriding the default behavior of the mask token to have it eat the space before it.
         """
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize
-    def _tokenize(self, text):
-        """Tokenize a string."""
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            token = "".join(
-                self.byte_encoder[b] for b in token.encode("utf-8")
-            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
-            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
-        return bpe_tokens
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index)
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        text = "".join(tokens)
-        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
-        return text
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-        merge_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
-        )
-
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
-
-        return vocab_file, merge_file
-
-    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
-        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
-        if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
-            text = " " + text
-        return (text, kwargs)
+        # Mask token behave like a normal word, i.e. include the space before it
+        # So we set lstrip to True
+        value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
+        self._mask_token = value
 
 
 __all__ = ["DebertaTokenizer"]
diff --git a/src/transformers/models/deberta/tokenization_deberta_fast.py b/src/transformers/models/deberta/tokenization_deberta_fast.py
deleted file mode 100644
index c2f2e6552d9d..000000000000
--- a/src/transformers/models/deberta/tokenization_deberta_fast.py
+++ /dev/null
@@ -1,209 +0,0 @@
-# coding=utf-8
-# Copyright 2020 Microsoft and the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Fast Tokenization class for model DeBERTa."""
-
-from typing import Optional
-
-from ...tokenization_utils_base import AddedToken, BatchEncoding
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .tokenization_deberta import DebertaTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
-
-
-class DebertaTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" DeBERTa tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
-    Byte-Pair-Encoding.
-
-    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
-    be encoded differently whether it is at the beginning of the sentence (without space) or not:
-
-    ```python
-    >>> from transformers import DebertaTokenizerFast
-
-    >>> tokenizer = DebertaTokenizerFast.from_pretrained("microsoft/deberta-base")
-    >>> tokenizer("Hello world")["input_ids"]
-    [1, 31414, 232, 2]
-
-    >>> tokenizer(" Hello world")["input_ids"]
-    [1, 20920, 232, 2]
-    ```
-
-    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
-    the model was not pretrained this way, it might yield a decrease in performance.
-
-    
-
-    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
-
-    
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`, *optional*):
-            Path to the vocabulary file.
-        merges_file (`str`, *optional*):
-            Path to the merges file.
-        tokenizer_file (`str`, *optional*):
-            The path to a tokenizer file to use instead of the vocab file.
-        errors (`str`, *optional*, defaults to `"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See
-            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
-        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The beginning of sequence token.
-        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The end of sequence token.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        add_prefix_space (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word. (Deberta tokenizer detect beginning of words by the preceding space).
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
-    slow_tokenizer_class = DebertaTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        merges_file=None,
-        tokenizer_file=None,
-        errors="replace",
-        bos_token="[CLS]",
-        eos_token="[SEP]",
-        sep_token="[SEP]",
-        cls_token="[CLS]",
-        unk_token="[UNK]",
-        pad_token="[PAD]",
-        mask_token="[MASK]",
-        add_prefix_space=False,
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file,
-            merges_file,
-            tokenizer_file=tokenizer_file,
-            errors=errors,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            pad_token=pad_token,
-            mask_token=mask_token,
-            add_prefix_space=add_prefix_space,
-            **kwargs,
-        )
-        self.add_bos_token = kwargs.pop("add_bos_token", False)
-
-    @property
-    def mask_token(self) -> str:
-        """
-        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
-        having been set.
-
-        Deberta tokenizer has a special mask token to be used in the fill-mask pipeline. The mask token will greedily
-        comprise the space before the *[MASK]*.
-        """
-        if self._mask_token is None:
-            if self.verbose:
-                logger.error("Using mask_token, but it is not set yet.")
-            return None
-        return str(self._mask_token)
-
-    @mask_token.setter
-    def mask_token(self, value):
-        """
-        Overriding the default behavior of the mask token to have it eat the space before it.
-        """
-        # Mask token behave like a normal word, i.e. include the space before it
-        # So we set lstrip to True
-        value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
-        self._mask_token = value
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A DeBERTa sequence has the following format:
-
-        - single sequence: [CLS] X [SEP]
-        - pair of sequences: [CLS] A [SEP] B [SEP]
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast._batch_encode_plus
-    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-        assert self.add_prefix_space or not is_split_into_words, (
-            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
-            "to use it with pretokenized inputs."
-        )
-
-        return super()._batch_encode_plus(*args, **kwargs)
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast._encode_plus
-    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-
-        assert self.add_prefix_space or not is_split_into_words, (
-            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
-            "to use it with pretokenized inputs."
-        )
-
-        return super()._encode_plus(*args, **kwargs)
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-
-__all__ = ["DebertaTokenizerFast"]
diff --git a/src/transformers/models/deberta_v2/__init__.py b/src/transformers/models/deberta_v2/__init__.py
index 929b26e60ae0..7c8e16f4a9ff 100644
--- a/src/transformers/models/deberta_v2/__init__.py
+++ b/src/transformers/models/deberta_v2/__init__.py
@@ -21,7 +21,6 @@
     from .configuration_deberta_v2 import *
     from .modeling_deberta_v2 import *
     from .tokenization_deberta_v2 import *
-    from .tokenization_deberta_v2_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
index e192474c3dcd..0c15913d94b8 100644
--- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
@@ -12,43 +12,49 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization class for model DeBERTa."""
+"""Tokenization class for model DeBERTa-v2."""
 
-import os
-import unicodedata
-from typing import Any, Optional
+from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers
+from tokenizers.models import Unigram
 
-import sentencepiece as sp
-
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
-from ...utils.import_utils import requires
 
 
 logger = logging.get_logger(__name__)
 
+VOCAB_FILES_NAMES = {"vocab_file": "spm.model", "tokenizer_file": "tokenizer.json"}
+
 
-VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}
+def _get_prepend_scheme(add_prefix_space: bool) -> str:
+    if add_prefix_space:
+        return "always"
+    else:
+        return "first"
 
 
-@requires(backends=("sentencepiece",))
-class DebertaV2Tokenizer(PreTrainedTokenizer):
-    r"""
-    Constructs a DeBERTa-v2 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
+class DebertaV2Tokenizer(TokenizersBackend):
+    """
+    Construct a DeBERTa-v2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on Unigram tokenization.
+
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
-            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
-            contains the vocabulary necessary to instantiate a tokenizer.
+        vocab_file (`str`, *optional*):
+            Path to the vocabulary file (SentencePiece model file). Not used directly but kept for compatibility.
+        vocab (`list`, *optional*):
+            List of tuples (piece, score) for the vocabulary.
+        precompiled_charsmap (`bytes`, *optional*):
+            Precompiled character map for normalization.
         do_lower_case (`bool`, *optional*, defaults to `False`):
             Whether or not to lowercase the input when tokenizing.
-        bos_token (`string`, *optional*, defaults to `"[CLS]"`):
-            The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-        eos_token (`string`, *optional*, defaults to `"[SEP]"`):
-            The end of sequence token. When building a sequence using special tokens, this is not the token that is
-            used for the end of sequence. The token used is the `sep_token`.
+        split_by_punct (`bool`, *optional*, defaults to `False`):
+            Whether to split by punctuation.
+        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The end of sequence token.
         unk_token (`str`, *optional*, defaults to `"[UNK]"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
@@ -64,28 +70,20 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
         mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        sp_model_kwargs (`dict`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
-            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
-            to set:
-
-            - `enable_sampling`: Enable subword regularization.
-            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
-
-              - `nbest_size = {0,1}`: No sampling is performed.
-              - `nbest_size > 1`: samples from the nbest_size results.
-              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
-                using forward-filtering-and-backward-sampling algorithm.
-
-            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
-              BPE-dropout.
+        add_prefix_space (`bool`, *optional*, defaults to `True`):
+            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
+            other word.
+        unk_id (`int`, *optional*, defaults to index of `unk_token` in vocab):
+            The ID of the unknown token in the vocabulary.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
 
     def __init__(
         self,
-        vocab_file,
+        vocab_file=None,
+        vocab=None,
         do_lower_case=False,
         split_by_punct=False,
         bos_token="[CLS]",
@@ -95,405 +93,83 @@ def __init__(
         pad_token="[PAD]",
         cls_token="[CLS]",
         mask_token="[MASK]",
-        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        add_prefix_space=True,
+        unk_id=3,
         **kwargs,
-    ) -> None:
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-
-        if not os.path.isfile(vocab_file):
-            raise ValueError(
-                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
-                " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
-            )
-        self.do_lower_case = do_lower_case
-        self.split_by_punct = split_by_punct
-        self.vocab_file = vocab_file
-        self._tokenizer = SPMTokenizer(
-            vocab_file, None, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs
-        )
-        unk_token = AddedToken(unk_token, normalized=True, special=True) if isinstance(unk_token, str) else unk_token
-        super().__init__(
-            do_lower_case=do_lower_case,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            split_by_punct=split_by_punct,
-            sp_model_kwargs=self.sp_model_kwargs,
-            **kwargs,
-        )
-        self._tokenizer.special_tokens = self.all_special_tokens
-
-    @property
-    def vocab_size(self):
-        return len(self.vocab)
-
-    @property
-    def vocab(self):
-        return self._tokenizer.vocab
-
-    def get_vocab(self):
-        vocab = self.vocab.copy()
-        vocab.update(self.get_added_vocab())
-        return vocab
-
-    def _tokenize(self, text: str) -> list[str]:
-        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
-        if self.do_lower_case:
-            text = text.lower()
-        return self._tokenizer.tokenize(text)
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self._tokenizer.spm.PieceToId(token)
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self._tokenizer.spm.IdToPiece(index) if index < self.vocab_size else self.unk_token
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        return self._tokenizer.decode(tokens)
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A DeBERTa sequence has the following format:
-
-        - single sequence: [CLS] X [SEP]
-        - pair of sequences: [CLS] A [SEP] B [SEP]
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
-        add_prefix_space = kwargs.pop("add_prefix_space", False)
-        if is_split_into_words or add_prefix_space:
-            text = " " + text
-        return (text, kwargs)
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        return self._tokenizer.save_pretrained(save_directory, filename_prefix=filename_prefix)
-
-
-class SPMTokenizer:
-    r"""
-    Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece).
-
-    Args:
-        vocab_file (`str`):
-            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
-            contains the vocabulary necessary to instantiate a tokenizer.
-        sp_model_kwargs (`dict`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
-            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
-            to set:
-
-            - `enable_sampling`: Enable subword regularization.
-            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
-
-              - `nbest_size = {0,1}`: No sampling is performed.
-              - `nbest_size > 1`: samples from the nbest_size results.
-              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
-                using forward-filtering-and-backward-sampling algorithm.
-
-            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
-              BPE-dropout.
-    """
-
-    def __init__(
-        self, vocab_file, special_tokens, split_by_punct=False, sp_model_kwargs: Optional[dict[str, Any]] = None
     ):
-        self.split_by_punct = split_by_punct
         self.vocab_file = vocab_file
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-        spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
-        if not os.path.exists(vocab_file):
-            raise FileNotFoundError(f"{vocab_file} does not exist!")
-        spm.load(vocab_file)
-        bpe_vocab_size = spm.GetPieceSize()
-        # Token map
-        #  0+1
-        #  1+1
-        #  2+1
-        self.vocab = {spm.IdToPiece(i): i for i in range(bpe_vocab_size)}
-        self.ids_to_tokens = [spm.IdToPiece(i) for i in range(bpe_vocab_size)]
-        # self.vocab['[PAD]'] = 0
-        # self.vocab['[CLS]'] = 1
-        # self.vocab['[SEP]'] = 2
-        # self.vocab['[UNK]'] = 3
-
-        self.spm = spm
-        self.special_tokens = special_tokens
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["spm"] = None
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__ = d
-
-        # for backward compatibility
-        if not hasattr(self, "sp_model_kwargs"):
-            self.sp_model_kwargs = {}
-
-        self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.spm.Load(self.vocab_file)
-
-    def tokenize(self, text):
-        return self._encode_as_pieces(text)
-
-    def convert_ids_to_tokens(self, ids):
-        tokens = []
-        for i in ids:
-            tokens.append(self.ids_to_tokens[i])
-        return tokens
+        self.do_lower_case = do_lower_case
+        self.split_by_punct = split_by_punct
+        self.add_prefix_space = add_prefix_space
+
+        if vocab is None:
+            self._vocab = [
+                (str(pad_token), 0.0),
+                (str(unk_token), 0.0),
+                (str(bos_token), 0.0),
+                (str(eos_token), 0.0),
+                (str(sep_token), 0.0),
+                (str(cls_token), 0.0),
+                (str(mask_token), 0.0),
+            ]
 
-    def decode(self, tokens, start=-1, end=-1, raw_text=None):
-        if raw_text is None:
-            current_sub_tokens = []
-            out_string = ""
-            prev_is_special = False
-            for token in tokens:
-                # make sure that special tokens are not decoded using sentencepiece model
-                if token in self.special_tokens:
-                    if not prev_is_special:
-                        out_string += " "
-                    out_string += self.spm.decode_pieces(current_sub_tokens) + token
-                    prev_is_special = True
-                    current_sub_tokens = []
-                else:
-                    current_sub_tokens.append(token)
-                    prev_is_special = False
-            out_string += self.spm.decode_pieces(current_sub_tokens)
-            return out_string.strip()
         else:
-            words = self.split_to_words(raw_text)
-            word_tokens = [self.tokenize(w) for w in words]
-            token2words = [0] * len(tokens)
-            tid = 0
-            for i, w in enumerate(word_tokens):
-                for k, t in enumerate(w):
-                    token2words[tid] = i
-                    tid += 1
-            word_start = token2words[start]
-            word_end = token2words[end] if end < len(tokens) else len(words)
-            text = "".join(words[word_start:word_end])
-            return text
-
-    # TODO add a deprecation cycle as this can have different behaviour from our API
-    def add_special_token(self, token):
-        if token not in self.special_tokens:
-            self.special_tokens.append(token)
-            if token not in self.vocab:
-                self.vocab[token] = len(self.vocab) - 1
-                self.ids_to_tokens.append(token)
-        return self.id(token)
-
-    def part_of_whole_word(self, token, is_bos=False):
-        logger.warning_once(
-            "The `DebertaTokenizer.part_of_whole_word` method is deprecated and will be removed in `transformers==4.35`"
+            self._vocab = [tuple(item) if not isinstance(item, tuple) else item for item in vocab]
+            computed_unk_id = {piece: i for i, (piece, _score) in enumerate(self._vocab)}
+            unk_id = computed_unk_id.get(str(unk_token))
+
+        self._tokenizer = Tokenizer(
+            Unigram(
+                self._vocab,
+                unk_id=unk_id,
+                byte_fallback=False,
+            )
         )
-        if is_bos:
-            return True
-        if (
-            len(token) == 1
-            and (_is_whitespace(list(token)[0]) or _is_control(list(token)[0]) or _is_punctuation(list(token)[0]))
-        ) or token in self.special_tokens:
-            return False
-
-        word_start = b"\xe2\x96\x81".decode("utf-8")
-        return not token.startswith(word_start)
-
-    def pad(self):
-        return "[PAD]"
-
-    def bos(self):
-        return "[CLS]"
-
-    def eos(self):
-        return "[SEP]"
-
-    def unk(self):
-        return "[UNK]"
-
-    def mask(self):
-        return "[MASK]"
 
-    def sym(self, id):
-        return self.ids_to_tokens[id]
-
-    def id(self, sym):
-        logger.warning_once(
-            "The `DebertaTokenizer.id` method is deprecated and will be removed in `transformers==4.35`"
+        list_normalizers = []
+        if do_lower_case:
+            list_normalizers.append(normalizers.Lowercase())
+
+        list_normalizers.extend(
+            [
+                normalizers.Replace("\n", " "),
+                normalizers.Replace("\r", " "),
+                normalizers.Replace("\t", " "),
+                normalizers.Replace(Regex(r" {2,}"), " "),
+                normalizers.NFC(),
+                normalizers.Strip(left=False, right=True),
+            ]
         )
-        return self.vocab.get(sym, 1)
-
-    def _encode_as_pieces(self, text):
-        text = convert_to_unicode(text)
-        if self.split_by_punct:
-            words = self._run_split_on_punc(text)
-            pieces = [self.spm.encode(w, out_type=str) for w in words]
-            return [p for w in pieces for p in w]
-        else:
-            return self.spm.encode(text, out_type=str)
-
-    def split_to_words(self, text):
-        pieces = self._encode_as_pieces(text)
-        word_start = b"\xe2\x96\x81".decode("utf-8")
-        words = []
-        offset = 0
-        prev_end = 0
-        for i, p in enumerate(pieces):
-            if p.startswith(word_start):
-                if offset > prev_end:
-                    words.append(text[prev_end:offset])
-                prev_end = offset
-                w = p.replace(word_start, "")
-            else:
-                w = p
-            try:
-                s = text.index(w, offset)
-                pn = ""
-                k = i + 1
-                while k < len(pieces):
-                    pn = pieces[k].replace(word_start, "")
-                    if len(pn) > 0:
-                        break
-                    k += 1
-
-                if len(pn) > 0 and pn in text[offset:s]:
-                    offset = offset + 1
-                else:
-                    offset = s + len(w)
-            except Exception:
-                offset = offset + 1
-
-        if prev_end < offset:
-            words.append(text[prev_end:offset])
-
-        return words
+        self._tokenizer.normalizer = normalizers.Sequence(list_normalizers)
 
-    def _run_split_on_punc(self, text):
-        """Splits punctuation on a piece of text."""
-        chars = list(text)
-        i = 0
-        start_new_word = True
-        output = []
-        while i < len(chars):
-            char = chars[i]
-            if _is_punctuation(char):
-                output.append([char])
-                start_new_word = True
-            else:
-                if start_new_word:
-                    output.append([])
-                start_new_word = False
-                output[-1].append(char)
-            i += 1
+        list_pretokenizers = []
+        if split_by_punct:
+            list_pretokenizers.append(pre_tokenizers.Punctuation(behavior="isolated"))
 
-        return ["".join(x) for x in output]
+        prepend_scheme = _get_prepend_scheme(add_prefix_space)
+        list_pretokenizers.append(pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=prepend_scheme))
 
-    def save_pretrained(self, path: str, filename_prefix: Optional[str] = None):
-        filename = VOCAB_FILES_NAMES[list(VOCAB_FILES_NAMES.keys())[0]]
-        if filename_prefix is not None:
-            filename = filename_prefix + "-" + filename
-        full_path = os.path.join(path, filename)
-        with open(full_path, "wb") as fs:
-            fs.write(self.spm.serialized_model_proto())
-        return (full_path,)
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(list_pretokenizers)
 
+        self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)
 
-def _is_whitespace(char):
-    """Checks whether `chars` is a whitespace character."""
-    # \t, \n, and \r are technically control characters but we treat them
-    # as whitespace since they are generally considered as such.
-    if char == " " or char == "\t" or char == "\n" or char == "\r":
-        return True
-    cat = unicodedata.category(char)
-    if cat == "Zs":
-        return True
-    return False
+        tokenizer_object = self._tokenizer
 
-
-def _is_control(char):
-    """Checks whether `chars` is a control character."""
-    # These are technically control characters but we count them as whitespace
-    # characters.
-    if char == "\t" or char == "\n" or char == "\r":
-        return False
-    cat = unicodedata.category(char)
-    if cat.startswith("C"):
-        return True
-    return False
-
-
-def _is_punctuation(char):
-    """Checks whether `chars` is a punctuation character."""
-    cp = ord(char)
-    # We treat all non-letter/number ASCII as punctuation.
-    # Characters such as "^", "$", and "`" are not in the Unicode
-    # Punctuation class but we treat them as punctuation anyways, for
-    # consistency.
-    if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
-        return True
-    cat = unicodedata.category(char)
-    if cat.startswith("P"):
-        return True
-    return False
-
-
-def convert_to_unicode(text):
-    """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
-    if isinstance(text, str):
-        return text
-    elif isinstance(text, bytes):
-        return text.decode("utf-8", "ignore")
-    else:
-        raise TypeError(f"Unsupported string type: {type(text)}")
+        super().__init__(
+            tokenizer_object=tokenizer_object,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            unk_id=unk_id,
+            do_lower_case=do_lower_case,
+            split_by_punct=split_by_punct,
+            add_prefix_space=add_prefix_space,
+            **kwargs,
+        )
 
 
 __all__ = ["DebertaV2Tokenizer"]
diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2_fast.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2_fast.py
deleted file mode 100644
index 1bad9684dd84..000000000000
--- a/src/transformers/models/deberta_v2/tokenization_deberta_v2_fast.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# coding=utf-8
-# Copyright 2020 Microsoft and the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Fast Tokenization class for model DeBERTa."""
-
-import os
-from shutil import copyfile
-from typing import Optional
-
-from ...file_utils import is_sentencepiece_available
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-
-
-if is_sentencepiece_available():
-    from .tokenization_deberta_v2 import DebertaV2Tokenizer
-else:
-    DebertaV2Tokenizer = None
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "spm.model", "tokenizer_file": "tokenizer.json"}
-
-
-class DebertaV2TokenizerFast(PreTrainedTokenizerFast):
-    r"""
-    Constructs a DeBERTa-v2 fast tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
-
-    Args:
-        vocab_file (`str`):
-            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
-            contains the vocabulary necessary to instantiate a tokenizer.
-        do_lower_case (`bool`, *optional*, defaults to `False`):
-            Whether or not to lowercase the input when tokenizing.
-        bos_token (`string`, *optional*, defaults to `"[CLS]"`):
-            The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-        eos_token (`string`, *optional*, defaults to `"[SEP]"`):
-            The end of sequence token. When building a sequence using special tokens, this is not the token that is
-            used for the end of sequence. The token used is the `sep_token`.
-        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        sp_model_kwargs (`dict`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
-            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
-            to set:
-
-            - `enable_sampling`: Enable subword regularization.
-            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
-
-              - `nbest_size = {0,1}`: No sampling is performed.
-              - `nbest_size > 1`: samples from the nbest_size results.
-              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
-                using forward-filtering-and-backward-sampling algorithm.
-
-            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
-              BPE-dropout.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = DebertaV2Tokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        do_lower_case=False,
-        split_by_punct=False,
-        bos_token="[CLS]",
-        eos_token="[SEP]",
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        **kwargs,
-    ) -> None:
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            do_lower_case=do_lower_case,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            split_by_punct=split_by_punct,
-            **kwargs,
-        )
-
-        self.do_lower_case = do_lower_case
-        self.split_by_punct = split_by_punct
-        self.vocab_file = vocab_file
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A DeBERTa sequence has the following format:
-
-        - single sequence: [CLS] X [SEP]
-        - pair of sequences: [CLS] A [SEP] B [SEP]
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not self.can_save_slow_tokenizer:
-            raise ValueError(
-                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
-                "tokenizer."
-            )
-
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
-
-
-__all__ = ["DebertaV2TokenizerFast"]
diff --git a/src/transformers/models/dia/tokenization_dia.py b/src/transformers/models/dia/tokenization_dia.py
index 4e205906ea70..6bc567889791 100644
--- a/src/transformers/models/dia/tokenization_dia.py
+++ b/src/transformers/models/dia/tokenization_dia.py
@@ -16,7 +16,7 @@
 
 from typing import Optional
 
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_python import AddedToken, PreTrainedTokenizer
 from ...utils import logging
 
 
@@ -63,6 +63,10 @@ def __init__(
             unk_token=unk_token,
             pad_token=pad_token,
             max_length=max_length,
+            offset=offset,
+            token_type_ids_pattern="all_zeros",
+            token_type_ids_include_special_tokens=True,
+            special_tokens_pattern="none",
             **kwargs,
         )
 
@@ -110,9 +114,5 @@ def convert_tokens_to_string(self, tokens: list[str]) -> str:
         string = bstring.decode("utf-8", errors="ignore")
         return string
 
-    # No vocab file
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        return ()
-
 
 __all__ = ["DiaTokenizer"]
diff --git a/src/transformers/models/distilbert/__init__.py b/src/transformers/models/distilbert/__init__.py
index 094524ab267f..0036fba6ed7f 100644
--- a/src/transformers/models/distilbert/__init__.py
+++ b/src/transformers/models/distilbert/__init__.py
@@ -20,8 +20,7 @@
 if TYPE_CHECKING:
     from .configuration_distilbert import *
     from .modeling_distilbert import *
-    from .tokenization_distilbert import *
-    from .tokenization_distilbert_fast import *
+    from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast
 else:
     import sys
 
diff --git a/src/transformers/models/distilbert/tokenization_distilbert.py b/src/transformers/models/distilbert/tokenization_distilbert.py
index 4e44468ab1d5..de3271c8cbc7 100644
--- a/src/transformers/models/distilbert/tokenization_distilbert.py
+++ b/src/transformers/models/distilbert/tokenization_distilbert.py
@@ -14,479 +14,17 @@
 # limitations under the License.
 """Tokenization classes for DistilBERT."""
 
-import collections
-import os
-import unicodedata
-from typing import Optional
+from ...models.bert.tokenization_bert import BertTokenizer
 
-from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
-from ...utils import logging
 
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
 
-logger = logging.get_logger(__name__)
 
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
-
-# Copied from transformers.models.bert.tokenization_bert.load_vocab
-def load_vocab(vocab_file):
-    """Loads a vocabulary file into a dictionary."""
-    vocab = collections.OrderedDict()
-    with open(vocab_file, "r", encoding="utf-8") as reader:
-        tokens = reader.readlines()
-    for index, token in enumerate(tokens):
-        token = token.rstrip("\n")
-        vocab[token] = index
-    return vocab
-
-
-# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
-def whitespace_tokenize(text):
-    """Runs basic whitespace cleaning and splitting on a piece of text."""
-    text = text.strip()
-    if not text:
-        return []
-    tokens = text.split()
-    return tokens
-
-
-class DistilBertTokenizer(PreTrainedTokenizer):
-    r"""
-    Construct a DistilBERT tokenizer. Based on WordPiece.
-
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
-            Whether or not to do basic tokenization before WordPiece.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
-        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original BERT).
-        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
-            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
-            extra spaces.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
+class DistilBertTokenizer(BertTokenizer):
     model_input_names = ["input_ids", "attention_mask"]
 
-    def __init__(
-        self,
-        vocab_file,
-        do_lower_case=True,
-        do_basic_tokenize=True,
-        never_split=None,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        clean_up_tokenization_spaces=True,
-        **kwargs,
-    ):
-        if not os.path.isfile(vocab_file):
-            raise ValueError(
-                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
-                " model use `tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
-            )
-        self.vocab = load_vocab(vocab_file)
-        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
-        self.do_basic_tokenize = do_basic_tokenize
-        if do_basic_tokenize:
-            self.basic_tokenizer = BasicTokenizer(
-                do_lower_case=do_lower_case,
-                never_split=never_split,
-                tokenize_chinese_chars=tokenize_chinese_chars,
-                strip_accents=strip_accents,
-            )
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
-
-        super().__init__(
-            do_lower_case=do_lower_case,
-            do_basic_tokenize=do_basic_tokenize,
-            never_split=never_split,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            tokenize_chinese_chars=tokenize_chinese_chars,
-            strip_accents=strip_accents,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            **kwargs,
-        )
-
-    @property
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case
-    def do_lower_case(self):
-        return self.basic_tokenizer.do_lower_case
-
-    @property
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.vocab_size
-    def vocab_size(self):
-        return len(self.vocab)
-
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_vocab
-    def get_vocab(self):
-        return dict(self.vocab, **self.added_tokens_encoder)
-
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._tokenize
-    def _tokenize(self, text, split_special_tokens=False):
-        split_tokens = []
-        if self.do_basic_tokenize:
-            for token in self.basic_tokenizer.tokenize(
-                text, never_split=self.all_special_tokens if not split_special_tokens else None
-            ):
-                # If the token is part of the never_split set
-                if token in self.basic_tokenizer.never_split:
-                    split_tokens.append(token)
-                else:
-                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
-        else:
-            split_tokens = self.wordpiece_tokenizer.tokenize(text)
-        return split_tokens
-
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_token_to_id
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.vocab.get(token, self.vocab.get(self.unk_token))
-
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_id_to_token
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.ids_to_tokens.get(index, self.unk_token)
-
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.convert_tokens_to_string
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        out_string = " ".join(tokens).replace(" ##", "").strip()
-        return out_string
-
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A BERT sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        index = 0
-        if os.path.isdir(save_directory):
-            vocab_file = os.path.join(
-                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-            )
-        else:
-            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
-        with open(vocab_file, "w", encoding="utf-8") as writer:
-            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
-                        " Please check that the vocabulary is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(token + "\n")
-                index += 1
-        return (vocab_file,)
-
-
-# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
-class BasicTokenizer:
-    """
-    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
-
-    Args:
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original BERT).
-        do_split_on_punc (`bool`, *optional*, defaults to `True`):
-            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
-            the full context of the words, such as contractions.
-    """
-
-    def __init__(
-        self,
-        do_lower_case=True,
-        never_split=None,
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        do_split_on_punc=True,
-    ):
-        if never_split is None:
-            never_split = []
-        self.do_lower_case = do_lower_case
-        self.never_split = set(never_split)
-        self.tokenize_chinese_chars = tokenize_chinese_chars
-        self.strip_accents = strip_accents
-        self.do_split_on_punc = do_split_on_punc
-
-    def tokenize(self, text, never_split=None):
-        """
-        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
-
-        Args:
-            never_split (`List[str]`, *optional*)
-                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
-        """
-        # union() returns a new set by concatenating the two sets.
-        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
-        text = self._clean_text(text)
-
-        # This was added on November 1st, 2018 for the multilingual and Chinese
-        # models. This is also applied to the English models now, but it doesn't
-        # matter since the English models were not trained on any Chinese data
-        # and generally don't have any Chinese data in them (there are Chinese
-        # characters in the vocabulary because Wikipedia does have some Chinese
-        # words in the English Wikipedia.).
-        if self.tokenize_chinese_chars:
-            text = self._tokenize_chinese_chars(text)
-        # prevents treating the same character with different unicode codepoints as different characters
-        unicode_normalized_text = unicodedata.normalize("NFC", text)
-        orig_tokens = whitespace_tokenize(unicode_normalized_text)
-        split_tokens = []
-        for token in orig_tokens:
-            if token not in never_split:
-                if self.do_lower_case:
-                    token = token.lower()
-                    if self.strip_accents is not False:
-                        token = self._run_strip_accents(token)
-                elif self.strip_accents:
-                    token = self._run_strip_accents(token)
-            split_tokens.extend(self._run_split_on_punc(token, never_split))
-
-        output_tokens = whitespace_tokenize(" ".join(split_tokens))
-        return output_tokens
-
-    def _run_strip_accents(self, text):
-        """Strips accents from a piece of text."""
-        text = unicodedata.normalize("NFD", text)
-        output = []
-        for char in text:
-            cat = unicodedata.category(char)
-            if cat == "Mn":
-                continue
-            output.append(char)
-        return "".join(output)
-
-    def _run_split_on_punc(self, text, never_split=None):
-        """Splits punctuation on a piece of text."""
-        if not self.do_split_on_punc or (never_split is not None and text in never_split):
-            return [text]
-        chars = list(text)
-        i = 0
-        start_new_word = True
-        output = []
-        while i < len(chars):
-            char = chars[i]
-            if _is_punctuation(char):
-                output.append([char])
-                start_new_word = True
-            else:
-                if start_new_word:
-                    output.append([])
-                start_new_word = False
-                output[-1].append(char)
-            i += 1
-
-        return ["".join(x) for x in output]
-
-    def _tokenize_chinese_chars(self, text):
-        """Adds whitespace around any CJK character."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if self._is_chinese_char(cp):
-                output.append(" ")
-                output.append(char)
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-    def _is_chinese_char(self, cp):
-        """Checks whether CP is the codepoint of a CJK character."""
-        # This defines a "chinese character" as anything in the CJK Unicode block:
-        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-        #
-        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-        # despite its name. The modern Korean Hangul alphabet is a different block,
-        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-        # space-separated words, so they are not treated specially and handled
-        # like the all of the other languages.
-        if (
-            (cp >= 0x4E00 and cp <= 0x9FFF)
-            or (cp >= 0x3400 and cp <= 0x4DBF)
-            or (cp >= 0x20000 and cp <= 0x2A6DF)
-            or (cp >= 0x2A700 and cp <= 0x2B73F)
-            or (cp >= 0x2B740 and cp <= 0x2B81F)
-            or (cp >= 0x2B820 and cp <= 0x2CEAF)
-            or (cp >= 0xF900 and cp <= 0xFAFF)
-            or (cp >= 0x2F800 and cp <= 0x2FA1F)
-        ):
-            return True
-
-        return False
-
-    def _clean_text(self, text):
-        """Performs invalid character removal and whitespace cleanup on text."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if cp == 0 or cp == 0xFFFD or _is_control(char):
-                continue
-            if _is_whitespace(char):
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-
-# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
-class WordpieceTokenizer:
-    """Runs WordPiece tokenization."""
-
-    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
-        self.vocab = vocab
-        self.unk_token = unk_token
-        self.max_input_chars_per_word = max_input_chars_per_word
-
-    def tokenize(self, text):
-        """
-        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
-        tokenization using the given vocabulary.
-
-        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
-
-        Args:
-            text: A single token or whitespace separated tokens. This should have
-                already been passed through *BasicTokenizer*.
-
-        Returns:
-            A list of wordpiece tokens.
-        """
-
-        output_tokens = []
-        for token in whitespace_tokenize(text):
-            chars = list(token)
-            if len(chars) > self.max_input_chars_per_word:
-                output_tokens.append(self.unk_token)
-                continue
-
-            is_bad = False
-            start = 0
-            sub_tokens = []
-            while start < len(chars):
-                end = len(chars)
-                cur_substr = None
-                while start < end:
-                    substr = "".join(chars[start:end])
-                    if start > 0:
-                        substr = "##" + substr
-                    if substr in self.vocab:
-                        cur_substr = substr
-                        break
-                    end -= 1
-                if cur_substr is None:
-                    is_bad = True
-                    break
-                sub_tokens.append(cur_substr)
-                start = end
-
-            if is_bad:
-                output_tokens.append(self.unk_token)
-            else:
-                output_tokens.extend(sub_tokens)
-        return output_tokens
 
+# DistilBertTokenizerFast is an alias for DistilBertTokenizer (since BertTokenizer is already a fast tokenizer)
+DistilBertTokenizerFast = DistilBertTokenizer
 
-__all__ = ["DistilBertTokenizer"]
+__all__ = ["DistilBertTokenizer", "DistilBertTokenizerFast"]
diff --git a/src/transformers/models/distilbert/tokenization_distilbert_fast.py b/src/transformers/models/distilbert/tokenization_distilbert_fast.py
deleted file mode 100644
index c174804dc530..000000000000
--- a/src/transformers/models/distilbert/tokenization_distilbert_fast.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for DistilBERT."""
-
-import json
-from typing import Optional
-
-from tokenizers import normalizers
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .tokenization_distilbert import DistilBertTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
-
-
-class DistilBertTokenizerFast(PreTrainedTokenizerFast):
-    r"""
-    Construct a "fast" DistilBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        clean_text (`bool`, *optional*, defaults to `True`):
-            Whether or not to clean the text before tokenization by removing any control characters and replacing all
-            whitespaces by the classic one.
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
-            issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original BERT).
-        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
-            The prefix for subwords.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = DistilBertTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        do_lower_case=True,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            do_lower_case=do_lower_case,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            tokenize_chinese_chars=tokenize_chinese_chars,
-            strip_accents=strip_accents,
-            **kwargs,
-        )
-
-        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
-        if (
-            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
-            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
-            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
-        ):
-            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
-            normalizer_state["lowercase"] = do_lower_case
-            normalizer_state["strip_accents"] = strip_accents
-            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
-            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
-
-        self.do_lower_case = do_lower_case
-
-    # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.build_inputs_with_special_tokens
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A BERT sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-
-        if token_ids_1 is not None:
-            output += token_ids_1 + [self.sep_token_id]
-
-        return output
-
-    # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-
-__all__ = ["DistilBertTokenizerFast"]
diff --git a/src/transformers/models/dpr/tokenization_dpr_fast.py b/src/transformers/models/dpr/tokenization_dpr_fast.py
index 5f501dbdd4f0..968a579267d2 100644
--- a/src/transformers/models/dpr/tokenization_dpr_fast.py
+++ b/src/transformers/models/dpr/tokenization_dpr_fast.py
@@ -19,7 +19,7 @@
 
 from ...tokenization_utils_base import BatchEncoding
 from ...utils import TensorType, add_end_docstrings, add_start_docstrings, logging
-from ..bert.tokenization_bert_fast import BertTokenizerFast
+from ..bert.tokenization_bert import BertTokenizer
 from .tokenization_dpr import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer, DPRReaderTokenizer
 
 
@@ -28,28 +28,28 @@
 VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
 
 
-class DPRContextEncoderTokenizerFast(BertTokenizerFast):
+class DPRContextEncoderTokenizerFast(BertTokenizer):
     r"""
     Construct a "fast" DPRContextEncoder tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    [`DPRContextEncoderTokenizerFast`] is identical to [`BertTokenizerFast`] and runs end-to-end tokenization:
+    [`DPRContextEncoderTokenizerFast`] is identical to [`BertTokenizer`] and runs end-to-end tokenization:
     punctuation splitting and wordpiece.
 
-    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning parameters.
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning parameters.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     slow_tokenizer_class = DPRContextEncoderTokenizer
 
 
-class DPRQuestionEncoderTokenizerFast(BertTokenizerFast):
+class DPRQuestionEncoderTokenizerFast(BertTokenizer):
     r"""
     Constructs a "fast" DPRQuestionEncoder tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    [`DPRQuestionEncoderTokenizerFast`] is identical to [`BertTokenizerFast`] and runs end-to-end tokenization:
+    [`DPRQuestionEncoderTokenizerFast`] is identical to [`BertTokenizer`] and runs end-to-end tokenization:
     punctuation splitting and wordpiece.
 
-    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning parameters.
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning parameters.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -300,15 +300,15 @@ def _get_best_spans(
 
 
 @add_end_docstrings(CUSTOM_DPR_READER_DOCSTRING)
-class DPRReaderTokenizerFast(CustomDPRReaderTokenizerMixin, BertTokenizerFast):
+class DPRReaderTokenizerFast(CustomDPRReaderTokenizerMixin, BertTokenizer):
     r"""
     Constructs a "fast" DPRReader tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    [`DPRReaderTokenizerFast`] is almost identical to [`BertTokenizerFast`] and runs end-to-end tokenization:
+    [`DPRReaderTokenizerFast`] is almost identical to [`BertTokenizer`] and runs end-to-end tokenization:
     punctuation splitting and wordpiece. The difference is that is has three inputs strings: question, titles and texts
     that are combined to be fed to the [`DPRReader`] model.
 
-    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning parameters.
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning parameters.
 
     """
 
diff --git a/src/transformers/models/electra/__init__.py b/src/transformers/models/electra/__init__.py
index 506212b561e1..200b5e4106a0 100644
--- a/src/transformers/models/electra/__init__.py
+++ b/src/transformers/models/electra/__init__.py
@@ -18,10 +18,9 @@
 
 
 if TYPE_CHECKING:
+    from ..bert.tokenization_bert import BertTokenizer as ElectraTokenizer
     from .configuration_electra import *
     from .modeling_electra import *
-    from .tokenization_electra import *
-    from .tokenization_electra_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/electra/tokenization_electra_fast.py b/src/transformers/models/electra/tokenization_electra_fast.py
deleted file mode 100644
index db0285581ed1..000000000000
--- a/src/transformers/models/electra/tokenization_electra_fast.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-from typing import Optional
-
-from tokenizers import normalizers
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from .tokenization_electra import ElectraTokenizer
-
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
-
-
-# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast with Bert->Electra , BERT->ELECTRA
-class ElectraTokenizerFast(PreTrainedTokenizerFast):
-    r"""
-    Construct a "fast" ELECTRA tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        clean_text (`bool`, *optional*, defaults to `True`):
-            Whether or not to clean the text before tokenization by removing any control characters and replacing all
-            whitespaces by the classic one.
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
-            issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original ELECTRA).
-        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
-            The prefix for subwords.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = ElectraTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        do_lower_case=True,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            do_lower_case=do_lower_case,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            tokenize_chinese_chars=tokenize_chinese_chars,
-            strip_accents=strip_accents,
-            **kwargs,
-        )
-
-        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
-        if (
-            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
-            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
-            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
-        ):
-            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
-            normalizer_state["lowercase"] = do_lower_case
-            normalizer_state["strip_accents"] = strip_accents
-            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
-            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
-
-        self.do_lower_case = do_lower_case
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A ELECTRA sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-
-        if token_ids_1 is not None:
-            output += token_ids_1 + [self.sep_token_id]
-
-        return output
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-
-__all__ = ["ElectraTokenizerFast"]
diff --git a/src/transformers/models/esm/tokenization_esm.py b/src/transformers/models/esm/tokenization_esm.py
index 7d9705f7dbd3..c926a0adec6a 100644
--- a/src/transformers/models/esm/tokenization_esm.py
+++ b/src/transformers/models/esm/tokenization_esm.py
@@ -17,7 +17,7 @@
 import os
 from typing import Optional
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_python import PreTrainedTokenizer
 from ...utils import logging
 
 
diff --git a/src/transformers/models/evolla/processing_evolla.py b/src/transformers/models/evolla/processing_evolla.py
index 807bd294c406..7ccb482597a1 100644
--- a/src/transformers/models/evolla/processing_evolla.py
+++ b/src/transformers/models/evolla/processing_evolla.py
@@ -65,7 +65,7 @@ def process_proteins(self, proteins, protein_max_length=1024):
             sa_sequence = "".join([s.upper() + f.lower() for s, f in zip(aa_seq, foldseek)])
             sa_sequences.append(sa_sequence)
 
-        sa_tokens = self.protein_tokenizer.batch_encode_plus(
+        sa_tokens = self.protein_tokenizer(
             sa_sequences, return_tensors="pt", truncation=True, max_length=protein_max_length, padding=True
         )
         return sa_tokens
diff --git a/src/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py
index 004a1c36f59c..9a3cb9b366f6 100644
--- a/src/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py
+++ b/src/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py
@@ -20,7 +20,7 @@
 
 import regex
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_python import PreTrainedTokenizer
 from ...utils import logging, requires_backends
 
 
diff --git a/src/transformers/models/flaubert/tokenization_flaubert.py b/src/transformers/models/flaubert/tokenization_flaubert.py
index dee653450eba..b5aba624a531 100644
--- a/src/transformers/models/flaubert/tokenization_flaubert.py
+++ b/src/transformers/models/flaubert/tokenization_flaubert.py
@@ -20,7 +20,7 @@
 import unicodedata
 from typing import Optional
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_python import PreTrainedTokenizer
 from ...utils import logging
 
 
diff --git a/src/transformers/models/fnet/__init__.py b/src/transformers/models/fnet/__init__.py
index 756d690e72c1..00785b5e5075 100644
--- a/src/transformers/models/fnet/__init__.py
+++ b/src/transformers/models/fnet/__init__.py
@@ -20,8 +20,7 @@
 if TYPE_CHECKING:
     from .configuration_fnet import *
     from .modeling_fnet import *
-    from .tokenization_fnet import *
-    from .tokenization_fnet_fast import *
+    from .tokenization_fnet import FNetTokenizer, FNetTokenizerFast
 else:
     import sys
 
diff --git a/src/transformers/models/fnet/tokenization_fnet.py b/src/transformers/models/fnet/tokenization_fnet.py
index 72aa202612e0..fa40ff69245b 100644
--- a/src/transformers/models/fnet/tokenization_fnet.py
+++ b/src/transformers/models/fnet/tokenization_fnet.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2021 Google Research, Google AI, Google Brain and the HuggingFace Inc. team.
+# Copyright 2024 The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,43 +14,29 @@
 # limitations under the License.
 """Tokenization classes for FNet model."""
 
-import os
-import unicodedata
-from shutil import copyfile
-from typing import Any, Optional
-
-import sentencepiece as spm
-
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
 from ...utils import logging
-from ...utils.import_utils import requires
+from ..albert.tokenization_albert import AlbertTokenizer
 
 
 logger = logging.get_logger(__name__)
-VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
-
 
-SPIECE_UNDERLINE = "▁"
 
-
-@requires(backends=("sentencepiece",))
-class FNetTokenizer(PreTrainedTokenizer):
+class FNetTokenizer(AlbertTokenizer):
     """
-    Construct an FNet tokenizer. Adapted from [`AlbertTokenizer`]. Based on
-    [SentencePiece](https://github.com/google/sentencepiece). This tokenizer inherits from [`PreTrainedTokenizer`]
-    which contains most of the main methods. Users should refer to this superclass for more information regarding those
-    methods.
+    Construct an FNet tokenizer. Based on [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
+
+    This tokenizer inherits from [`AlbertTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
-            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
-            contains the vocabulary necessary to instantiate a tokenizer.
-        do_lower_case (`bool`, *optional*, defaults to `False`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        remove_space (`bool`, *optional*, defaults to `True`):
-            Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
-        keep_accents (`bool`, *optional*, defaults to `True`):
+        keep_accents (`bool`, *optional*, defaults to `False`):
             Whether or not to keep accents when tokenizing.
+        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The end of sequence token.
         unk_token (`str`, *optional*, defaults to `""`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
@@ -66,249 +52,12 @@ class FNetTokenizer(PreTrainedTokenizer):
         mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        sp_model_kwargs (`dict`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
-            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
-            to set:
-
-            - `enable_sampling`: Enable subword regularization.
-            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
-
-              - `nbest_size = {0,1}`: No sampling is performed.
-              - `nbest_size > 1`: samples from the nbest_size results.
-              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
-                using forward-filtering-and-backward-sampling algorithm.
-            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
-              BPE-dropout.
-
-    Attributes:
-        sp_model (`SentencePieceProcessor`):
-            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
     """
 
-    vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "token_type_ids"]
 
-    def __init__(
-        self,
-        vocab_file,
-        do_lower_case=False,
-        remove_space=True,
-        keep_accents=True,
-        unk_token="",
-        sep_token="[SEP]",
-        pad_token="",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        sp_model_kwargs: Optional[dict[str, Any]] = None,
-        **kwargs,
-    ) -> None:
-        # Mask token behave like a normal word, i.e. include the space before it and
-        # is included in the raw text, there should be a match in a non-normalized sentence.
-        mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
-        cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
-        sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
-        mask_token = AddedToken(mask_token, special=True) if isinstance(mask_token, str) else mask_token
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-
-        self.do_lower_case = do_lower_case
-        self.remove_space = remove_space
-        self.keep_accents = keep_accents
-        self.vocab_file = vocab_file
-
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(vocab_file)
-
-        super().__init__(
-            do_lower_case=do_lower_case,
-            remove_space=remove_space,
-            keep_accents=keep_accents,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            sp_model_kwargs=self.sp_model_kwargs,
-            **kwargs,
-        )
-
-    @property
-    def vocab_size(self):
-        return len(self.sp_model)
-
-    def get_vocab(self):
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__ = d
-
-        # for backward compatibility
-        if not hasattr(self, "sp_model_kwargs"):
-            self.sp_model_kwargs = {}
-
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(self.vocab_file)
-
-    def preprocess_text(self, inputs):
-        if self.remove_space:
-            outputs = " ".join(inputs.strip().split())
-        else:
-            outputs = inputs
-        outputs = outputs.replace("``", '"').replace("''", '"')
-
-        if not self.keep_accents:
-            outputs = unicodedata.normalize("NFKD", outputs)
-            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
-        if self.do_lower_case:
-            outputs = outputs.lower()
-
-        return outputs
-
-    def _tokenize(self, text: str) -> list[str]:
-        """Tokenize a string."""
-        text = self.preprocess_text(text)
-        pieces = self.sp_model.encode(text, out_type=str)
-        new_pieces = []
-        for piece in pieces:
-            if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit():
-                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
-                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
-                    if len(cur_pieces[0]) == 1:
-                        cur_pieces = cur_pieces[1:]
-                    else:
-                        cur_pieces[0] = cur_pieces[0][1:]
-                cur_pieces.append(piece[-1])
-                new_pieces.extend(cur_pieces)
-            else:
-                new_pieces.append(piece)
-
-        return new_pieces
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.sp_model.PieceToId(token)
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.sp_model.IdToPiece(index)
-
-    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        current_sub_tokens = []
-        out_string = ""
-        prev_is_special = False
-        for token in tokens:
-            # make sure that special tokens are not decoded using sentencepiece model
-            if token in self.all_special_tokens:
-                if not prev_is_special:
-                    out_string += " "
-                out_string += self.sp_model.decode(current_sub_tokens) + token
-                prev_is_special = True
-                current_sub_tokens = []
-            else:
-                current_sub_tokens.append(token)
-                prev_is_special = False
-        out_string += self.sp_model.decode(current_sub_tokens)
-        return out_string.strip()
-
-    def _decode(
-        self,
-        token_ids: list[int],
-        skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: Optional[bool] = None,
-        spaces_between_special_tokens: bool = False,
-        **kwargs,
-    ) -> str:
-        text = super()._decode(
-            token_ids=token_ids,
-            skip_special_tokens=skip_special_tokens,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            spaces_between_special_tokens=spaces_between_special_tokens,
-            **kwargs,
-        )
-        # Mimic the behavior of the Rust tokenizer:
-        # No space after 
-        if not spaces_between_special_tokens:
-            text = text.replace(" ", "")
-        return text
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An FNet sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return cls + token_ids_0 + sep
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (out_vocab_file,)
 
+# FNetTokenizerFast is an alias for FNetTokenizer (since AlbertTokenizer is already a fast tokenizer)
+FNetTokenizerFast = FNetTokenizer
 
-__all__ = ["FNetTokenizer"]
+__all__ = ["FNetTokenizer", "FNetTokenizerFast"]
diff --git a/src/transformers/models/fnet/tokenization_fnet_fast.py b/src/transformers/models/fnet/tokenization_fnet_fast.py
deleted file mode 100644
index 4aab7997650f..000000000000
--- a/src/transformers/models/fnet/tokenization_fnet_fast.py
+++ /dev/null
@@ -1,155 +0,0 @@
-# coding=utf-8
-# Copyright 2021 Google AI, Google Brain and the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for FNet model."""
-
-import os
-from shutil import copyfile
-from typing import Optional
-
-from ...tokenization_utils import AddedToken
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import is_sentencepiece_available, logging
-
-
-if is_sentencepiece_available():
-    from .tokenization_fnet import FNetTokenizer
-else:
-    FNetTokenizer = None
-
-logger = logging.get_logger(__name__)
-VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
-
-
-SPIECE_UNDERLINE = "▁"
-
-
-class FNetTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" FNetTokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
-    [`AlbertTokenizerFast`]. Based on
-    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This
-    tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods
-
-    Args:
-        vocab_file (`str`):
-            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
-            contains the vocabulary necessary to instantiate a tokenizer.
-        do_lower_case (`bool`, *optional*, defaults to `False`):
-            Whether or not to lowercase the input when tokenizing.
-        remove_space (`bool`, *optional*, defaults to `True`):
-            Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
-        keep_accents (`bool`, *optional*, defaults to `True`):
-            Whether or not to keep accents when tokenizing.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "token_type_ids"]
-    slow_tokenizer_class = FNetTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        do_lower_case=False,
-        remove_space=True,
-        keep_accents=True,
-        unk_token="",
-        sep_token="[SEP]",
-        pad_token="",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        **kwargs,
-    ):
-        # Mask token behave like a normal word, i.e. include the space before it and
-        # is included in the raw text, there should be a match in a non-normalized sentence.
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
-        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
-
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            do_lower_case=do_lower_case,
-            remove_space=remove_space,
-            keep_accents=keep_accents,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            **kwargs,
-        )
-
-        self.do_lower_case = do_lower_case
-        self.remove_space = remove_space
-        self.keep_accents = keep_accents
-        self.vocab_file = vocab_file
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An FNet sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return cls + token_ids_0 + sep
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
-
-
-__all__ = ["FNetTokenizerFast"]
diff --git a/src/transformers/models/fsmt/tokenization_fsmt.py b/src/transformers/models/fsmt/tokenization_fsmt.py
index 5a4446d8e90b..1a4215cf2ad4 100644
--- a/src/transformers/models/fsmt/tokenization_fsmt.py
+++ b/src/transformers/models/fsmt/tokenization_fsmt.py
@@ -20,7 +20,7 @@
 import unicodedata
 from typing import Optional
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_python import PreTrainedTokenizer
 from ...utils import logging
 
 
diff --git a/src/transformers/models/funnel/__init__.py b/src/transformers/models/funnel/__init__.py
index 1a75ee7e2e61..e55e71fe491e 100644
--- a/src/transformers/models/funnel/__init__.py
+++ b/src/transformers/models/funnel/__init__.py
@@ -22,7 +22,6 @@
     from .convert_funnel_original_tf_checkpoint_to_pytorch import *
     from .modeling_funnel import *
     from .tokenization_funnel import *
-    from .tokenization_funnel_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/funnel/tokenization_funnel.py b/src/transformers/models/funnel/tokenization_funnel.py
index e5d44e5e5906..7d6e36dfa0da 100644
--- a/src/transformers/models/funnel/tokenization_funnel.py
+++ b/src/transformers/models/funnel/tokenization_funnel.py
@@ -14,12 +14,12 @@
 # limitations under the License.
 """Tokenization class for Funnel Transformer."""
 
-import collections
-import os
-import unicodedata
 from typing import Optional
 
-from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
+from tokenizers.models import WordPiece
+
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
 
 
@@ -41,45 +41,18 @@
 ]
 
 
-# Copied from transformers.models.bert.tokenization_bert.load_vocab
-def load_vocab(vocab_file):
-    """Loads a vocabulary file into a dictionary."""
-    vocab = collections.OrderedDict()
-    with open(vocab_file, "r", encoding="utf-8") as reader:
-        tokens = reader.readlines()
-    for index, token in enumerate(tokens):
-        token = token.rstrip("\n")
-        vocab[token] = index
-    return vocab
-
-
-# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
-def whitespace_tokenize(text):
-    """Runs basic whitespace cleaning and splitting on a piece of text."""
-    text = text.strip()
-    if not text:
-        return []
-    tokens = text.split()
-    return tokens
-
-
-class FunnelTokenizer(PreTrainedTokenizer):
+class FunnelTokenizer(TokenizersBackend):
     r"""
-    Construct a Funnel Transformer tokenizer. Based on WordPiece.
+    Construct a Funnel Transformer tokenizer (backed by HuggingFace's tokenizers library). Based on WordPiece.
 
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     Args:
         vocab_file (`str`):
             File containing the vocabulary.
         do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
-            Whether or not to do basic tokenization before WordPiece.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
         unk_token (`str`, *optional*, defaults to `""`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
@@ -95,65 +68,94 @@ class FunnelTokenizer(PreTrainedTokenizer):
         mask_token (`str`, *optional*, defaults to `""`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        bos_token (`str`, *optional*, defaults to `""`):
+        clean_text (`bool`, *optional*, defaults to `True`):
+            Whether or not to clean the text before tokenization by removing any control characters and replacing all
+            whitespaces by the classic one.
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+            issue](https://github.com/huggingface/transformers/issues/328)).
+        bos_token (`str`, `optional`, defaults to `""`):
             The beginning of sentence token.
-        eos_token (`str`, *optional*, defaults to `""`):
+        eos_token (`str`, `optional`, defaults to `""`):
             The end of sentence token.
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
         strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
             value for `lowercase` (as in the original BERT).
-        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
-            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
-            extra spaces.
+        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
+            The prefix for subwords.
+        vocab (`dict`, *optional*):
+            Custom vocabulary dictionary.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = None
     cls_token_type_id: int = 2
 
     def __init__(
         self,
-        vocab_file,
-        do_lower_case=True,
-        do_basic_tokenize=True,
-        never_split=None,
-        unk_token="",
-        sep_token="",
-        pad_token="",
-        cls_token="",
-        mask_token="",
-        bos_token="",
-        eos_token="",
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        clean_up_tokenization_spaces=True,
+        do_lower_case: bool = True,
+        unk_token: str = "",
+        sep_token: str = "",
+        pad_token: str = "",
+        cls_token: str = "",
+        mask_token: str = "",
+        bos_token: str = "",
+        eos_token: str = "",
+        clean_text: bool = True,
+        tokenize_chinese_chars: bool = True,
+        strip_accents: Optional[bool] = None,
+        wordpieces_prefix: str = "##",
+        vocab: Optional[dict] = None,
+        vocab_file: Optional[str] = None,
         **kwargs,
     ):
-        if not os.path.isfile(vocab_file):
-            raise ValueError(
-                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
-                " model use `tokenizer = FunnelTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
-            )
-        self.vocab = load_vocab(vocab_file)
-        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
-        self.do_basic_tokenize = do_basic_tokenize
-        if do_basic_tokenize:
-            self.basic_tokenizer = BasicTokenizer(
-                do_lower_case=do_lower_case,
-                never_split=never_split,
-                tokenize_chinese_chars=tokenize_chinese_chars,
-                strip_accents=strip_accents,
+        self.vocab_file = vocab_file
+        self.do_lower_case = do_lower_case
+        self.tokenize_chinese_chars = tokenize_chinese_chars
+        self.strip_accents = strip_accents
+        self.clean_text = clean_text
+        self.wordpieces_prefix = wordpieces_prefix
+
+        if vocab is not None:
+            self._vocab = (
+                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
             )
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+        else:
+            self._vocab = {
+                str(pad_token): 0,
+                str(unk_token): 1,
+                str(cls_token): 2,
+                str(sep_token): 3,
+                str(mask_token): 4,
+                str(bos_token): 5,
+                str(eos_token): 6,
+            }
+
+        self._tokenizer = Tokenizer(WordPiece(self._vocab, unk_token=str(unk_token)))
+
+        self._tokenizer.normalizer = normalizers.BertNormalizer(
+            clean_text=clean_text,
+            handle_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            lowercase=do_lower_case,
+        )
+        self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
+        self._tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"{cls_token}:2 $A:0 {sep_token}:0",  # token_type_id is 2 for Funnel transformer
+            pair=f"{cls_token}:2 $A:0 {sep_token}:0 $B:1 {sep_token}:1",
+            special_tokens=[
+                (str(cls_token), self._vocab.get(str(cls_token), 2)),
+                (str(sep_token), self._vocab.get(str(sep_token), 3)),
+            ],
+        )
+
+        tokenizer_object = self._tokenizer
 
         super().__init__(
+            tokenizer_object=tokenizer_object,
             do_lower_case=do_lower_case,
-            do_basic_tokenize=do_basic_tokenize,
-            never_split=never_split,
             unk_token=unk_token,
             sep_token=sep_token,
             pad_token=pad_token,
@@ -161,382 +163,12 @@ def __init__(
             mask_token=mask_token,
             bos_token=bos_token,
             eos_token=eos_token,
+            clean_text=clean_text,
             tokenize_chinese_chars=tokenize_chinese_chars,
             strip_accents=strip_accents,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            wordpieces_prefix=wordpieces_prefix,
             **kwargs,
         )
 
-    @property
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.do_lower_case
-    def do_lower_case(self):
-        return self.basic_tokenizer.do_lower_case
-
-    @property
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.vocab_size
-    def vocab_size(self):
-        return len(self.vocab)
-
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_vocab
-    def get_vocab(self):
-        return dict(self.vocab, **self.added_tokens_encoder)
-
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._tokenize
-    def _tokenize(self, text, split_special_tokens=False):
-        split_tokens = []
-        if self.do_basic_tokenize:
-            for token in self.basic_tokenizer.tokenize(
-                text, never_split=self.all_special_tokens if not split_special_tokens else None
-            ):
-                # If the token is part of the never_split set
-                if token in self.basic_tokenizer.never_split:
-                    split_tokens.append(token)
-                else:
-                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
-        else:
-            split_tokens = self.wordpiece_tokenizer.tokenize(text)
-        return split_tokens
-
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_token_to_id
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.vocab.get(token, self.vocab.get(self.unk_token))
-
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_id_to_token
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.ids_to_tokens.get(index, self.unk_token)
-
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.convert_tokens_to_string
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        out_string = " ".join(tokens).replace(" ##", "").strip()
-        return out_string
-
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A BERT sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Funnel
-        Transformer sequence pair mask has the following format:
-
-        ```
-        2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0]
-        return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        index = 0
-        if os.path.isdir(save_directory):
-            vocab_file = os.path.join(
-                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-            )
-        else:
-            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
-        with open(vocab_file, "w", encoding="utf-8") as writer:
-            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
-                        " Please check that the vocabulary is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(token + "\n")
-                index += 1
-        return (vocab_file,)
-
-
-# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
-class BasicTokenizer:
-    """
-    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
-
-    Args:
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original BERT).
-        do_split_on_punc (`bool`, *optional*, defaults to `True`):
-            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
-            the full context of the words, such as contractions.
-    """
-
-    def __init__(
-        self,
-        do_lower_case=True,
-        never_split=None,
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        do_split_on_punc=True,
-    ):
-        if never_split is None:
-            never_split = []
-        self.do_lower_case = do_lower_case
-        self.never_split = set(never_split)
-        self.tokenize_chinese_chars = tokenize_chinese_chars
-        self.strip_accents = strip_accents
-        self.do_split_on_punc = do_split_on_punc
-
-    def tokenize(self, text, never_split=None):
-        """
-        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
-
-        Args:
-            never_split (`List[str]`, *optional*)
-                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
-        """
-        # union() returns a new set by concatenating the two sets.
-        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
-        text = self._clean_text(text)
-
-        # This was added on November 1st, 2018 for the multilingual and Chinese
-        # models. This is also applied to the English models now, but it doesn't
-        # matter since the English models were not trained on any Chinese data
-        # and generally don't have any Chinese data in them (there are Chinese
-        # characters in the vocabulary because Wikipedia does have some Chinese
-        # words in the English Wikipedia.).
-        if self.tokenize_chinese_chars:
-            text = self._tokenize_chinese_chars(text)
-        # prevents treating the same character with different unicode codepoints as different characters
-        unicode_normalized_text = unicodedata.normalize("NFC", text)
-        orig_tokens = whitespace_tokenize(unicode_normalized_text)
-        split_tokens = []
-        for token in orig_tokens:
-            if token not in never_split:
-                if self.do_lower_case:
-                    token = token.lower()
-                    if self.strip_accents is not False:
-                        token = self._run_strip_accents(token)
-                elif self.strip_accents:
-                    token = self._run_strip_accents(token)
-            split_tokens.extend(self._run_split_on_punc(token, never_split))
-
-        output_tokens = whitespace_tokenize(" ".join(split_tokens))
-        return output_tokens
-
-    def _run_strip_accents(self, text):
-        """Strips accents from a piece of text."""
-        text = unicodedata.normalize("NFD", text)
-        output = []
-        for char in text:
-            cat = unicodedata.category(char)
-            if cat == "Mn":
-                continue
-            output.append(char)
-        return "".join(output)
-
-    def _run_split_on_punc(self, text, never_split=None):
-        """Splits punctuation on a piece of text."""
-        if not self.do_split_on_punc or (never_split is not None and text in never_split):
-            return [text]
-        chars = list(text)
-        i = 0
-        start_new_word = True
-        output = []
-        while i < len(chars):
-            char = chars[i]
-            if _is_punctuation(char):
-                output.append([char])
-                start_new_word = True
-            else:
-                if start_new_word:
-                    output.append([])
-                start_new_word = False
-                output[-1].append(char)
-            i += 1
-
-        return ["".join(x) for x in output]
-
-    def _tokenize_chinese_chars(self, text):
-        """Adds whitespace around any CJK character."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if self._is_chinese_char(cp):
-                output.append(" ")
-                output.append(char)
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-    def _is_chinese_char(self, cp):
-        """Checks whether CP is the codepoint of a CJK character."""
-        # This defines a "chinese character" as anything in the CJK Unicode block:
-        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-        #
-        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-        # despite its name. The modern Korean Hangul alphabet is a different block,
-        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-        # space-separated words, so they are not treated specially and handled
-        # like the all of the other languages.
-        if (
-            (cp >= 0x4E00 and cp <= 0x9FFF)
-            or (cp >= 0x3400 and cp <= 0x4DBF)
-            or (cp >= 0x20000 and cp <= 0x2A6DF)
-            or (cp >= 0x2A700 and cp <= 0x2B73F)
-            or (cp >= 0x2B740 and cp <= 0x2B81F)
-            or (cp >= 0x2B820 and cp <= 0x2CEAF)
-            or (cp >= 0xF900 and cp <= 0xFAFF)
-            or (cp >= 0x2F800 and cp <= 0x2FA1F)
-        ):
-            return True
-
-        return False
-
-    def _clean_text(self, text):
-        """Performs invalid character removal and whitespace cleanup on text."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if cp == 0 or cp == 0xFFFD or _is_control(char):
-                continue
-            if _is_whitespace(char):
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-
-# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
-class WordpieceTokenizer:
-    """Runs WordPiece tokenization."""
-
-    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
-        self.vocab = vocab
-        self.unk_token = unk_token
-        self.max_input_chars_per_word = max_input_chars_per_word
-
-    def tokenize(self, text):
-        """
-        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
-        tokenization using the given vocabulary.
-
-        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
-
-        Args:
-            text: A single token or whitespace separated tokens. This should have
-                already been passed through *BasicTokenizer*.
-
-        Returns:
-            A list of wordpiece tokens.
-        """
-
-        output_tokens = []
-        for token in whitespace_tokenize(text):
-            chars = list(token)
-            if len(chars) > self.max_input_chars_per_word:
-                output_tokens.append(self.unk_token)
-                continue
-
-            is_bad = False
-            start = 0
-            sub_tokens = []
-            while start < len(chars):
-                end = len(chars)
-                cur_substr = None
-                while start < end:
-                    substr = "".join(chars[start:end])
-                    if start > 0:
-                        substr = "##" + substr
-                    if substr in self.vocab:
-                        cur_substr = substr
-                        break
-                    end -= 1
-                if cur_substr is None:
-                    is_bad = True
-                    break
-                sub_tokens.append(cur_substr)
-                start = end
-
-            if is_bad:
-                output_tokens.append(self.unk_token)
-            else:
-                output_tokens.extend(sub_tokens)
-        return output_tokens
-
 
 __all__ = ["FunnelTokenizer"]
diff --git a/src/transformers/models/funnel/tokenization_funnel_fast.py b/src/transformers/models/funnel/tokenization_funnel_fast.py
deleted file mode 100644
index eeeb6f7bf6cb..000000000000
--- a/src/transformers/models/funnel/tokenization_funnel_fast.py
+++ /dev/null
@@ -1,203 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization class for Funnel Transformer."""
-
-import json
-from typing import Optional
-
-from tokenizers import normalizers
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .tokenization_funnel import FunnelTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
-
-_model_names = [
-    "small",
-    "small-base",
-    "medium",
-    "medium-base",
-    "intermediate",
-    "intermediate-base",
-    "large",
-    "large-base",
-    "xlarge",
-    "xlarge-base",
-]
-
-
-class FunnelTokenizerFast(PreTrainedTokenizerFast):
-    r"""
-    Construct a "fast" Funnel Transformer tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `""`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        clean_text (`bool`, *optional*, defaults to `True`):
-            Whether or not to clean the text before tokenization by removing any control characters and replacing all
-            whitespaces by the classic one.
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
-            issue](https://github.com/huggingface/transformers/issues/328)).
-        bos_token (`str`, `optional`, defaults to `""`):
-            The beginning of sentence token.
-        eos_token (`str`, `optional`, defaults to `""`):
-            The end of sentence token.
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original BERT).
-        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
-            The prefix for subwords.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = FunnelTokenizer
-    cls_token_type_id: int = 2
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        do_lower_case=True,
-        unk_token="",
-        sep_token="",
-        pad_token="",
-        cls_token="",
-        mask_token="",
-        bos_token="",
-        eos_token="",
-        clean_text=True,
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        wordpieces_prefix="##",
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            do_lower_case=do_lower_case,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            clean_text=clean_text,
-            tokenize_chinese_chars=tokenize_chinese_chars,
-            strip_accents=strip_accents,
-            wordpieces_prefix=wordpieces_prefix,
-            **kwargs,
-        )
-
-        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
-        if (
-            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
-            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
-            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
-        ):
-            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
-            normalizer_state["lowercase"] = do_lower_case
-            normalizer_state["strip_accents"] = strip_accents
-            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
-            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
-
-        self.do_lower_case = do_lower_case
-
-    # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.build_inputs_with_special_tokens with BERT->Funnel
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A Funnel sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-
-        if token_ids_1 is not None:
-            output += token_ids_1 + [self.sep_token_id]
-
-        return output
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Funnel
-        Transformer sequence pair mask has the following format:
-
-        ```
-        2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0]
-        return len(cls) * [self.cls_token_type_id] + len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
-    # Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-
-__all__ = ["FunnelTokenizerFast"]
diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py
index a6b6b7684f8a..cc1ae2991a68 100644
--- a/src/transformers/models/gemma/modular_gemma.py
+++ b/src/transformers/models/gemma/modular_gemma.py
@@ -13,9 +13,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import TYPE_CHECKING, Any, Optional
+from typing import TYPE_CHECKING, Optional
 
-import sentencepiece as spm
 import torch
 from torch import nn
 
@@ -27,7 +26,6 @@
 from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params
 from ...modeling_utils import PreTrainedModel
 from ...processing_utils import Unpack
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
 from ...utils import TransformersKwargs, logging
 from ..llama.modeling_llama import (
     LlamaAttention,
@@ -39,11 +37,10 @@
     LlamaPreTrainedModel,
     LlamaRotaryEmbedding,
 )
-from ..llama.tokenization_llama import LlamaTokenizer
 
 
 if TYPE_CHECKING:
-    from ...tokenization_utils_base import TextInput
+    pass
 
 VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
 
@@ -198,162 +195,6 @@ def __init__(
         )
 
 
-class GemmaTokenizer(LlamaTokenizer, PreTrainedTokenizer):
-    """
-    Construct a Gemma tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
-    no padding token in the original model.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`):
-            The end of sequence token.
-        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`):
-            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
-            attention mechanisms or loss computation.
-        sp_model_kwargs (`dict[str, Any]`, `Optional`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
-            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
-            to set:
-
-            - `enable_sampling`: Enable subword regularization.
-            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
-
-              - `nbest_size = {0,1}`: No sampling is performed.
-              - `nbest_size > 1`: samples from the nbest_size results.
-              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
-                using forward-filtering-and-backward-sampling algorithm.
-
-            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
-              BPE-dropout.
-
-        add_bos_token (`bool`, *optional*, defaults to `True`):
-            Whether or not to add an `bos_token` at the start of sequences.
-        add_eos_token (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an `eos_token` at the end of sequences.
-        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
-            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
-            extra spaces.
-        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
-            Whether or not the default system prompt for Gemma should be used.
-        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
-            Whether or not to add spaces between special tokens.
-    """
-
-    def __init__(
-        self,
-        vocab_file,
-        unk_token="",
-        bos_token="",
-        eos_token="",
-        pad_token="",
-        sp_model_kwargs: Optional[dict[str, Any]] = None,
-        add_bos_token=True,
-        add_eos_token=False,
-        clean_up_tokenization_spaces=False,
-        use_default_system_prompt=False,
-        spaces_between_special_tokens=False,
-        **kwargs,
-    ):
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
-        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token
-
-        self.vocab_file = vocab_file
-        self.add_bos_token = add_bos_token
-        self.add_eos_token = add_eos_token
-        self.use_default_system_prompt = use_default_system_prompt
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(vocab_file)
-
-        PreTrainedTokenizer.__init__(
-            self,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            add_bos_token=add_bos_token,
-            add_eos_token=add_eos_token,
-            sp_model_kwargs=sp_model_kwargs,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            use_default_system_prompt=use_default_system_prompt,
-            spaces_between_special_tokens=spaces_between_special_tokens,
-            **kwargs,
-        )
-
-    def get_spm_processor(self):
-        raise AttributeError("Not needed for Gemma")
-
-    def unk_token_length(self):
-        raise AttributeError("Not needed for Gemma")
-
-    def tokenize(self, text: "TextInput", **kwargs) -> list[str]:
-        """
-        Args:
-            text: TextInput
-        Simply calls PreTrainedTokenizer's method
-        """
-        return PreTrainedTokenizer.tokenize(self, text, **kwargs)
-
-    def _tokenize(self, text, **kwargs):
-        """
-        Args:
-            text: TextInput
-        Returns a tokenized string. The Gemma tokenizer never adds a prefix space.
-        """
-        return self.sp_model.encode(text, out_type=str)
-
-    def _decode(
-        self,
-        token_ids: list[int],
-        skip_special_tokens: bool = False,
-        spaces_between_special_tokens: bool = False,
-        **kwargs,
-    ) -> str:
-        sub_texts = []
-        current_sub_text = []
-        for ids in token_ids:
-            if skip_special_tokens and ids in self.all_special_ids:
-                continue
-            if ids in self._added_tokens_decoder:
-                if current_sub_text:
-                    sub_texts.append(self.sp_model.decode(current_sub_text))
-                sub_texts.append(self._added_tokens_decoder[ids].content)
-                current_sub_text = []
-            else:
-                current_sub_text.append(ids)
-        if current_sub_text:
-            sub_texts.append(self.sp_model.decode(current_sub_text))
-
-        if spaces_between_special_tokens:
-            sub_texts = " ".join(sub_texts)
-        else:
-            sub_texts = "".join(sub_texts)
-
-        return sub_texts.replace(SPIECE_UNDERLINE, " ")
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        current_sub_tokens = []
-        out_string = ""
-        for token in tokens:
-            # make sure that special tokens are not decoded using sentencepiece model
-            if token in self._added_tokens_encoder:
-                out_string += self.sp_model.decode(current_sub_tokens) + token
-                current_sub_tokens = []
-            else:
-                current_sub_tokens.append(token)
-        out_string += self.sp_model.decode(current_sub_tokens)
-        return out_string
-
-
 class GemmaRMSNorm(nn.Module):
     def __init__(self, dim: int, eps: float = 1e-6):
         super().__init__()
@@ -504,7 +345,6 @@ class GemmaForTokenClassification(LlamaForTokenClassification):
 
 __all__ = [
     "GemmaConfig",
-    "GemmaTokenizer",
     "GemmaModel",
     "GemmaForCausalLM",
     "GemmaForSequenceClassification",
diff --git a/src/transformers/models/gemma/tokenization_gemma.py b/src/transformers/models/gemma/tokenization_gemma.py
index 3320968c2915..aa2388e9a9c0 100644
--- a/src/transformers/models/gemma/tokenization_gemma.py
+++ b/src/transformers/models/gemma/tokenization_gemma.py
@@ -1,12 +1,5 @@
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-#           This file was automatically generated from src/transformers/models/gemma/modular_gemma.py.
-#               Do NOT edit this file manually as any edits will be overwritten by the generation of
-#             the file from the modular. If any change should be done, please apply the change to the
-#                          modular_gemma.py file directly. One of our CI enforces this.
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 # coding=utf-8
-# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
-#
+# Copyright 2024 The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,317 +12,119 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
-from shutil import copyfile
-from typing import TYPE_CHECKING, Any, Optional
+from typing import Optional
 
-import sentencepiece as spm
+from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers
+from tokenizers.models import BPE
 
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_utils_base import generate_merges
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
-from ...utils.import_utils import requires
-
 
-if TYPE_CHECKING:
-    from ...tokenization_utils_base import TextInput
 
 logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
-
-SPIECE_UNDERLINE = "▁"
+VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}
 
 
-@requires(backends=("sentencepiece",))
-class GemmaTokenizer(PreTrainedTokenizer):
+class GemmaTokenizer(TokenizersBackend):
     """
-    Construct a Gemma tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
-    no padding token in the original model.
+    Construct a fast Gemma tokenizer (backed by HuggingFace's tokenizers library).
+
+    This tokenizer uses a Unigram model with ByteFallback, no prefix space, and a normalizer that replaces
+    spaces with "▁".
 
     Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`):
+        tokenizer_file (`str`, optional):
+            A tokenizers JSON file containing the serialization of a tokenizer.
+        unk_token (`str`, optional, defaults to ""):
+            The unknown token.
+        bos_token (`str`, optional, defaults to ""):
+            The beginning of sequence token.
+        eos_token (`str`, optional, defaults to ""):
             The end of sequence token.
-        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`):
-            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
-            attention mechanisms or loss computation.
-        sp_model_kwargs (`dict[str, Any]`, `Optional`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
-            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
-            to set:
-
-            - `enable_sampling`: Enable subword regularization.
-            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
-
-              - `nbest_size = {0,1}`: No sampling is performed.
-              - `nbest_size > 1`: samples from the nbest_size results.
-              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
-                using forward-filtering-and-backward-sampling algorithm.
-
-            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
-              BPE-dropout.
-
-        add_bos_token (`bool`, *optional*, defaults to `True`):
-            Whether or not to add an `bos_token` at the start of sequences.
-        add_eos_token (`bool`, *optional*, defaults to `False`):
+        pad_token (`str`, optional, defaults to ""):
+            The padding token.
+        mask_token (`str`, optional, defaults to ""):
+            The mask token.
+        add_bos_token (`bool`, optional, defaults to True):
+            Whether or not to add a `bos_token` at the start of sequences.
+        add_eos_token (`bool`, optional, defaults to False):
             Whether or not to add an `eos_token` at the end of sequences.
-        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
-            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
-            extra spaces.
-        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
-            Whether or not the default system prompt for Gemma should be used.
-        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
-            Whether or not to add spaces between special tokens.
+        vocab (`dict`, optional):
+            Custom vocabulary dict. If not provided, a minimal vocabulary is created using the special tokens.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = None
+    padding_side = "left"
     model_input_names = ["input_ids", "attention_mask"]
 
     def __init__(
         self,
-        vocab_file,
-        unk_token="",
-        bos_token="",
-        eos_token="",
-        pad_token="",
-        sp_model_kwargs: Optional[dict[str, Any]] = None,
-        add_bos_token=True,
-        add_eos_token=False,
-        clean_up_tokenization_spaces=False,
-        use_default_system_prompt=False,
-        spaces_between_special_tokens=False,
+        unk_token: str = "",
+        bos_token: str = "",
+        eos_token: str = "",
+        pad_token: str = "",
+        mask_token: str = "",
+        add_bos_token: bool = True,
+        add_eos_token: bool = False,
+        vocab: Optional[dict] = None,
+        merges: Optional[list[tuple[str, str]]] = None,
         **kwargs,
     ):
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
-        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token
+        self._add_bos_token = add_bos_token
+        self._add_eos_token = add_eos_token
 
-        self.vocab_file = vocab_file
-        self.add_bos_token = add_bos_token
-        self.add_eos_token = add_eos_token
-        self.use_default_system_prompt = use_default_system_prompt
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(vocab_file)
+        special_tokens = {str(pad_token), str(eos_token), str(bos_token), str(unk_token)}
+
+        if vocab is not None:
+            self._vocab = (
+                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
+            )
+        else:
+            self._vocab = {
+                str(pad_token): 0,
+                str(eos_token): 1,
+                str(bos_token): 2,
+                str(unk_token): 3,
+                str(mask_token): 4,
+            }
+
+        filtered_vocab = {t: i for t, i in self._vocab.items() if t not in special_tokens}
+        self._merges = merges if merges is not None else generate_merges(filtered_vocab)
+        self._tokenizer = Tokenizer(
+            BPE(
+                vocab=self._vocab,
+                merges=self._merges,
+                fuse_unk=True,
+                unk_token=str(unk_token),
+                dropout=None,
+                byte_fallback=True,
+            )
+        )
+
+        self._tokenizer.decoder = decoders.Sequence(
+            [decoders.Replace("▁", " "), decoders.ByteFallback(), decoders.Fuse()]
+        )
+        self._tokenizer.normalizer = normalizers.Replace(" ", "▁")
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Split(" ", "merged_with_previous")
+        tokenizer_object = self._tokenizer
 
         super().__init__(
+            tokenizer_object=tokenizer_object,
+            unk_token=unk_token,
             bos_token=bos_token,
             eos_token=eos_token,
-            unk_token=unk_token,
             pad_token=pad_token,
+            mask_token=mask_token,
             add_bos_token=add_bos_token,
             add_eos_token=add_eos_token,
-            sp_model_kwargs=sp_model_kwargs,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            use_default_system_prompt=use_default_system_prompt,
-            spaces_between_special_tokens=spaces_between_special_tokens,
             **kwargs,
         )
 
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__.update(d)
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
-
-    @property
-    def vocab_size(self):
-        """Returns vocab size"""
-        return self.sp_model.get_piece_size()
-
-    def get_vocab(self):
-        """Returns vocab as a dict"""
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def tokenize(self, text: "TextInput", **kwargs) -> list[str]:
-        """
-        Args:
-            text: TextInput
-        Simply calls PreTrainedTokenizer's method
-        """
-        return super().tokenize(text, **kwargs)
-
-    def _tokenize(self, text, **kwargs):
-        """
-        Args:
-            text: TextInput
-        Returns a tokenized string. The Gemma tokenizer never adds a prefix space.
-        """
-        return self.sp_model.encode(text, out_type=str)
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.sp_model.piece_to_id(token)
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
-        return token
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        current_sub_tokens = []
-        out_string = ""
-        for token in tokens:
-            # make sure that special tokens are not decoded using sentencepiece model
-            if token in self._added_tokens_encoder:
-                out_string += self.sp_model.decode(current_sub_tokens) + token
-                current_sub_tokens = []
-            else:
-                current_sub_tokens.append(token)
-        out_string += self.sp_model.decode(current_sub_tokens)
-        return out_string
-
-    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> tuple[str]:
-        """
-        Save the vocabulary and special tokens file to a directory.
-
-        Args:
-            save_directory (`str`):
-                The directory in which to save the vocabulary.
-
-        Returns:
-            `Tuple(str)`: Paths to the files saved.
-        """
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (out_vocab_file,)
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
-        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
-
-        output = bos_token_id + token_ids_0 + eos_token_id
-
-        if token_ids_1 is not None:
-            output = output + bos_token_id + token_ids_1 + eos_token_id
-
-        return output
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        bos_token_id = [1] if self.add_bos_token else []
-        eos_token_id = [1] if self.add_eos_token else []
-
-        if token_ids_1 is None:
-            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
-        return (
-            bos_token_id
-            + ([0] * len(token_ids_0))
-            + eos_token_id
-            + bos_token_id
-            + ([0] * len(token_ids_1))
-            + eos_token_id
-        )
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
-        sequence pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence |
-        ```
-
-        if token_ids_1 is None, only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of ids.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
-        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
-
-        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
-
-        if token_ids_1 is not None:
-            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
-
-        return output
-
-    def _decode(
-        self,
-        token_ids: list[int],
-        skip_special_tokens: bool = False,
-        spaces_between_special_tokens: bool = False,
-        **kwargs,
-    ) -> str:
-        sub_texts = []
-        current_sub_text = []
-        for ids in token_ids:
-            if skip_special_tokens and ids in self.all_special_ids:
-                continue
-            if ids in self._added_tokens_decoder:
-                if current_sub_text:
-                    sub_texts.append(self.sp_model.decode(current_sub_text))
-                sub_texts.append(self._added_tokens_decoder[ids].content)
-                current_sub_text = []
-            else:
-                current_sub_text.append(ids)
-        if current_sub_text:
-            sub_texts.append(self.sp_model.decode(current_sub_text))
-
-        if spaces_between_special_tokens:
-            sub_texts = " ".join(sub_texts)
-        else:
-            sub_texts = "".join(sub_texts)
-
-        return sub_texts.replace(SPIECE_UNDERLINE, " ")
+    def _unk_id(self) -> int:
+        # Align with historical Gemma convention: pad, eos, bos, unk
+        return 3
 
 
 __all__ = ["GemmaTokenizer"]
diff --git a/src/transformers/models/gemma/tokenization_gemma_fast.py b/src/transformers/models/gemma/tokenization_gemma_fast.py
deleted file mode 100644
index 9fc6e3d3593b..000000000000
--- a/src/transformers/models/gemma/tokenization_gemma_fast.py
+++ /dev/null
@@ -1,195 +0,0 @@
-# coding=utf-8
-# Copyright 2024 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-from shutil import copyfile
-from typing import Optional
-
-from tokenizers import processors
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import is_sentencepiece_available, logging
-
-
-if is_sentencepiece_available():
-    from .tokenization_gemma import GemmaTokenizer
-else:
-    GemmaTokenizer = None
-
-logger = logging.get_logger(__name__)
-VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}
-
-
-class GemmaTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a Gemma tokenizer fast. Based on byte-level Byte-Pair-Encoding.
-
-    This uses notably ByteFallback and no prefix space. Normalization is applied to replace  `" "` with `"▁"`
-
-    ```python
-    >>> from transformers import GemmaTokenizerFast
-
-    >>> tokenizer = GemmaTokenizerFast.from_pretrained("hf-internal-testing/dummy-gemma")
-    >>> tokenizer.encode("Hello this is a test")
-    [2, 4521, 736, 603, 476, 2121]
-    ```
-
-    If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
-    call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
-    values of the first token and final token of an encoded sequence will not be correct). For more details, checkout
-    [post-processors] (https://huggingface.co/docs/tokenizers/api/post-processors) documentation.
-
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`, *optional*):
-            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
-            contains the vocabulary necessary to instantiate a tokenizer.
-        tokenizer_file (`str`, *optional*):
-            [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
-            contains everything needed to load the tokenizer.
-        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
-            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
-            extra spaces.
-        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`):
-            The end of sequence token.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The padding token
-        add_bos_token (`bool`, *optional*, defaults to `True`):
-            Whether or not to add an `bos_token` at the start of sequences.
-        add_eos_token (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an `eos_token` at the end of sequences.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = GemmaTokenizer
-    padding_side = "left"
-    model_input_names = ["input_ids", "attention_mask"]
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        clean_up_tokenization_spaces=False,
-        unk_token="",
-        bos_token="",
-        eos_token="",
-        pad_token="",
-        add_bos_token=True,
-        add_eos_token=False,
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file=vocab_file,
-            tokenizer_file=tokenizer_file,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            unk_token=unk_token,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            pad_token=pad_token,
-            add_bos_token=add_bos_token,
-            add_eos_token=add_eos_token,
-            **kwargs,
-        )
-        self._add_bos_token = add_bos_token
-        self._add_eos_token = add_eos_token
-        self.update_post_processor()
-        self.vocab_file = vocab_file
-
-    # Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.update_post_processor
-    def update_post_processor(self):
-        """
-        Updates the underlying post processor with the current `bos_token` and `eos_token`.
-        """
-        bos = self.bos_token
-        bos_token_id = self.bos_token_id
-        if bos is None and self.add_bos_token:
-            raise ValueError("add_bos_token = True but bos_token = None")
-
-        eos = self.eos_token
-        eos_token_id = self.eos_token_id
-        if eos is None and self.add_eos_token:
-            raise ValueError("add_eos_token = True but eos_token = None")
-
-        single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
-        pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"
-
-        special_tokens = []
-        if self.add_bos_token:
-            special_tokens.append((bos, bos_token_id))
-        if self.add_eos_token:
-            special_tokens.append((eos, eos_token_id))
-        self._tokenizer.post_processor = processors.TemplateProcessing(
-            single=single, pair=pair, special_tokens=special_tokens
-        )
-
-    @property
-    def add_eos_token(self):
-        return self._add_eos_token
-
-    @property
-    def add_bos_token(self):
-        return self._add_bos_token
-
-    @add_eos_token.setter
-    def add_eos_token(self, value):
-        self._add_eos_token = value
-        self.update_post_processor()
-
-    @add_bos_token.setter
-    def add_bos_token(self, value):
-        self._add_bos_token = value
-        self.update_post_processor()
-
-    # Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not self.can_save_slow_tokenizer:
-            raise ValueError(
-                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
-                "tokenizer."
-            )
-
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
-
-    # Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.build_inputs_with_special_tokens
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
-        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
-
-        output = bos_token_id + token_ids_0 + eos_token_id
-
-        if token_ids_1 is not None:
-            output = output + bos_token_id + token_ids_1 + eos_token_id
-
-        return output
-
-
-__all__ = ["GemmaTokenizerFast"]
diff --git a/src/transformers/models/got_ocr2/convert_got_ocr2_weights_to_hf.py b/src/transformers/models/got_ocr2/convert_got_ocr2_weights_to_hf.py
index 1556780c59f9..aa7130ece3bb 100644
--- a/src/transformers/models/got_ocr2/convert_got_ocr2_weights_to_hf.py
+++ b/src/transformers/models/got_ocr2/convert_got_ocr2_weights_to_hf.py
@@ -32,7 +32,7 @@
     is_vision_available,
 )
 from transformers.convert_slow_tokenizer import TikTokenConverter
-from transformers.tokenization_utils import AddedToken
+from transformers.tokenization_python import AddedToken
 
 
 if is_vision_available():
diff --git a/src/transformers/models/gpt2/__init__.py b/src/transformers/models/gpt2/__init__.py
index 4724af1e281e..090606b7b837 100644
--- a/src/transformers/models/gpt2/__init__.py
+++ b/src/transformers/models/gpt2/__init__.py
@@ -21,7 +21,6 @@
     from .configuration_gpt2 import *
     from .modeling_gpt2 import *
     from .tokenization_gpt2 import *
-    from .tokenization_gpt2_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/gpt2/tokenization_gpt2.py b/src/transformers/models/gpt2/tokenization_gpt2.py
index 608164ef2d83..9fe8d768e085 100644
--- a/src/transformers/models/gpt2/tokenization_gpt2.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2.py
@@ -14,14 +14,12 @@
 # limitations under the License.
 """Tokenization classes for OpenAI GPT."""
 
-import json
-import os
-from functools import lru_cache
 from typing import Optional
 
-import regex as re
+from tokenizers import Tokenizer, decoders, pre_tokenizers
+from tokenizers.models import BPE
 
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
 
 
@@ -33,46 +31,7 @@
 }
 
 
-@lru_cache
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
-    characters the bpe code barfs on.
-
-    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
-    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
-    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
-    tables between utf-8 bytes and unicode strings.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-def get_pairs(word):
-    """
-    Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-
-class GPT2Tokenizer(PreTrainedTokenizer):
+class GPT2Tokenizer(TokenizersBackend):
     """
     Construct a GPT-2 tokenizer. Based on byte-level Byte-Pair-Encoding.
 
@@ -99,7 +58,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
 
     
 
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should refer to
     this superclass for more information regarding those methods.
 
     Args:
@@ -125,48 +84,67 @@ class GPT2Tokenizer(PreTrainedTokenizer):
         add_bos_token (`bool`, *optional*, defaults to `False`):
             Whether or not to add an initial beginning of sentence token to the input. This allows to treat the leading
             word just as any other word.
+        vocab (`dict`, *optional*):
+            Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
+        merges (`list`, *optional*):
+            Custom merges list. If not provided, merges are loaded from merges_file.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
 
     def __init__(
         self,
-        vocab_file,
-        merges_file,
         errors="replace",
         unk_token="<|endoftext|>",
         bos_token="<|endoftext|>",
         eos_token="<|endoftext|>",
         pad_token=None,
-        add_prefix_space=False,
+        add_prefix_space=True,
         add_bos_token=False,
+        vocab: Optional[dict] = None,
+        merges: Optional[list] = None,
         **kwargs,
     ):
-        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
-        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
-
-        self.add_bos_token = add_bos_token
+        #  self.add_bos_token = add_bos_token
 
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            bpe_merges = merges_handle.read().split("\n")[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        self.cache = {}
         self.add_prefix_space = add_prefix_space
 
-        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
-        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+        if vocab is not None:
+            self._vocab = (
+                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
+            )
+        else:
+            self._vocab = {}
+
+        if merges is not None:
+            self._merges = [tuple(merge) if isinstance(merge, list) else merge for merge in merges]
+        else:
+            self._merges = []
+
+        self._tokenizer = Tokenizer(
+            BPE(
+                vocab=self._vocab,
+                merges=self._merges,
+                dropout=None,
+                continuing_subword_prefix="",
+                end_of_word_suffix="",
+                fuse_unk=False,
+            )
+        )
+
+        self._tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
+        self._tokenizer.decoder = decoders.ByteLevel()
+
+        tokenizer_object = self._tokenizer
+
+        # Set these before calling super().__init__() so the base class _post_init() can use them
+        self._add_bos_token = add_bos_token
+        self._add_eos_token = False
 
         super().__init__(
+            tokenizer_object=tokenizer_object,
             errors=errors,
             unk_token=unk_token,
             bos_token=bos_token,
@@ -177,158 +155,9 @@ def __init__(
             **kwargs,
         )
 
-    @property
-    def vocab_size(self):
-        return len(self.encoder)
-
-    def get_vocab(self):
-        return dict(self.encoder, **self.added_tokens_encoder)
-
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                except ValueError:
-                    new_word.extend(word[i:])
-                    break
-                else:
-                    new_word.extend(word[i:j])
-                    i = j
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        self.cache[token] = word
-        return word
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        if self.add_bos_token:
-            bos_token_ids = [self.bos_token_id]
-        else:
-            bos_token_ids = []
-
-        output = bos_token_ids + token_ids_0
-
-        if token_ids_1 is None:
-            return output
-
-        return output + bos_token_ids + token_ids_1
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if not self.add_bos_token:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=False
-            )
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0))
-        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
-
-    def _tokenize(self, text):
-        """Tokenize a string."""
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            token = "".join(
-                self.byte_encoder[b] for b in token.encode("utf-8")
-            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
-            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
-        return bpe_tokens
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        text = "".join(tokens)
-        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
-        return text
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-        merge_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
-        )
-
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
-
-        return vocab_file, merge_file
-
-    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
-        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
-        if is_split_into_words or add_prefix_space:
-            text = " " + text
-        return (text, kwargs)
+        # Call _post_init for tokenizers created directly (not from_pretrained)
+        # For from_pretrained, this will be called again after loading the tokenizer from file
+        self._post_init()
 
 
 __all__ = ["GPT2Tokenizer"]
diff --git a/src/transformers/models/gpt2/tokenization_gpt2_fast.py b/src/transformers/models/gpt2/tokenization_gpt2_fast.py
deleted file mode 100644
index f81c155e8644..000000000000
--- a/src/transformers/models/gpt2/tokenization_gpt2_fast.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for OpenAI GPT."""
-
-from typing import Optional
-
-from ...tokenization_utils_base import BatchEncoding
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .tokenization_gpt2 import GPT2Tokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
-
-
-class GPT2TokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" GPT-2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
-    Byte-Pair-Encoding.
-
-    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
-    be encoded differently whether it is at the beginning of the sentence (without space) or not:
-
-    ```python
-    >>> from transformers import GPT2TokenizerFast
-
-    >>> tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
-    >>> tokenizer("Hello world")["input_ids"]
-    [15496, 995]
-
-    >>> tokenizer(" Hello world")["input_ids"]
-    [18435, 995]
-    ```
-
-    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
-    the model was not pretrained this way, it might yield a decrease in performance.
-
-    
-
-    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
-
-    
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`, *optional*):
-            Path to the vocabulary file.
-        merges_file (`str`, *optional*):
-            Path to the merges file.
-        tokenizer_file (`str`, *optional*):
-            Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
-            contains everything needed to load the tokenizer.
-        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The beginning of sequence token.
-        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The end of sequence token.
-        add_prefix_space (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word. (GPT2 tokenizer detect beginning of words by the preceding space).
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = GPT2Tokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        merges_file=None,
-        tokenizer_file=None,
-        unk_token="<|endoftext|>",
-        bos_token="<|endoftext|>",
-        eos_token="<|endoftext|>",
-        add_prefix_space=False,
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file=vocab_file,
-            merges_file=merges_file,
-            tokenizer_file=tokenizer_file,
-            unk_token=unk_token,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            add_prefix_space=add_prefix_space,
-            **kwargs,
-        )
-
-        self.add_bos_token = kwargs.pop("add_bos_token", False)
-
-    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-        assert self.add_prefix_space or not is_split_into_words, (
-            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
-            "to use it with pretokenized inputs."
-        )
-
-        return super()._batch_encode_plus(*args, **kwargs)
-
-    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-
-        assert self.add_prefix_space or not is_split_into_words, (
-            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
-            "to use it with pretokenized inputs."
-        )
-
-        return super()._encode_plus(*args, **kwargs)
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-
-__all__ = ["GPT2TokenizerFast"]
diff --git a/src/transformers/models/gpt_neox/__init__.py b/src/transformers/models/gpt_neox/__init__.py
index fdf263901996..91ef201d1126 100644
--- a/src/transformers/models/gpt_neox/__init__.py
+++ b/src/transformers/models/gpt_neox/__init__.py
@@ -20,7 +20,7 @@
 if TYPE_CHECKING:
     from .configuration_gpt_neox import *
     from .modeling_gpt_neox import *
-    from .tokenization_gpt_neox_fast import *
+    from .tokenization_gpt_neox import *
 else:
     import sys
 
diff --git a/src/transformers/models/gpt_neox/tokenization_gpt_neox.py b/src/transformers/models/gpt_neox/tokenization_gpt_neox.py
new file mode 100644
index 000000000000..c57fae09ac87
--- /dev/null
+++ b/src/transformers/models/gpt_neox/tokenization_gpt_neox.py
@@ -0,0 +1,186 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for GPTNeoX."""
+
+from typing import Optional
+
+from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers
+from tokenizers.models import BPE
+
+from ...tokenization_utils_tokenizers import TokenizersBackend
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
+
+
+class GPTNeoXTokenizer(TokenizersBackend):
+    """
+    Construct a GPT-NeoX-20B tokenizer (backed by HuggingFace's tokenizers library). Based on byte-level
+    Byte-Pair-Encoding.
+
+    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:
+
+    ```python
+    >>> from transformers import GPTNeoXTokenizer
+
+    >>> tokenizer = GPTNeoXTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
+    >>> tokenizer("Hello world")["input_ids"]
+    [15496, 995]
+
+    >>> tokenizer(" Hello world")["input_ids"]
+    [18435, 995]
+    ```
+
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
+    the model was not pretrained this way, it might yield a decrease in performance.
+
+    
+
+    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
+
+    
+
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`, *optional*):
+            Path to the vocabulary file.
+        merges_file (`str`, *optional*):
+            Path to the merges file.
+        tokenizer_file (`str`, *optional*):
+            Path to a tokenizers JSON file containing the serialization of a tokenizer.
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See
+            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The end of sequence token.
+        pad_token (`str`, *optional*, defaults to `"<|padding|>"`):
+            Token for padding a sequence.
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
+            other word. (GPTNeoX tokenizer detect beginning of words by the preceding space).
+        add_bos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add a `bos_token` at the start of sequences.
+        add_eos_token (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an `eos_token` at the end of sequences.
+        trim_offsets (`bool`, *optional*, defaults to `True`):
+            Whether or not the post-processing step should trim offsets to avoid including whitespaces.
+        vocab (`dict`, *optional*):
+            Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
+        merges (`list`, *optional*):
+            Custom merges list. If not provided, merges are loaded from merges_file.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
+
+    def __init__(
+        self,
+        errors: str = "replace",
+        unk_token: str = "<|endoftext|>",
+        bos_token: str = "<|endoftext|>",
+        eos_token: str = "<|endoftext|>",
+        pad_token: str = "<|padding|>",
+        add_bos_token: bool = False,
+        add_eos_token: bool = False,
+        add_prefix_space: bool = False,
+        trim_offsets: bool = True,
+        vocab: Optional[dict] = None,
+        merges: Optional[list] = None,
+        **kwargs,
+    ):
+        self._add_bos_token = add_bos_token
+        self._add_eos_token = add_eos_token
+        self.add_prefix_space = add_prefix_space
+        self.trim_offsets = trim_offsets
+
+        if vocab is not None:
+            self._vocab = (
+                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
+            )
+        else:
+            self._vocab = {
+                str(unk_token): 0,
+                str(pad_token): 1,
+            }
+
+        if merges is not None:
+            self._merges = merges
+        else:
+            self._merges = []
+
+        self._tokenizer = Tokenizer(
+            BPE(
+                vocab=self._vocab,
+                merges=self._merges,
+                dropout=None,
+                continuing_subword_prefix="",
+                end_of_word_suffix="",
+                fuse_unk=False,
+            )
+        )
+
+        self._tokenizer.normalizer = normalizers.NFC()
+        self._tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
+            add_prefix_space=add_prefix_space, trim_offsets=trim_offsets
+        )
+        self._tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, trim_offsets=True)
+
+        tokenizer_object = self._tokenizer
+
+        super().__init__(
+            tokenizer_object=tokenizer_object,
+            errors=errors,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            add_prefix_space=add_prefix_space,
+            trim_offsets=trim_offsets,
+            **kwargs,
+        )
+
+        self.update_post_processor()
+
+    def _post_init(self):
+        """Post-initialization to ensure tokenizer settings are applied correctly."""
+        # Re-apply settings to ensure they're correct after loading from pretrained
+        self._tokenizer.normalizer = normalizers.NFC()
+        self._tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
+            add_prefix_space=self.add_prefix_space, trim_offsets=self.trim_offsets
+        )
+        self._tokenizer.decoder = decoders.ByteLevel(add_prefix_space=False, trim_offsets=True)
+
+        # Call parent to handle AddedToken properties
+        super()._post_init()
+
+        # Update post processor with current bos/eos settings
+        self.update_post_processor()
+
+
+__all__ = ["GPTNeoXTokenizer"]
diff --git a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py b/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py
deleted file mode 100644
index a3b190a60eb1..000000000000
--- a/src/transformers/models/gpt_neox/tokenization_gpt_neox_fast.py
+++ /dev/null
@@ -1,224 +0,0 @@
-# coding=utf-8
-# Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for GPTNeoX."""
-
-from typing import Optional
-
-from tokenizers import processors
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
-
-
-class GPTNeoXTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" GPT-NeoX-20B tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
-    Byte-Pair-Encoding.
-
-    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
-    be encoded differently whether it is at the beginning of the sentence (without space) or not:
-
-    ```python
-    >>> from transformers import GPTNeoXTokenizerFast
-
-    >>> tokenizer = GPTNeoXTokenizerFast.from_pretrained("openai-community/gpt2")
-    >>> tokenizer("Hello world")["input_ids"]
-    [15496, 995]
-
-    >>> tokenizer(" Hello world")["input_ids"]
-    [18435, 995]
-    ```
-
-    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
-    the model was not pretrained this way, it might yield a decrease in performance.
-
-    
-
-    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
-
-    
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        merges_file (`str`):
-            Path to the merges file.
-        errors (`str`, *optional*, defaults to `"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See
-            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
-        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
-            The beginning of sequence token.
-        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
-            The end of sequence token.
-        pad_token (`str`, *optional*):
-            Token for padding a sequence.
-        add_prefix_space (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word. (GPTNeoX tokenizer detect beginning of words by the preceding space).
-        add_bos_token (`bool`, *optional*, defaults to `False`):
-            Whether or not to add a `bos_token` at the start of sequences.
-        add_eos_token (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an `eos_token` at the end of sequences.
-        trim_offsets (`bool`, *optional*, defaults to `True`):
-            Whether or not the post-processing step should trim offsets to avoid including whitespaces.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-
-    def __init__(
-        self,
-        vocab_file=None,
-        merges_file=None,
-        tokenizer_file=None,
-        unk_token="<|endoftext|>",
-        bos_token="<|endoftext|>",
-        eos_token="<|endoftext|>",
-        pad_token=None,
-        add_bos_token=False,
-        add_eos_token=False,
-        add_prefix_space=False,
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file=vocab_file,
-            merges_file=merges_file,
-            tokenizer_file=tokenizer_file,
-            unk_token=unk_token,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            pad_token=pad_token,
-            add_bos_token=add_bos_token,
-            add_eos_token=add_eos_token,
-            add_prefix_space=add_prefix_space,
-            **kwargs,
-        )
-
-        self._add_bos_token = add_bos_token
-        self._add_eos_token = add_eos_token
-        self.update_post_processor()
-
-    @property
-    def add_eos_token(self):
-        return self._add_eos_token
-
-    @property
-    def add_bos_token(self):
-        return self._add_bos_token
-
-    @add_eos_token.setter
-    def add_eos_token(self, value):
-        self._add_eos_token = value
-        self.update_post_processor()
-
-    @add_bos_token.setter
-    def add_bos_token(self, value):
-        self._add_bos_token = value
-        self.update_post_processor()
-
-    # Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.update_post_processor
-    def update_post_processor(self):
-        """
-        Updates the underlying post processor with the current `bos_token` and `eos_token`.
-        """
-        bos = self.bos_token
-        bos_token_id = self.bos_token_id
-        if bos is None and self.add_bos_token:
-            raise ValueError("add_bos_token = True but bos_token = None")
-
-        eos = self.eos_token
-        eos_token_id = self.eos_token_id
-        if eos is None and self.add_eos_token:
-            raise ValueError("add_eos_token = True but eos_token = None")
-
-        single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
-        pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"
-
-        special_tokens = []
-        if self.add_bos_token:
-            special_tokens.append((bos, bos_token_id))
-        if self.add_eos_token:
-            special_tokens.append((eos, eos_token_id))
-        self._tokenizer.post_processor = processors.TemplateProcessing(
-            single=single, pair=pair, special_tokens=special_tokens
-        )
-
-    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_special_tokens_mask
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        bos_token_id = [1] if self.add_bos_token else []
-        eos_token_id = [1] if self.add_eos_token else []
-
-        if token_ids_1 is None:
-            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
-        return (
-            bos_token_id
-            + ([0] * len(token_ids_0))
-            + eos_token_id
-            + bos_token_id
-            + ([0] * len(token_ids_1))
-            + eos_token_id
-        )
-
-    # Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.build_inputs_with_special_tokens
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
-        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
-
-        output = bos_token_id + token_ids_0 + eos_token_id
-
-        if token_ids_1 is not None:
-            output = output + bos_token_id + token_ids_1 + eos_token_id
-
-        return output
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-
-__all__ = ["GPTNeoXTokenizerFast"]
diff --git a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
index 584e74a8123e..e706850dca4e 100644
--- a/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
+++ b/src/transformers/models/gpt_neox_japanese/tokenization_gpt_neox_japanese.py
@@ -23,7 +23,7 @@
 
 import numpy as np
 
-from ...tokenization_utils_fast import PreTrainedTokenizer
+from ...tokenization_python import PreTrainedTokenizer
 from ...utils import logging
 
 
@@ -135,6 +135,7 @@ def __init__(
             bos_token=bos_token,
             eos_token=eos_token,
             do_clean_text=do_clean_text,
+            special_tokens_pattern="none",
             **kwargs,
         )
 
diff --git a/src/transformers/models/gpt_oss/convert_gpt_oss_weights_to_hf.py b/src/transformers/models/gpt_oss/convert_gpt_oss_weights_to_hf.py
index f685a751a871..6c80045f251a 100644
--- a/src/transformers/models/gpt_oss/convert_gpt_oss_weights_to_hf.py
+++ b/src/transformers/models/gpt_oss/convert_gpt_oss_weights_to_hf.py
@@ -332,7 +332,6 @@ def create_safetensors_index(safetensors_index, num_shards, model_path):
         json.dump(safetensors_index, f)
 
 
-# Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
 def bytes_to_unicode():
     """
     Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
diff --git a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
index 3019acfd5bcc..ef9a90955dd3 100644
--- a/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
+++ b/src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py
@@ -1,14 +1,10 @@
 """The tokenizer used by the GPT-SW3 models."""
 
-import os
 import re
 import unicodedata
-from shutil import copyfile
 from typing import Any, Optional, Union
 
-import sentencepiece as spm
-
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_utils_sentencepiece import SentencePieceBackend
 from ...utils import is_torch_available, logging
 from ...utils.import_utils import requires
 
@@ -22,7 +18,7 @@
 
 
 @requires(backends=("sentencepiece",))
-class GPTSw3Tokenizer(PreTrainedTokenizer):
+class GPTSw3Tokenizer(SentencePieceBackend):
     """
     Construct an GPTSw3 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
 
@@ -86,6 +82,7 @@ class GPTSw3Tokenizer(PreTrainedTokenizer):
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    is_fast = False
 
     def __init__(
         self,
@@ -100,8 +97,6 @@ def __init__(
         sp_model_kwargs: Optional[dict[str, Any]] = None,
         **kwargs,
     ) -> None:
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-
         name_or_path = kwargs.get("name_or_path")
         if name_or_path is None:
             logger.warning(
@@ -123,14 +118,10 @@ def __init__(
         self.do_lower_case = do_lower_case
         self.remove_space = remove_space
         self.keep_accents = keep_accents
-        self.vocab_file = vocab_file
-
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(vocab_file)
 
         # Used for whitespace normalization in input texts
         # fmt : off
-        self.whitespaces = {" ", " ", " ", " ", " ", " ", " ", " ", " ", " ", "", "„"}
+        self.whitespaces = {" ", " ", " ", " ", " ", " ", " ", " ", " ", " ", "", ""}
         # fmt : on
 
         # Regular expression to remove non-printing characters (e.g. some unicode control chars) in preprocessing
@@ -138,7 +129,13 @@ def __init__(
             f"[{''.join(map(chr, list(range(0, 9)) + list(range(11, 32)) + list(range(127, 160)) + [160, 173, 8203]))}]"
         )
 
+        # Ensure sp_model_kwargs is in kwargs for proper signature storage
+        # Always add it even if None, parent class will handle the conversion to {}
+        kwargs["sp_model_kwargs"] = sp_model_kwargs if sp_model_kwargs is not None else {}
+
+        # Call parent init (which will load sp_model)
         super().__init__(
+            vocab_file=vocab_file,
             do_lower_case=do_lower_case,
             remove_space=remove_space,
             keep_accents=keep_accents,
@@ -146,32 +143,10 @@ def __init__(
             eos_token=eos_token,
             unk_token=unk_token,
             pad_token=pad_token,
-            sp_model_kwargs=self.sp_model_kwargs,
+            special_tokens_pattern="none",
             **kwargs,
         )
 
-    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.__getstate__
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        return state
-
-    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.__setstate__
-    def __setstate__(self, d):
-        self.__dict__ = d
-
-        # for backward compatibility
-        if not hasattr(self, "sp_model_kwargs"):
-            self.sp_model_kwargs = {}
-
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(self.vocab_file)
-
-    @property
-    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.vocab_size
-    def vocab_size(self) -> int:
-        return len(self.sp_model)
-
     def preprocess_text(self, text: str) -> str:
         """
         Returns the preprocessed text. This procedure is identical to what was used when training the tokenizer.
@@ -191,19 +166,6 @@ def _tokenize(self, text: str, **kwargs) -> list[str]:
         text = self.preprocess_text(text)
         return self.sp_model.encode(text, out_type=str)
 
-    def _convert_token_to_id(self, token: str) -> int:
-        """Converts a token (str) to an id (int) using the vocab."""
-        return self.sp_model.PieceToId(token)
-
-    def _convert_id_to_token(self, index: int) -> str:
-        """Converts an index (int) to a token (str) using the vocab."""
-        return self.sp_model.IdToPiece(index)
-
-    @staticmethod
-    def clean_up_tokenization(out_string: str) -> str:
-        """Returns the input string, this function is overridden to remove the default clean up."""
-        return out_string
-
     def convert_tokens_to_string(self, tokens: list[str]) -> str:
         """Converts a sequence of tokens (strings) to a single string. Special tokens remain intact."""
         current_sub_tokens = []
@@ -226,30 +188,6 @@ def convert_tokens_to_string(self, tokens: list[str]) -> str:
 
         return out_string
 
-    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.get_vocab
-    def get_vocab(self) -> dict[str, int]:
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (out_vocab_file,)
-
     def encode_fast(
         self, text: Union[str, list[str]], return_tensors: Union[str, bool] = False
     ) -> Union[list[int], list[list[int]], "torch.Tensor"]:
diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py
index 910840bd661c..f0af7a797ab7 100644
--- a/src/transformers/models/granite_speech/processing_granite_speech.py
+++ b/src/transformers/models/granite_speech/processing_granite_speech.py
@@ -18,7 +18,7 @@
 
 from ...feature_extraction_utils import BatchFeature
 from ...processing_utils import ProcessorMixin
-from ...tokenization_utils import PreTokenizedInput, TextInput
+from ...tokenization_python import PreTokenizedInput, TextInput
 from ...utils import is_torch_available, logging
 from ...utils.import_utils import requires_backends
 
diff --git a/src/transformers/models/herbert/__init__.py b/src/transformers/models/herbert/__init__.py
index e0d0794a06e8..77c7ec616f1f 100644
--- a/src/transformers/models/herbert/__init__.py
+++ b/src/transformers/models/herbert/__init__.py
@@ -19,7 +19,6 @@
 
 if TYPE_CHECKING:
     from .tokenization_herbert import *
-    from .tokenization_herbert_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/herbert/tokenization_herbert.py b/src/transformers/models/herbert/tokenization_herbert.py
index c1c6bacc87fc..af3c259c0a28 100644
--- a/src/transformers/models/herbert/tokenization_herbert.py
+++ b/src/transformers/models/herbert/tokenization_herbert.py
@@ -12,606 +12,118 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import json
-import os
-import re
-import unicodedata
+
 from typing import Optional
 
-from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
+from tokenizers.models import BPE
+
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
 
 
 logger = logging.get_logger(__name__)
 
-VOCAB_FILES_NAMES = {
-    "vocab_file": "vocab.json",
-    "merges_file": "merges.txt",
-}
-
-
-# Copied from transformers.models.xlm.tokenization_xlm.get_pairs
-def get_pairs(word):
-    """
-    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
-    strings)
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-
-# Copied from transformers.models.xlm.tokenization_xlm.replace_unicode_punct
-def replace_unicode_punct(text):
-    """
-    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
-    """
-    text = text.replace(",", ",")
-    text = re.sub(r"。\s*", ". ", text)
-    text = text.replace("、", ",")
-    text = text.replace("”", '"')
-    text = text.replace("“", '"')
-    text = text.replace("∶", ":")
-    text = text.replace(":", ":")
-    text = text.replace("?", "?")
-    text = text.replace("《", '"')
-    text = text.replace("》", '"')
-    text = text.replace(")", ")")
-    text = text.replace("!", "!")
-    text = text.replace("(", "(")
-    text = text.replace(";", ";")
-    text = text.replace("1", "1")
-    text = text.replace("」", '"')
-    text = text.replace("「", '"')
-    text = text.replace("0", "0")
-    text = text.replace("3", "3")
-    text = text.replace("2", "2")
-    text = text.replace("5", "5")
-    text = text.replace("6", "6")
-    text = text.replace("9", "9")
-    text = text.replace("7", "7")
-    text = text.replace("8", "8")
-    text = text.replace("4", "4")
-    text = re.sub(r".\s*", ". ", text)
-    text = text.replace("~", "~")
-    text = text.replace("’", "'")
-    text = text.replace("…", "...")
-    text = text.replace("━", "-")
-    text = text.replace("〈", "<")
-    text = text.replace("〉", ">")
-    text = text.replace("【", "[")
-    text = text.replace("】", "]")
-    text = text.replace("%", "%")
-    return text
-
-
-# Copied from transformers.models.xlm.tokenization_xlm.remove_non_printing_char
-def remove_non_printing_char(text):
-    """
-    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
-    """
-    output = []
-    for char in text:
-        cat = unicodedata.category(char)
-        if cat.startswith("C"):
-            continue
-        output.append(char)
-    return "".join(output)
-
-
-# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
-def whitespace_tokenize(text):
-    """Runs basic whitespace cleaning and splitting on a piece of text."""
-    text = text.strip()
-    if not text:
-        return []
-    tokens = text.split()
-    return tokens
-
-
-# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
-class BasicTokenizer:
-    """
-    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
 
-    Args:
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
 
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original BERT).
-        do_split_on_punc (`bool`, *optional*, defaults to `True`):
-            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
-            the full context of the words, such as contractions.
+class HerbertTokenizer(TokenizersBackend):
     """
-
-    def __init__(
-        self,
-        do_lower_case=True,
-        never_split=None,
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        do_split_on_punc=True,
-    ):
-        if never_split is None:
-            never_split = []
-        self.do_lower_case = do_lower_case
-        self.never_split = set(never_split)
-        self.tokenize_chinese_chars = tokenize_chinese_chars
-        self.strip_accents = strip_accents
-        self.do_split_on_punc = do_split_on_punc
-
-    def tokenize(self, text, never_split=None):
-        """
-        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
-
-        Args:
-            never_split (`List[str]`, *optional*)
-                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
-        """
-        # union() returns a new set by concatenating the two sets.
-        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
-        text = self._clean_text(text)
-
-        # This was added on November 1st, 2018 for the multilingual and Chinese
-        # models. This is also applied to the English models now, but it doesn't
-        # matter since the English models were not trained on any Chinese data
-        # and generally don't have any Chinese data in them (there are Chinese
-        # characters in the vocabulary because Wikipedia does have some Chinese
-        # words in the English Wikipedia.).
-        if self.tokenize_chinese_chars:
-            text = self._tokenize_chinese_chars(text)
-        # prevents treating the same character with different unicode codepoints as different characters
-        unicode_normalized_text = unicodedata.normalize("NFC", text)
-        orig_tokens = whitespace_tokenize(unicode_normalized_text)
-        split_tokens = []
-        for token in orig_tokens:
-            if token not in never_split:
-                if self.do_lower_case:
-                    token = token.lower()
-                    if self.strip_accents is not False:
-                        token = self._run_strip_accents(token)
-                elif self.strip_accents:
-                    token = self._run_strip_accents(token)
-            split_tokens.extend(self._run_split_on_punc(token, never_split))
-
-        output_tokens = whitespace_tokenize(" ".join(split_tokens))
-        return output_tokens
-
-    def _run_strip_accents(self, text):
-        """Strips accents from a piece of text."""
-        text = unicodedata.normalize("NFD", text)
-        output = []
-        for char in text:
-            cat = unicodedata.category(char)
-            if cat == "Mn":
-                continue
-            output.append(char)
-        return "".join(output)
-
-    def _run_split_on_punc(self, text, never_split=None):
-        """Splits punctuation on a piece of text."""
-        if not self.do_split_on_punc or (never_split is not None and text in never_split):
-            return [text]
-        chars = list(text)
-        i = 0
-        start_new_word = True
-        output = []
-        while i < len(chars):
-            char = chars[i]
-            if _is_punctuation(char):
-                output.append([char])
-                start_new_word = True
-            else:
-                if start_new_word:
-                    output.append([])
-                start_new_word = False
-                output[-1].append(char)
-            i += 1
-
-        return ["".join(x) for x in output]
-
-    def _tokenize_chinese_chars(self, text):
-        """Adds whitespace around any CJK character."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if self._is_chinese_char(cp):
-                output.append(" ")
-                output.append(char)
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-    def _is_chinese_char(self, cp):
-        """Checks whether CP is the codepoint of a CJK character."""
-        # This defines a "chinese character" as anything in the CJK Unicode block:
-        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-        #
-        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-        # despite its name. The modern Korean Hangul alphabet is a different block,
-        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-        # space-separated words, so they are not treated specially and handled
-        # like the all of the other languages.
-        if (
-            (cp >= 0x4E00 and cp <= 0x9FFF)
-            or (cp >= 0x3400 and cp <= 0x4DBF)
-            or (cp >= 0x20000 and cp <= 0x2A6DF)
-            or (cp >= 0x2A700 and cp <= 0x2B73F)
-            or (cp >= 0x2B740 and cp <= 0x2B81F)
-            or (cp >= 0x2B820 and cp <= 0x2CEAF)
-            or (cp >= 0xF900 and cp <= 0xFAFF)
-            or (cp >= 0x2F800 and cp <= 0x2FA1F)
-        ):
-            return True
-
-        return False
-
-    def _clean_text(self, text):
-        """Performs invalid character removal and whitespace cleanup on text."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if cp == 0 or cp == 0xFFFD or _is_control(char):
-                continue
-            if _is_whitespace(char):
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-
-class HerbertTokenizer(PreTrainedTokenizer):
-    """
-    Construct a BPE tokenizer for HerBERT.
+    Construct a BPE tokenizer for HerBERT (backed by HuggingFace's tokenizers library).
 
     Peculiarities:
 
-    - uses BERT's pre-tokenizer: BaseTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of a
-      punctuation character will be treated separately.
-
-    - Such pretokenized input is BPE subtokenized
+    - uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of
+      a punctuation character will be treated separately.
 
-    This tokenizer inherits from [`XLMTokenizer`] which contains most of the methods. Users should refer to the
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the methods. Users should refer to the
     superclass for more information regarding methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        merges_file (`str`):
+            Path to the merges file.
+        cls_token (`str`, *optional*, defaults to `""`):
+            The classifier token.
+        unk_token (`str`, *optional*, defaults to `""`):
+            The unknown token.
+        pad_token (`str`, *optional*, defaults to `""`):
+            The padding token.
+        mask_token (`str`, *optional*, defaults to `""`):
+            The mask token.
+        sep_token (`str`, *optional*, defaults to `""`):
+            The separator token.
+        vocab (`dict`, *optional*):
+            Custom vocabulary dictionary.
+        merges (`list`, *optional*):
+            Custom merges list.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = None
 
     def __init__(
         self,
-        vocab_file,
-        merges_file,
-        tokenizer_file=None,
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        mask_token="",
-        sep_token="",
-        bos_token="",
-        do_lowercase_and_remove_accent=False,
-        additional_special_tokens=[
-            "",
-            "",
-            "",
-            "",
-            "",
-            "",
-            "",
-            "",
-            "",
-            "",
-        ],
-        lang2id=None,
-        id2lang=None,
+        vocab: Optional[dict] = None,
+        merges: Optional[list] = None,
+        cls_token: str = "",
+        unk_token: str = "",
+        pad_token: str = "",
+        mask_token: str = "",
+        sep_token: str = "",
+        vocab_file: Optional[str] = None,
+        merges_file: Optional[str] = None,
         **kwargs,
     ):
-        try:
-            import sacremoses
-        except ImportError:
-            raise ImportError(
-                "You need to install sacremoses to use HerbertTokenizer. "
-                "See https://pypi.org/project/sacremoses/ for installation."
+        if vocab is not None:
+            self._vocab = (
+                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
             )
+        else:
+            self._vocab = {}
 
-        self.sm = sacremoses
+        if merges is not None:
+            # Convert lists to tuples if necessary (happens when loading from JSON)
+            self._merges = [tuple(merge) if isinstance(merge, list) else merge for merge in merges]
+        else:
+            self._merges = []
+
+        self._tokenizer = Tokenizer(
+            BPE(
+                vocab=self._vocab,
+                merges=self._merges,
+                dropout=None,
+                unk_token=str(unk_token),
+                end_of_word_suffix="",
+            )
+        )
 
-        # cache of sm.MosesPunctNormalizer instance
-        self.cache_moses_punct_normalizer = {}
-        # cache of sm.MosesTokenizer instance
-        self.cache_moses_tokenizer = {}
-        self.lang_with_custom_tokenizer = {"zh", "th", "ja"}
-        # True for current supported model (v1.2.0), False for XLM-17 & 100
-        self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent
-        self.lang2id = lang2id
-        self.id2lang = id2lang
-        if lang2id is not None and id2lang is not None:
-            assert len(lang2id) == len(id2lang)
+        self._tokenizer.normalizer = normalizers.BertNormalizer(
+            lowercase=False, strip_accents=False, clean_text=True, handle_chinese_chars=True
+        )
+        self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
+        self._tokenizer.decoder = decoders.BPEDecoder(suffix="")
 
-        self.ja_word_tokenizer = None
-        self.zh_word_tokenizer = None
+        tokenizer_object = self._tokenizer
 
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            merges = merges_handle.read().split("\n")[:-1]
-        merges = [tuple(merge.split()[:2]) for merge in merges]
-        self.bpe_ranks = dict(zip(merges, range(len(merges))))
-        self.cache = {}
+        self.vocab_file = vocab_file
+        self.merges_file = merges_file
 
         super().__init__(
+            tokenizer_object=tokenizer_object,
+            cls_token=cls_token,
             unk_token=unk_token,
-            bos_token=bos_token,
-            sep_token=sep_token,
             pad_token=pad_token,
-            cls_token=cls_token,
             mask_token=mask_token,
-            additional_special_tokens=additional_special_tokens,
-            lang2id=lang2id,
-            id2lang=id2lang,
-            do_lowercase_and_remove_accent=do_lowercase_and_remove_accent,
-            tokenizer_file=None,
+            sep_token=sep_token,
             **kwargs,
         )
 
-        self.bert_pre_tokenizer = BasicTokenizer(
-            do_lower_case=False,
-            never_split=self.all_special_tokens,
-            tokenize_chinese_chars=False,
-            strip_accents=False,
-        )
-
-    @property
-    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.do_lower_case
-    def do_lower_case(self):
-        return self.do_lowercase_and_remove_accent
-
-    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_punct_norm
-    def moses_punct_norm(self, text, lang):
-        if lang not in self.cache_moses_punct_normalizer:
-            punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang)
-            self.cache_moses_punct_normalizer[lang] = punct_normalizer
-        else:
-            punct_normalizer = self.cache_moses_punct_normalizer[lang]
-        return punct_normalizer.normalize(text)
-
-    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_tokenize
-    def moses_tokenize(self, text, lang):
-        if lang not in self.cache_moses_tokenizer:
-            moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
-            self.cache_moses_tokenizer[lang] = moses_tokenizer
-        else:
-            moses_tokenizer = self.cache_moses_tokenizer[lang]
-        return moses_tokenizer.tokenize(text, return_str=False, escape=False)
-
-    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.moses_pipeline
-    def moses_pipeline(self, text, lang):
-        text = replace_unicode_punct(text)
-        text = self.moses_punct_norm(text, lang)
-        text = remove_non_printing_char(text)
-        return text
-
-    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.ja_tokenize
-    def ja_tokenize(self, text):
-        if self.ja_word_tokenizer is None:
-            try:
-                import Mykytea
-
-                self.ja_word_tokenizer = Mykytea.Mykytea(
-                    f"-model {os.path.expanduser('~')}/local/share/kytea/model.bin"
-                )
-            except (AttributeError, ImportError):
-                logger.error(
-                    "Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper"
-                    " (https://github.com/chezou/Mykytea-python) with the following steps"
-                )
-                logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
-                logger.error("2. autoreconf -i")
-                logger.error("3. ./configure --prefix=$HOME/local")
-                logger.error("4. make && make install")
-                logger.error("5. pip install kytea")
-                raise
-        return list(self.ja_word_tokenizer.getWS(text))
-
-    @property
-    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.vocab_size
-    def vocab_size(self):
-        return len(self.encoder)
-
-    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.get_vocab
-    def get_vocab(self):
-        return dict(self.encoder, **self.added_tokens_encoder)
-
-    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.bpe
-    def bpe(self, token):
-        word = tuple(token[:-1]) + (token[-1] + "",)
-        if token in self.cache:
-            return self.cache[token]
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token + ""
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                except ValueError:
-                    new_word.extend(word[i:])
-                    break
-                else:
-                    new_word.extend(word[i:j])
-                    i = j
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        if word == "\n  ":
-            word = "\n"
-        self.cache[token] = word
-        return word
-
-    def _tokenize(self, text):
-        pre_tokens = self.bert_pre_tokenizer.tokenize(text)
-
-        split_tokens = []
-        for token in pre_tokens:
-            if token:
-                split_tokens.extend(list(self.bpe(token).split(" ")))
-
-        return split_tokens
-
-    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer._convert_token_to_id
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer._convert_id_to_token
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index, self.unk_token)
-
-    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.convert_tokens_to_string
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        out_string = "".join(tokens).replace("", " ").strip()
-        return out_string
-
-    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.build_inputs_with_special_tokens
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An XLM sequence has the following format:
-
-        - single sequence: ` X `
-        - pair of sequences: ` A  B `
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-
-        """
-        bos = [self.bos_token_id]
-        sep = [self.sep_token_id]
-
-        if token_ids_1 is None:
-            return bos + token_ids_0 + sep
-        return bos + token_ids_0 + sep + token_ids_1 + sep
-
-    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.get_special_tokens_mask
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-        merge_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+        self._tokenizer.post_processor = processors.BertProcessing(
+            sep=(self.sep_token, 2),
+            cls=(self.cls_token, 0),
         )
 
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
-
-        return vocab_file, merge_file
-
-    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.__getstate__
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sm"] = None
-        return state
-
-    # Copied from transformers.models.xlm.tokenization_xlm.XLMTokenizer.__setstate__
-    def __setstate__(self, d):
-        self.__dict__ = d
-
-        try:
-            import sacremoses
-        except ImportError:
-            raise ImportError(
-                "You need to install sacremoses to use XLMTokenizer. "
-                "See https://pypi.org/project/sacremoses/ for installation."
-            )
-
-        self.sm = sacremoses
-
 
 __all__ = ["HerbertTokenizer"]
diff --git a/src/transformers/models/herbert/tokenization_herbert_fast.py b/src/transformers/models/herbert/tokenization_herbert_fast.py
deleted file mode 100644
index fdc24e3c6a6e..000000000000
--- a/src/transformers/models/herbert/tokenization_herbert_fast.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google AI Language Team Authors, Allegro.pl, Facebook Inc. and the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .tokenization_herbert import HerbertTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
-
-
-class HerbertTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "Fast" BPE tokenizer for HerBERT (backed by HuggingFace's *tokenizers* library).
-
-    Peculiarities:
-
-    - uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of
-      a punctuation character will be treated separately.
-
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the methods. Users should refer to the
-    superclass for more information regarding methods.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        merges_file (`str`):
-            Path to the merges file.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = HerbertTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        merges_file=None,
-        tokenizer_file=None,
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        mask_token="",
-        sep_token="",
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file,
-            merges_file,
-            tokenizer_file=tokenizer_file,
-            cls_token=cls_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            mask_token=mask_token,
-            sep_token=sep_token,
-            **kwargs,
-        )
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An HerBERT, like BERT sequence has the following format:
-
-        - single sequence: ` X `
-        - pair of sequences: ` A  B `
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        if token_ids_1 is None:
-            return cls + token_ids_0 + sep
-
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-
-__all__ = ["HerbertTokenizerFast"]
diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py
index df5f9ca73a8b..b21f6126b1c4 100644
--- a/src/transformers/models/idefics2/processing_idefics2.py
+++ b/src/transformers/models/idefics2/processing_idefics2.py
@@ -16,6 +16,7 @@
 Processor class for IDEFICS2.
 """
 
+import re
 from itertools import accumulate
 from typing import TYPE_CHECKING, Optional, Union
 
@@ -188,11 +189,14 @@ def __call__(
                 image_str = image_str * 5
 
             prompt_strings = []
+            closing_fake_pattern = re.compile(rf"{re.escape(fake_image_token)}(?=[^\s<])")
             for sample in text:
                 n_images_in_text.append(sample.count(image_token))
                 sample = sample.replace(image_token, image_str)
                 # Remove any double fake tokens if images are adjacent
                 sample = sample.replace(f"{fake_image_token}{fake_image_token}", f"{fake_image_token}")
+                # Ensure words attached directly after the closing fake token remain word-boundary aligned
+                sample = closing_fake_pattern.sub(f"{fake_image_token} ", sample)
                 prompt_strings.append(sample)
 
             text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
diff --git a/src/transformers/models/kosmos2/processing_kosmos2.py b/src/transformers/models/kosmos2/processing_kosmos2.py
index d6fd1e6ec758..736cc7d917f6 100644
--- a/src/transformers/models/kosmos2/processing_kosmos2.py
+++ b/src/transformers/models/kosmos2/processing_kosmos2.py
@@ -22,7 +22,7 @@
 from ...image_processing_utils import BatchFeature
 from ...image_utils import ImageInput
 from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
-from ...tokenization_utils import AddedToken
+from ...tokenization_python import AddedToken
 from ...tokenization_utils_base import BatchEncoding, TextInput
 
 
diff --git a/src/transformers/models/kosmos2_5/processing_kosmos2_5.py b/src/transformers/models/kosmos2_5/processing_kosmos2_5.py
index 5d1ec20c75de..25f2eed74107 100644
--- a/src/transformers/models/kosmos2_5/processing_kosmos2_5.py
+++ b/src/transformers/models/kosmos2_5/processing_kosmos2_5.py
@@ -55,8 +55,8 @@ class Kosmos2_5Processor(ProcessorMixin):
     Args:
         image_processor (`Kosmos2_5ImageProcessor`):
             An instance of [`Kosmos2_5ImageProcessor`]. The image processor is a required input.
-        tokenizer (Union[`T5TokenizerFast`, `T5Tokenizer`]):
-            An instance of ['T5TokenizerFast`] or ['T5Tokenizer`]. The tokenizer is a required input.
+        tokenizer (`T5Tokenizer`):
+            An instance of ['T5Tokenizer`]. The tokenizer is a required input.
         num_image_tokens (`int`, *optional*, defaults to 2048):
             Number of image tokens used as a placeholder.
     """
diff --git a/src/transformers/models/layoutlm/__init__.py b/src/transformers/models/layoutlm/__init__.py
index 5db595015b49..5edb5482146e 100644
--- a/src/transformers/models/layoutlm/__init__.py
+++ b/src/transformers/models/layoutlm/__init__.py
@@ -18,10 +18,10 @@
 
 
 if TYPE_CHECKING:
+    from ..bert.tokenization_bert import BertTokenizer as LayoutLMTokenizer
+    from ..bert.tokenization_bert import BertTokenizer as LayoutLMTokenizerFast
     from .configuration_layoutlm import *
     from .modeling_layoutlm import *
-    from .tokenization_layoutlm import *
-    from .tokenization_layoutlm_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm.py b/src/transformers/models/layoutlm/tokenization_layoutlm.py
deleted file mode 100644
index 4caccd691d0e..000000000000
--- a/src/transformers/models/layoutlm/tokenization_layoutlm.py
+++ /dev/null
@@ -1,483 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization class for model LayoutLM."""
-
-import collections
-import os
-import unicodedata
-from typing import Optional
-
-from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
-from ...utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
-
-# Copied from transformers.models.bert.tokenization_bert.load_vocab
-def load_vocab(vocab_file):
-    """Loads a vocabulary file into a dictionary."""
-    vocab = collections.OrderedDict()
-    with open(vocab_file, "r", encoding="utf-8") as reader:
-        tokens = reader.readlines()
-    for index, token in enumerate(tokens):
-        token = token.rstrip("\n")
-        vocab[token] = index
-    return vocab
-
-
-# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
-def whitespace_tokenize(text):
-    """Runs basic whitespace cleaning and splitting on a piece of text."""
-    text = text.strip()
-    if not text:
-        return []
-    tokens = text.split()
-    return tokens
-
-
-# Copied from transformers.models.bert.tokenization_bert.BertTokenizer with Bert->LayoutLM,BERT->LayoutLM
-class LayoutLMTokenizer(PreTrainedTokenizer):
-    r"""
-    Construct a LayoutLM tokenizer. Based on WordPiece.
-
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
-            Whether or not to do basic tokenization before WordPiece.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
-        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original LayoutLM).
-        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
-            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
-            extra spaces.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-
-    def __init__(
-        self,
-        vocab_file,
-        do_lower_case=True,
-        do_basic_tokenize=True,
-        never_split=None,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        clean_up_tokenization_spaces=True,
-        **kwargs,
-    ):
-        if not os.path.isfile(vocab_file):
-            raise ValueError(
-                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
-                " model use `tokenizer = LayoutLMTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
-            )
-        self.vocab = load_vocab(vocab_file)
-        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
-        self.do_basic_tokenize = do_basic_tokenize
-        if do_basic_tokenize:
-            self.basic_tokenizer = BasicTokenizer(
-                do_lower_case=do_lower_case,
-                never_split=never_split,
-                tokenize_chinese_chars=tokenize_chinese_chars,
-                strip_accents=strip_accents,
-            )
-
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
-
-        super().__init__(
-            do_lower_case=do_lower_case,
-            do_basic_tokenize=do_basic_tokenize,
-            never_split=never_split,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            tokenize_chinese_chars=tokenize_chinese_chars,
-            strip_accents=strip_accents,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            **kwargs,
-        )
-
-    @property
-    def do_lower_case(self):
-        return self.basic_tokenizer.do_lower_case
-
-    @property
-    def vocab_size(self):
-        return len(self.vocab)
-
-    def get_vocab(self):
-        return dict(self.vocab, **self.added_tokens_encoder)
-
-    def _tokenize(self, text, split_special_tokens=False):
-        split_tokens = []
-        if self.do_basic_tokenize:
-            for token in self.basic_tokenizer.tokenize(
-                text, never_split=self.all_special_tokens if not split_special_tokens else None
-            ):
-                # If the token is part of the never_split set
-                if token in self.basic_tokenizer.never_split:
-                    split_tokens.append(token)
-                else:
-                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
-        else:
-            split_tokens = self.wordpiece_tokenizer.tokenize(text)
-        return split_tokens
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.vocab.get(token, self.vocab.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.ids_to_tokens.get(index, self.unk_token)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        out_string = " ".join(tokens).replace(" ##", "").strip()
-        return out_string
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A LayoutLM sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        index = 0
-        if os.path.isdir(save_directory):
-            vocab_file = os.path.join(
-                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-            )
-        else:
-            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
-        with open(vocab_file, "w", encoding="utf-8") as writer:
-            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
-                        " Please check that the vocabulary is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(token + "\n")
-                index += 1
-        return (vocab_file,)
-
-
-# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
-class BasicTokenizer:
-    """
-    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
-
-    Args:
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original BERT).
-        do_split_on_punc (`bool`, *optional*, defaults to `True`):
-            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
-            the full context of the words, such as contractions.
-    """
-
-    def __init__(
-        self,
-        do_lower_case=True,
-        never_split=None,
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        do_split_on_punc=True,
-    ):
-        if never_split is None:
-            never_split = []
-        self.do_lower_case = do_lower_case
-        self.never_split = set(never_split)
-        self.tokenize_chinese_chars = tokenize_chinese_chars
-        self.strip_accents = strip_accents
-        self.do_split_on_punc = do_split_on_punc
-
-    def tokenize(self, text, never_split=None):
-        """
-        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
-
-        Args:
-            never_split (`List[str]`, *optional*)
-                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
-        """
-        # union() returns a new set by concatenating the two sets.
-        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
-        text = self._clean_text(text)
-
-        # This was added on November 1st, 2018 for the multilingual and Chinese
-        # models. This is also applied to the English models now, but it doesn't
-        # matter since the English models were not trained on any Chinese data
-        # and generally don't have any Chinese data in them (there are Chinese
-        # characters in the vocabulary because Wikipedia does have some Chinese
-        # words in the English Wikipedia.).
-        if self.tokenize_chinese_chars:
-            text = self._tokenize_chinese_chars(text)
-        # prevents treating the same character with different unicode codepoints as different characters
-        unicode_normalized_text = unicodedata.normalize("NFC", text)
-        orig_tokens = whitespace_tokenize(unicode_normalized_text)
-        split_tokens = []
-        for token in orig_tokens:
-            if token not in never_split:
-                if self.do_lower_case:
-                    token = token.lower()
-                    if self.strip_accents is not False:
-                        token = self._run_strip_accents(token)
-                elif self.strip_accents:
-                    token = self._run_strip_accents(token)
-            split_tokens.extend(self._run_split_on_punc(token, never_split))
-
-        output_tokens = whitespace_tokenize(" ".join(split_tokens))
-        return output_tokens
-
-    def _run_strip_accents(self, text):
-        """Strips accents from a piece of text."""
-        text = unicodedata.normalize("NFD", text)
-        output = []
-        for char in text:
-            cat = unicodedata.category(char)
-            if cat == "Mn":
-                continue
-            output.append(char)
-        return "".join(output)
-
-    def _run_split_on_punc(self, text, never_split=None):
-        """Splits punctuation on a piece of text."""
-        if not self.do_split_on_punc or (never_split is not None and text in never_split):
-            return [text]
-        chars = list(text)
-        i = 0
-        start_new_word = True
-        output = []
-        while i < len(chars):
-            char = chars[i]
-            if _is_punctuation(char):
-                output.append([char])
-                start_new_word = True
-            else:
-                if start_new_word:
-                    output.append([])
-                start_new_word = False
-                output[-1].append(char)
-            i += 1
-
-        return ["".join(x) for x in output]
-
-    def _tokenize_chinese_chars(self, text):
-        """Adds whitespace around any CJK character."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if self._is_chinese_char(cp):
-                output.append(" ")
-                output.append(char)
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-    def _is_chinese_char(self, cp):
-        """Checks whether CP is the codepoint of a CJK character."""
-        # This defines a "chinese character" as anything in the CJK Unicode block:
-        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-        #
-        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-        # despite its name. The modern Korean Hangul alphabet is a different block,
-        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-        # space-separated words, so they are not treated specially and handled
-        # like the all of the other languages.
-        if (
-            (cp >= 0x4E00 and cp <= 0x9FFF)
-            or (cp >= 0x3400 and cp <= 0x4DBF)
-            or (cp >= 0x20000 and cp <= 0x2A6DF)
-            or (cp >= 0x2A700 and cp <= 0x2B73F)
-            or (cp >= 0x2B740 and cp <= 0x2B81F)
-            or (cp >= 0x2B820 and cp <= 0x2CEAF)
-            or (cp >= 0xF900 and cp <= 0xFAFF)
-            or (cp >= 0x2F800 and cp <= 0x2FA1F)
-        ):
-            return True
-
-        return False
-
-    def _clean_text(self, text):
-        """Performs invalid character removal and whitespace cleanup on text."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if cp == 0 or cp == 0xFFFD or _is_control(char):
-                continue
-            if _is_whitespace(char):
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-
-# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
-class WordpieceTokenizer:
-    """Runs WordPiece tokenization."""
-
-    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
-        self.vocab = vocab
-        self.unk_token = unk_token
-        self.max_input_chars_per_word = max_input_chars_per_word
-
-    def tokenize(self, text):
-        """
-        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
-        tokenization using the given vocabulary.
-
-        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
-
-        Args:
-            text: A single token or whitespace separated tokens. This should have
-                already been passed through *BasicTokenizer*.
-
-        Returns:
-            A list of wordpiece tokens.
-        """
-
-        output_tokens = []
-        for token in whitespace_tokenize(text):
-            chars = list(token)
-            if len(chars) > self.max_input_chars_per_word:
-                output_tokens.append(self.unk_token)
-                continue
-
-            is_bad = False
-            start = 0
-            sub_tokens = []
-            while start < len(chars):
-                end = len(chars)
-                cur_substr = None
-                while start < end:
-                    substr = "".join(chars[start:end])
-                    if start > 0:
-                        substr = "##" + substr
-                    if substr in self.vocab:
-                        cur_substr = substr
-                        break
-                    end -= 1
-                if cur_substr is None:
-                    is_bad = True
-                    break
-                sub_tokens.append(cur_substr)
-                start = end
-
-            if is_bad:
-                output_tokens.append(self.unk_token)
-            else:
-                output_tokens.extend(sub_tokens)
-        return output_tokens
-
-
-__all__ = ["LayoutLMTokenizer"]
diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py b/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py
deleted file mode 100644
index c7ade6e0b8cd..000000000000
--- a/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization class for model LayoutLM."""
-
-import json
-from typing import Optional
-
-from tokenizers import normalizers
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .tokenization_layoutlm import LayoutLMTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
-
-
-# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast with Bert->LayoutLM,BERT->LayoutLM
-class LayoutLMTokenizerFast(PreTrainedTokenizerFast):
-    r"""
-    Construct a "fast" LayoutLM tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        clean_text (`bool`, *optional*, defaults to `True`):
-            Whether or not to clean the text before tokenization by removing any control characters and replacing all
-            whitespaces by the classic one.
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
-            issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original LayoutLM).
-        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
-            The prefix for subwords.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = LayoutLMTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        do_lower_case=True,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            do_lower_case=do_lower_case,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            tokenize_chinese_chars=tokenize_chinese_chars,
-            strip_accents=strip_accents,
-            **kwargs,
-        )
-
-        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
-        if (
-            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
-            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
-            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
-        ):
-            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
-            normalizer_state["lowercase"] = do_lower_case
-            normalizer_state["strip_accents"] = strip_accents
-            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
-            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
-
-        self.do_lower_case = do_lower_case
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A LayoutLM sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-
-        if token_ids_1 is not None:
-            output += token_ids_1 + [self.sep_token_id]
-
-        return output
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-
-__all__ = ["LayoutLMTokenizerFast"]
diff --git a/src/transformers/models/layoutlmv2/__init__.py b/src/transformers/models/layoutlmv2/__init__.py
index b68a523c0b0c..b22c0ee86d91 100644
--- a/src/transformers/models/layoutlmv2/__init__.py
+++ b/src/transformers/models/layoutlmv2/__init__.py
@@ -25,7 +25,6 @@
     from .modeling_layoutlmv2 import *
     from .processing_layoutlmv2 import *
     from .tokenization_layoutlmv2 import *
-    from .tokenization_layoutlmv2_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
index a4c04598d855..bdf0bed1ed3f 100644
--- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+# Copyright 2021 The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,31 +12,33 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization class for LayoutLMv2."""
+"""
+Tokenization class for LayoutLMv2. Based on WordPiece.
+"""
 
-import collections
-import os
-import sys
-import unicodedata
 from typing import Optional, Union
 
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors
+
 from ...tokenization_utils_base import (
     BatchEncoding,
     EncodedInput,
+    PaddingStrategy,
     PreTokenizedInput,
+    TensorType,
     TextInput,
     TextInputPair,
     TruncationStrategy,
 )
-from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging
+from ...tokenization_utils_tokenizers import TokenizersBackend
+from ...utils import add_end_docstrings, logging
 
 
 logger = logging.get_logger(__name__)
 
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
 
+# Docstring constants for encode methods
 LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING = r"""
             add_special_tokens (`bool`, *optional*, defaults to `True`):
                 Whether or not to encode the sequences with the special tokens relative to their model.
@@ -78,11 +80,6 @@
             pad_to_multiple_of (`int`, *optional*):
                 If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                 the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
-            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
-                If set, will return tensors instead of list of python integers. Acceptable values are:
-
-                - `'pt'`: Return PyTorch `torch.Tensor` objects.
-                - `'np'`: Return Numpy `np.ndarray` objects.
 """
 
 LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
@@ -112,95 +109,73 @@
             verbose (`bool`, *optional*, defaults to `True`):
                 Whether or not to print more information and warnings.
             **kwargs: passed to the `self.tokenize()` method
-
-        Return:
-            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
-
-            - **input_ids** -- List of token ids to be fed to a model.
-
-              [What are input IDs?](../glossary#input-ids)
-
-            - **bbox** -- List of bounding boxes to be fed to a model.
-
-            - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True` or
-              if *"token_type_ids"* is in `self.model_input_names`).
-
-              [What are token type IDs?](../glossary#token-type-ids)
-
-            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
-              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
-
-              [What are attention masks?](../glossary#attention-mask)
-
-            - **labels** -- List of labels to be fed to a model. (when `word_labels` is specified).
-            - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
-              `return_overflowing_tokens=True`).
-            - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
-              `return_overflowing_tokens=True`).
-            - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
-              regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
-            - **length** -- The length of the inputs (when `return_length=True`).
 """
 
 
-def load_vocab(vocab_file):
-    """Loads a vocabulary file into a dictionary."""
-    vocab = collections.OrderedDict()
-    with open(vocab_file, "r", encoding="utf-8") as reader:
-        tokens = reader.readlines()
-    for index, token in enumerate(tokens):
-        token = token.rstrip("\n")
-        vocab[token] = index
-    return vocab
-
-
-def whitespace_tokenize(text):
-    """Runs basic whitespace cleaning and splitting on a piece of text."""
-    text = text.strip()
-    if not text:
-        return []
-    tokens = text.split()
-    return tokens
-
-
-table = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P"))
-
-
-def subfinder(mylist, pattern):
-    matches = []
-    indices = []
-    for idx, i in enumerate(range(len(mylist))):
-        if mylist[i] == pattern[0] and mylist[i : i + len(pattern)] == pattern:
-            matches.append(pattern)
-            indices.append(idx)
-    if matches:
-        return matches[0], indices[0]
-    else:
-        return None, 0
-
-
-class LayoutLMv2Tokenizer(PreTrainedTokenizer):
+class LayoutLMv2Tokenizer(TokenizersBackend):
     r"""
-    Construct a LayoutLMv2 tokenizer. Based on WordPiece. [`LayoutLMv2Tokenizer`] can be used to turn words, word-level
-    bounding boxes and optional word labels to token-level `input_ids`, `attention_mask`, `token_type_ids`, `bbox`, and
-    optional `labels` (for token classification).
-
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
+    Construct a "fast" LayoutLMv2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
 
-    [`LayoutLMv2Tokenizer`] runs end-to-end tokenization: punctuation splitting and wordpiece. It also turns the
-    word-level bounding boxes into token-level bounding boxes.
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        cls_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
+            The bounding box to use for the special [CLS] token.
+        sep_token_box (`List[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`):
+            The bounding box to use for the special [SEP] token.
+        pad_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
+            The bounding box to use for the special [PAD] token.
+        pad_token_label (`int`, *optional*, defaults to -100):
+            The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
+            CrossEntropyLoss.
+        only_label_first_subword (`bool`, *optional*, defaults to `True`):
+            Whether or not to only label the first subword, in case word labels are provided.
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+            issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `lowercase` (as in the original LayoutLMv2).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = None
+
+    @staticmethod
+    def _load_vocab_from_file(vocab_file):
+        """Load vocab from a BERT-style vocab file (one token per line)."""
+        vocab = {}
+        with open(vocab_file, "r", encoding="utf-8") as reader:
+            for index, line in enumerate(reader):
+                token = line.rstrip("\n")
+                vocab[token] = index
+        return vocab
 
     def __init__(
         self,
-        vocab_file,
+        vocab=None,
+        vocab_file=None,
         do_lower_case=True,
-        do_basic_tokenize=True,
-        never_split=None,
         unk_token="[UNK]",
         sep_token="[SEP]",
         pad_token="[PAD]",
@@ -213,32 +188,52 @@ def __init__(
         only_label_first_subword=True,
         tokenize_chinese_chars=True,
         strip_accents=None,
-        model_max_length: int = 512,
-        additional_special_tokens: Optional[list[str]] = None,
         **kwargs,
     ):
-        sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
-        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
-        cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
-        mask_token = AddedToken(mask_token, special=True) if isinstance(mask_token, str) else mask_token
-
-        if not os.path.isfile(vocab_file):
-            raise ValueError(
-                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
-                " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
-            )
-        self.vocab = load_vocab(vocab_file)
-        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
-        self.do_basic_tokenize = do_basic_tokenize
-        if do_basic_tokenize:
-            self.basic_tokenizer = BasicTokenizer(
-                do_lower_case=do_lower_case,
-                never_split=never_split,
-                tokenize_chinese_chars=tokenize_chinese_chars,
-                strip_accents=strip_accents,
-            )
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+        self.vocab_file = vocab_file
+        self.do_lower_case = do_lower_case
+
+        # Build vocab for WordPiece
+        if vocab is not None:
+            if isinstance(vocab, dict):
+                _vocab = vocab
+            else:
+                raise ValueError("vocab must be a dict mapping tokens to ids")
+        elif vocab_file is not None:
+            # Load vocab from file (BERT format: one token per line)
+            _vocab = self._load_vocab_from_file(vocab_file)
+        else:
+            # Initialize with at least the special tokens for WordPiece
+            _vocab = {
+                str(pad_token): 0,
+                str(unk_token): 1,
+                str(cls_token): 2,
+                str(sep_token): 3,
+                str(mask_token): 4,
+            }
+
+        # Initialize WordPiece tokenizer
+        self._tokenizer = Tokenizer(models.WordPiece(vocab=_vocab, unk_token=str(unk_token)))
+
+        # Set normalizer
+        self._tokenizer.normalizer = normalizers.BertNormalizer(
+            clean_text=True,
+            handle_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            lowercase=do_lower_case,
+        )
+
+        # Set pre_tokenizer
+        self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
+
+        # Set decoder
+        self._tokenizer.decoder = decoders.WordPiece(prefix="##")
+
+        # Set post_processor (will be set after super().__init__ when we have token IDs)
+        # Temporarily set to None, will be configured after parent init
+        self._tokenizer.post_processor = None
+
+        tokenizer_object = self._tokenizer
 
         # additional properties
         self.cls_token_box = cls_token_box
@@ -246,10 +241,10 @@ def __init__(
         self.pad_token_box = pad_token_box
         self.pad_token_label = pad_token_label
         self.only_label_first_subword = only_label_first_subword
+
         super().__init__(
+            tokenizer_object=tokenizer_object,
             do_lower_case=do_lower_case,
-            do_basic_tokenize=do_basic_tokenize,
-            never_split=never_split,
             unk_token=unk_token,
             sep_token=sep_token,
             pad_token=pad_token,
@@ -262,120 +257,30 @@ def __init__(
             only_label_first_subword=only_label_first_subword,
             tokenize_chinese_chars=tokenize_chinese_chars,
             strip_accents=strip_accents,
-            model_max_length=model_max_length,
-            additional_special_tokens=additional_special_tokens,
             **kwargs,
         )
 
-    @property
-    def do_lower_case(self):
-        return self.basic_tokenizer.do_lower_case
-
-    @property
-    def vocab_size(self):
-        return len(self.vocab)
-
-    def get_vocab(self):
-        return dict(self.vocab, **self.added_tokens_encoder)
-
-    def _tokenize(self, text):
-        split_tokens = []
-        if self.do_basic_tokenize:
-            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
-                # If the token is part of the never_split set
-                if token in self.basic_tokenizer.never_split:
-                    split_tokens.append(token)
-                else:
-                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
-        else:
-            split_tokens = self.wordpiece_tokenizer.tokenize(text)
-        return split_tokens
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.vocab.get(token, self.vocab.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.ids_to_tokens.get(index, self.unk_token)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        out_string = " ".join(tokens).replace(" ##", "").strip()
-        return out_string
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A BERT sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
+        # Now set post_processor with actual token IDs
+        cls = str(self.cls_token)
+        sep = str(self.sep_token)
+        cls_token_id = self.cls_token_id
+        sep_token_id = self.sep_token_id
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"{cls}:0 $A:0 {sep}:0",
+            pair=f"{cls}:0 $A:0 {sep}:0 $B:1 {sep}:1",
+            special_tokens=[
+                (cls, cls_token_id),
+                (sep, sep_token_id),
+            ],
+        )
 
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        index = 0
-        if os.path.isdir(save_directory):
-            vocab_file = os.path.join(
-                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-            )
-        else:
-            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
-        with open(vocab_file, "w", encoding="utf-8") as writer:
-            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
-                        " Please check that the vocabulary is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(token + "\n")
-                index += 1
-        return (vocab_file,)
+        # additional properties
+        self.cls_token_box = cls_token_box
+        self.sep_token_box = sep_token_box
+        self.pad_token_box = pad_token_box
+        self.pad_token_label = pad_token_label
+        self.only_label_first_subword = only_label_first_subword
 
     @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def __call__(
@@ -590,185 +495,13 @@ def batch_encode_plus(
             **kwargs,
         )
 
-    def _batch_encode_plus(
-        self,
-        batch_text_or_text_pairs: Union[
-            list[TextInput],
-            list[TextInputPair],
-            list[PreTokenizedInput],
-        ],
-        is_pair: Optional[bool] = None,
-        boxes: Optional[list[list[list[int]]]] = None,
-        word_labels: Optional[list[list[int]]] = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        if return_offsets_mapping:
-            raise NotImplementedError(
-                "return_offset_mapping is not available when using Python tokenizers. "
-                "To use this feature, change your tokenizer to one deriving from "
-                "transformers.PreTrainedTokenizerFast."
-            )
-
-        batch_outputs = self._batch_prepare_for_model(
-            batch_text_or_text_pairs=batch_text_or_text_pairs,
-            is_pair=is_pair,
-            boxes=boxes,
-            word_labels=word_labels,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_attention_mask=return_attention_mask,
-            return_token_type_ids=return_token_type_ids,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_length=return_length,
-            return_tensors=return_tensors,
-            verbose=verbose,
-        )
-
-        return BatchEncoding(batch_outputs)
-
-    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    def _batch_prepare_for_model(
-        self,
-        batch_text_or_text_pairs,
-        is_pair: Optional[bool] = None,
-        boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[list[int]]] = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[str] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-    ) -> BatchEncoding:
-        """
-        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
-        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
-        manages a moving window (with user defined stride) for overflowing tokens.
-
-        Args:
-            batch_ids_pairs: list of tokenized input ids or input ids pairs
-        """
-
-        batch_outputs = {}
-        for idx, example in enumerate(zip(batch_text_or_text_pairs, boxes)):
-            batch_text_or_text_pair, boxes_example = example
-            outputs = self.prepare_for_model(
-                batch_text_or_text_pair[0] if is_pair else batch_text_or_text_pair,
-                batch_text_or_text_pair[1] if is_pair else None,
-                boxes_example,
-                word_labels=word_labels[idx] if word_labels is not None else None,
-                add_special_tokens=add_special_tokens,
-                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
-                truncation=truncation_strategy.value,
-                max_length=max_length,
-                stride=stride,
-                pad_to_multiple_of=None,  # we pad in batch afterward
-                padding_side=None,  # we pad in batch afterward
-                return_attention_mask=False,  # we pad in batch afterward
-                return_token_type_ids=return_token_type_ids,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_length=return_length,
-                return_tensors=None,  # We convert the whole batch to tensors at the end
-                prepend_batch_axis=False,
-                verbose=verbose,
-            )
-
-            for key, value in outputs.items():
-                if key not in batch_outputs:
-                    batch_outputs[key] = []
-                batch_outputs[key].append(value)
-
-        batch_outputs = self.pad(
-            batch_outputs,
-            padding=padding_strategy.value,
-            max_length=max_length,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_attention_mask=return_attention_mask,
-        )
-
-        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
-
-        return batch_outputs
-
-    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING)
-    def encode(
-        self,
-        text: Union[TextInput, PreTokenizedInput],
-        text_pair: Optional[PreTokenizedInput] = None,
-        boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[int]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> list[int]:
-        encoded_inputs = self.encode_plus(
-            text=text,
-            text_pair=text_pair,
-            boxes=boxes,
-            word_labels=word_labels,
-            add_special_tokens=add_special_tokens,
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            **kwargs,
+    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
+        batched_input = [(text, pair)] if pair else [text]
+        encodings = self._tokenizer.encode_batch(
+            batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
         )
 
-        return encoded_inputs["input_ids"]
+        return encodings[0].tokens if encodings else []
 
     @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def encode_plus(
@@ -839,12 +572,16 @@ def encode_plus(
             **kwargs,
         )
 
-    def _encode_plus(
+    def _batch_encode_plus(
         self,
-        text: Union[TextInput, PreTokenizedInput],
-        text_pair: Optional[PreTokenizedInput] = None,
-        boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[int]] = None,
+        batch_text_or_text_pairs: Union[
+            list[TextInput],
+            list[TextInputPair],
+            list[PreTokenizedInput],
+        ],
+        is_pair: Optional[bool] = None,
+        boxes: Optional[list[list[list[int]]]] = None,
+        word_labels: Optional[list[list[int]]] = None,
         add_special_tokens: bool = True,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
@@ -852,7 +589,7 @@ def _encode_plus(
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
         padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
         return_overflowing_tokens: bool = False,
@@ -860,54 +597,153 @@ def _encode_plus(
         return_offsets_mapping: bool = False,
         return_length: bool = False,
         verbose: bool = True,
-        **kwargs,
     ) -> BatchEncoding:
-        if return_offsets_mapping:
-            raise NotImplementedError(
-                "return_offset_mapping is not available when using Python tokenizers. "
-                "To use this feature, change your tokenizer to one deriving from "
-                "transformers.PreTrainedTokenizerFast. "
-                "More information on available tokenizers at "
-                "https://github.com/huggingface/transformers/pull/2674"
-            )
+        if not isinstance(batch_text_or_text_pairs, list):
+            raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")
 
-        return self.prepare_for_model(
-            text=text,
-            text_pair=text_pair,
-            boxes=boxes,
-            word_labels=word_labels,
-            add_special_tokens=add_special_tokens,
-            padding=padding_strategy.value,
-            truncation=truncation_strategy.value,
+        # Set the truncation and padding strategy and restore the initial configuration
+        self.set_truncation_and_padding(
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
             padding_side=padding_side,
-            return_tensors=return_tensors,
-            prepend_batch_axis=True,
-            return_attention_mask=return_attention_mask,
-            return_token_type_ids=return_token_type_ids,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_length=return_length,
-            verbose=verbose,
         )
 
-    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    def prepare_for_model(
+        if is_pair:
+            batch_text_or_text_pairs = [(text.split(), text_pair) for text, text_pair in batch_text_or_text_pairs]
+
+        encodings = self._tokenizer.encode_batch(
+            batch_text_or_text_pairs,
+            add_special_tokens=add_special_tokens,
+            is_pretokenized=True,  # we set this to True as LayoutLMv2 always expects pretokenized inputs
+        )
+
+        # Convert encoding to dict
+        # `Tokens` has type: Tuple[
+        #                       List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]],
+        #                       List[EncodingFast]
+        #                    ]
+        # with nested dimensions corresponding to batch, overflows, sequence length
+        tokens_and_encodings = [
+            self._convert_encoding(
+                encoding=encoding,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=True
+                if word_labels is not None
+                else return_offsets_mapping,  # we use offsets to create the labels
+                return_length=return_length,
+                verbose=verbose,
+            )
+            for encoding in encodings
+        ]
+
+        # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
+        # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
+        # (we say ~ because the number of overflow varies with the example in the batch)
+        #
+        # To match each overflowing sample with the original sample in the batch
+        # we add an overflow_to_sample_mapping array (see below)
+        sanitized_tokens = {}
+        for key in tokens_and_encodings[0][0]:
+            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
+            sanitized_tokens[key] = stack
+        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
+
+        # If returning overflowing tokens, we need to return a mapping
+        # from the batch idx to the original sample
+        if return_overflowing_tokens:
+            overflow_to_sample_mapping = []
+            for i, (toks, _) in enumerate(tokens_and_encodings):
+                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
+            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping
+
+        for input_ids in sanitized_tokens["input_ids"]:
+            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
+
+        # create the token boxes
+        token_boxes = []
+        for batch_index in range(len(sanitized_tokens["input_ids"])):
+            if return_overflowing_tokens:
+                original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
+            else:
+                original_index = batch_index
+            token_boxes_example = []
+            for id, sequence_id, word_id in zip(
+                sanitized_tokens["input_ids"][batch_index],
+                sanitized_encodings[batch_index].sequence_ids,
+                sanitized_encodings[batch_index].word_ids,
+            ):
+                if word_id is not None:
+                    if is_pair and sequence_id == 0:
+                        token_boxes_example.append(self.pad_token_box)
+                    else:
+                        token_boxes_example.append(boxes[original_index][word_id])
+                else:
+                    if id == self.cls_token_id:
+                        token_boxes_example.append(self.cls_token_box)
+                    elif id == self.sep_token_id:
+                        token_boxes_example.append(self.sep_token_box)
+                    elif id == self.pad_token_id:
+                        token_boxes_example.append(self.pad_token_box)
+                    else:
+                        raise ValueError("Id not recognized")
+            token_boxes.append(token_boxes_example)
+
+        sanitized_tokens["bbox"] = token_boxes
+
+        # optionally, create the labels
+        if word_labels is not None:
+            labels = []
+            for batch_index in range(len(sanitized_tokens["input_ids"])):
+                if return_overflowing_tokens:
+                    original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
+                else:
+                    original_index = batch_index
+                labels_example = []
+                for id, offset, word_id in zip(
+                    sanitized_tokens["input_ids"][batch_index],
+                    sanitized_tokens["offset_mapping"][batch_index],
+                    sanitized_encodings[batch_index].word_ids,
+                ):
+                    if word_id is not None:
+                        if self.only_label_first_subword:
+                            if offset[0] == 0:
+                                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
+                                labels_example.append(word_labels[original_index][word_id])
+                            else:
+                                labels_example.append(self.pad_token_label)
+                        else:
+                            labels_example.append(word_labels[original_index][word_id])
+                    else:
+                        labels_example.append(self.pad_token_label)
+                labels.append(labels_example)
+
+            sanitized_tokens["labels"] = labels
+            # finally, remove offsets if the user didn't want them
+            if not return_offsets_mapping:
+                del sanitized_tokens["offset_mapping"]
+
+        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
+
+    def _encode_plus(
         self,
         text: Union[TextInput, PreTokenizedInput],
         text_pair: Optional[PreTokenizedInput] = None,
         boxes: Optional[list[list[int]]] = None,
         word_labels: Optional[list[int]] = None,
         add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
         padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_tensors: Optional[bool] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
         return_overflowing_tokens: bool = False,
@@ -915,321 +751,52 @@ def prepare_for_model(
         return_offsets_mapping: bool = False,
         return_length: bool = False,
         verbose: bool = True,
-        prepend_batch_axis: bool = False,
         **kwargs,
     ) -> BatchEncoding:
-        """
-        Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens,
-        truncates sequences if overflowing while taking into account the special tokens and manages a moving window
-        (with user defined stride) for overflowing tokens. Please Note, for *text_pair* different than `None` and
-        *truncation_strategy = longest_first* or `True`, it is not possible to return overflowing tokens. Such a
-        combination of arguments will raise an error.
-
-        Word-level `boxes` are turned into token-level `bbox`. If provided, word-level `word_labels` are turned into
-        token-level `labels`. The word label is used for the first token of the word, while remaining tokens are
-        labeled with -100, such that they will be ignored by the loss function.
-
-        Args:
-            text (`str`, `List[str]`, `List[List[str]]`):
-                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
-            text_pair (`List[str]` or `List[int]`, *optional*):
-                Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
-                list of list of strings (words of a batch of examples).
-        """
-
-        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
-        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
-            padding=padding,
-            truncation=truncation,
+        # make it a batched input
+        # 2 options:
+        # 1) only text, in case text must be a list of str
+        # 2) text + text_pair, in which case text = str and text_pair a list of str
+        batched_input = [(text, text_pair)] if text_pair else [text]
+        batched_boxes = [boxes]
+        batched_word_labels = [word_labels] if word_labels is not None else None
+        batched_output = self._batch_encode_plus(
+            batched_input,
+            is_pair=bool(text_pair is not None),
+            boxes=batched_boxes,
+            word_labels=batched_word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
             max_length=max_length,
+            stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
             verbose=verbose,
             **kwargs,
         )
 
-        tokens = []
-        pair_tokens = []
-        token_boxes = []
-        pair_token_boxes = []
-        labels = []
-
-        if text_pair is None:
-            if word_labels is None:
-                # CASE 1: document image classification (training + inference) + CASE 2: token classification (inference)
-                for word, box in zip(text, boxes):
-                    if len(word) < 1:  # skip empty words
-                        continue
-                    word_tokens = self.tokenize(word)
-                    tokens.extend(word_tokens)
-                    token_boxes.extend([box] * len(word_tokens))
-            else:
-                # CASE 2: token classification (training)
-                for word, box, label in zip(text, boxes, word_labels):
-                    if len(word) < 1:  # skip empty words
-                        continue
-                    word_tokens = self.tokenize(word)
-                    tokens.extend(word_tokens)
-                    token_boxes.extend([box] * len(word_tokens))
-                    if self.only_label_first_subword:
-                        # Use the real label id for the first token of the word, and padding ids for the remaining tokens
-                        labels.extend([label] + [self.pad_token_label] * (len(word_tokens) - 1))
-                    else:
-                        labels.extend([label] * len(word_tokens))
-        else:
-            # CASE 3: document visual question answering (inference)
-            # text = question
-            # text_pair = words
-            tokens = self.tokenize(text)
-            token_boxes = [self.pad_token_box for _ in range(len(tokens))]
-
-            for word, box in zip(text_pair, boxes):
-                if len(word) < 1:  # skip empty words
-                    continue
-                word_tokens = self.tokenize(word)
-                pair_tokens.extend(word_tokens)
-                pair_token_boxes.extend([box] * len(word_tokens))
-
-        # Create ids + pair_ids
-        ids = self.convert_tokens_to_ids(tokens)
-        pair_ids = self.convert_tokens_to_ids(pair_tokens) if pair_tokens else None
-
-        if (
-            return_overflowing_tokens
-            and truncation_strategy == TruncationStrategy.LONGEST_FIRST
-            and pair_ids is not None
-        ):
-            raise ValueError(
-                "Not possible to return overflowing tokens for pair of sequences with the "
-                "`longest_first`. Please select another truncation strategy than `longest_first`, "
-                "for instance `only_second` or `only_first`."
-            )
-
-        # Compute the total size of the returned encodings
-        pair = bool(pair_ids is not None)
-        len_ids = len(ids)
-        len_pair_ids = len(pair_ids) if pair else 0
-        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)
-
-        # Truncation: Handle max sequence length
-        overflowing_tokens = []
-        overflowing_token_boxes = []
-        overflowing_labels = []
-        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
-            (
-                ids,
-                token_boxes,
-                pair_ids,
-                pair_token_boxes,
-                labels,
-                overflowing_tokens,
-                overflowing_token_boxes,
-                overflowing_labels,
-            ) = self.truncate_sequences(
-                ids,
-                token_boxes,
-                pair_ids=pair_ids,
-                pair_token_boxes=pair_token_boxes,
-                labels=labels,
-                num_tokens_to_remove=total_len - max_length,
-                truncation_strategy=truncation_strategy,
-                stride=stride,
-            )
-
-        if return_token_type_ids and not add_special_tokens:
-            raise ValueError(
-                "Asking to return token_type_ids while setting add_special_tokens to False "
-                "results in an undefined behavior. Please set add_special_tokens to True or "
-                "set return_token_type_ids to None."
-            )
-
-        # Load from model defaults
-        if return_token_type_ids is None:
-            return_token_type_ids = "token_type_ids" in self.model_input_names
-        if return_attention_mask is None:
-            return_attention_mask = "attention_mask" in self.model_input_names
-
-        encoded_inputs = {}
-
-        if return_overflowing_tokens:
-            encoded_inputs["overflowing_tokens"] = overflowing_tokens
-            encoded_inputs["overflowing_token_boxes"] = overflowing_token_boxes
-            encoded_inputs["overflowing_labels"] = overflowing_labels
-            encoded_inputs["num_truncated_tokens"] = total_len - max_length
-
-        # Add special tokens
-        if add_special_tokens:
-            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
-            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
-            token_boxes = [self.cls_token_box] + token_boxes + [self.sep_token_box]
-            if pair_token_boxes:
-                pair_token_boxes = pair_token_boxes + [self.sep_token_box]
-            if labels:
-                labels = [self.pad_token_label] + labels + [self.pad_token_label]
-        else:
-            sequence = ids + pair_ids if pair else ids
-            token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
-
-        # Build output dictionary
-        encoded_inputs["input_ids"] = sequence
-        encoded_inputs["bbox"] = token_boxes + pair_token_boxes
-        if return_token_type_ids:
-            encoded_inputs["token_type_ids"] = token_type_ids
-        if return_special_tokens_mask:
-            if add_special_tokens:
-                encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
-            else:
-                encoded_inputs["special_tokens_mask"] = [0] * len(sequence)
-
-        if labels:
-            encoded_inputs["labels"] = labels
-
-        # Check lengths
-        self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)
-
-        # Padding
-        if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
-            encoded_inputs = self.pad(
-                encoded_inputs,
-                max_length=max_length,
-                padding=padding_strategy.value,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_attention_mask=return_attention_mask,
+        # Return tensor is None, then we can remove the leading batch axis
+        # Overflowing tokens are returned as a batch of output so we keep them in this case
+        if return_tensors is None and not return_overflowing_tokens:
+            batched_output = BatchEncoding(
+                {
+                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
+                    for key, value in batched_output.items()
+                },
+                batched_output.encodings,
             )
 
-        if return_length:
-            encoded_inputs["length"] = len(encoded_inputs["input_ids"])
-
-        batch_outputs = BatchEncoding(
-            encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
-        )
-
-        return batch_outputs
-
-    def truncate_sequences(
-        self,
-        ids: list[int],
-        token_boxes: list[list[int]],
-        pair_ids: Optional[list[int]] = None,
-        pair_token_boxes: Optional[list[list[int]]] = None,
-        labels: Optional[list[int]] = None,
-        num_tokens_to_remove: int = 0,
-        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
-        stride: int = 0,
-    ) -> tuple[list[int], list[int], list[int]]:
-        """
-        Truncates a sequence pair in-place following the strategy.
-
-        Args:
-            ids (`List[int]`):
-                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and
-                `convert_tokens_to_ids` methods.
-            token_boxes (`List[List[int]]`):
-                Bounding boxes of the first sequence.
-            pair_ids (`List[int]`, *optional*):
-                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
-                and `convert_tokens_to_ids` methods.
-            pair_token_boxes (`List[List[int]]`, *optional*):
-                Bounding boxes of the second sequence.
-            labels (`List[int]`, *optional*):
-                Labels of the first sequence (for token classification tasks).
-            num_tokens_to_remove (`int`, *optional*, defaults to 0):
-                Number of tokens to remove using the truncation strategy.
-            truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
-                The strategy to follow for truncation. Can be:
-
-                - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will truncate
-                  token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a
-                  batch of pairs) is provided.
-                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater
-                  than the model maximum admissible input size).
-            stride (`int`, *optional*, defaults to 0):
-                If set to a positive number, the overflowing tokens returned will contain some tokens from the main
-                sequence returned. The value of this argument defines the number of additional tokens.
-
-        Returns:
-            `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of
-            overflowing tokens. Note: The *longest_first* strategy returns empty list of overflowing tokens if a pair
-            of sequences (or a batch of pairs) is provided.
-        """
-        if num_tokens_to_remove <= 0:
-            return ids, token_boxes, pair_ids, pair_token_boxes, labels, [], [], []
-
-        if not isinstance(truncation_strategy, TruncationStrategy):
-            truncation_strategy = TruncationStrategy(truncation_strategy)
-
-        overflowing_tokens = []
-        overflowing_token_boxes = []
-        overflowing_labels = []
-        if truncation_strategy == TruncationStrategy.ONLY_FIRST or (
-            truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None
-        ):
-            if len(ids) > num_tokens_to_remove:
-                window_len = min(len(ids), stride + num_tokens_to_remove)
-                overflowing_tokens = ids[-window_len:]
-                overflowing_token_boxes = token_boxes[-window_len:]
-                overflowing_labels = labels[-window_len:]
-                ids = ids[:-num_tokens_to_remove]
-                token_boxes = token_boxes[:-num_tokens_to_remove]
-                labels = labels[:-num_tokens_to_remove]
-            else:
-                error_msg = (
-                    f"We need to remove {num_tokens_to_remove} to truncate the input "
-                    f"but the first sequence has a length {len(ids)}. "
-                )
-                if truncation_strategy == TruncationStrategy.ONLY_FIRST:
-                    error_msg = (
-                        error_msg + "Please select another truncation strategy than "
-                        f"{truncation_strategy}, for instance 'longest_first' or 'only_second'."
-                    )
-                logger.error(error_msg)
-        elif truncation_strategy == TruncationStrategy.LONGEST_FIRST:
-            logger.warning(
-                "Be aware, overflowing tokens are not returned for the setting you have chosen,"
-                f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' "
-                "truncation strategy. So the returned list will always be empty even if some "
-                "tokens have been removed."
-            )
-            for _ in range(num_tokens_to_remove):
-                if pair_ids is None or len(ids) > len(pair_ids):
-                    ids = ids[:-1]
-                    token_boxes = token_boxes[:-1]
-                    labels = labels[:-1]
-                else:
-                    pair_ids = pair_ids[:-1]
-                    pair_token_boxes = pair_token_boxes[:-1]
-        elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
-            if len(pair_ids) > num_tokens_to_remove:
-                window_len = min(len(pair_ids), stride + num_tokens_to_remove)
-                overflowing_tokens = pair_ids[-window_len:]
-                overflowing_token_boxes = pair_token_boxes[-window_len:]
-                pair_ids = pair_ids[:-num_tokens_to_remove]
-                pair_token_boxes = pair_token_boxes[:-num_tokens_to_remove]
-            else:
-                logger.error(
-                    f"We need to remove {num_tokens_to_remove} to truncate the input "
-                    f"but the second sequence has a length {len(pair_ids)}. "
-                    f"Please select another truncation strategy than {truncation_strategy}, "
-                    "for instance 'longest_first' or 'only_first'."
-                )
+        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
 
-        return (
-            ids,
-            token_boxes,
-            pair_ids,
-            pair_token_boxes,
-            labels,
-            overflowing_tokens,
-            overflowing_token_boxes,
-            overflowing_labels,
-        )
+        return batched_output
 
     def _pad(
         self,
@@ -1320,225 +887,32 @@ def _pad(
 
         return encoded_inputs
 
-
-# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
-class BasicTokenizer:
-    """
-    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
-
-    Args:
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original BERT).
-        do_split_on_punc (`bool`, *optional*, defaults to `True`):
-            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
-            the full context of the words, such as contractions.
-    """
-
-    def __init__(
-        self,
-        do_lower_case=True,
-        never_split=None,
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        do_split_on_punc=True,
-    ):
-        if never_split is None:
-            never_split = []
-        self.do_lower_case = do_lower_case
-        self.never_split = set(never_split)
-        self.tokenize_chinese_chars = tokenize_chinese_chars
-        self.strip_accents = strip_accents
-        self.do_split_on_punc = do_split_on_punc
-
-    def tokenize(self, text, never_split=None):
-        """
-        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
-
-        Args:
-            never_split (`List[str]`, *optional*)
-                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
-        """
-        # union() returns a new set by concatenating the two sets.
-        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
-        text = self._clean_text(text)
-
-        # This was added on November 1st, 2018 for the multilingual and Chinese
-        # models. This is also applied to the English models now, but it doesn't
-        # matter since the English models were not trained on any Chinese data
-        # and generally don't have any Chinese data in them (there are Chinese
-        # characters in the vocabulary because Wikipedia does have some Chinese
-        # words in the English Wikipedia.).
-        if self.tokenize_chinese_chars:
-            text = self._tokenize_chinese_chars(text)
-        # prevents treating the same character with different unicode codepoints as different characters
-        unicode_normalized_text = unicodedata.normalize("NFC", text)
-        orig_tokens = whitespace_tokenize(unicode_normalized_text)
-        split_tokens = []
-        for token in orig_tokens:
-            if token not in never_split:
-                if self.do_lower_case:
-                    token = token.lower()
-                    if self.strip_accents is not False:
-                        token = self._run_strip_accents(token)
-                elif self.strip_accents:
-                    token = self._run_strip_accents(token)
-            split_tokens.extend(self._run_split_on_punc(token, never_split))
-
-        output_tokens = whitespace_tokenize(" ".join(split_tokens))
-        return output_tokens
-
-    def _run_strip_accents(self, text):
-        """Strips accents from a piece of text."""
-        text = unicodedata.normalize("NFD", text)
-        output = []
-        for char in text:
-            cat = unicodedata.category(char)
-            if cat == "Mn":
-                continue
-            output.append(char)
-        return "".join(output)
-
-    def _run_split_on_punc(self, text, never_split=None):
-        """Splits punctuation on a piece of text."""
-        if not self.do_split_on_punc or (never_split is not None and text in never_split):
-            return [text]
-        chars = list(text)
-        i = 0
-        start_new_word = True
-        output = []
-        while i < len(chars):
-            char = chars[i]
-            if _is_punctuation(char):
-                output.append([char])
-                start_new_word = True
-            else:
-                if start_new_word:
-                    output.append([])
-                start_new_word = False
-                output[-1].append(char)
-            i += 1
-
-        return ["".join(x) for x in output]
-
-    def _tokenize_chinese_chars(self, text):
-        """Adds whitespace around any CJK character."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if self._is_chinese_char(cp):
-                output.append(" ")
-                output.append(char)
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-    def _is_chinese_char(self, cp):
-        """Checks whether CP is the codepoint of a CJK character."""
-        # This defines a "chinese character" as anything in the CJK Unicode block:
-        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-        #
-        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-        # despite its name. The modern Korean Hangul alphabet is a different block,
-        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-        # space-separated words, so they are not treated specially and handled
-        # like the all of the other languages.
-        if (
-            (cp >= 0x4E00 and cp <= 0x9FFF)
-            or (cp >= 0x3400 and cp <= 0x4DBF)
-            or (cp >= 0x20000 and cp <= 0x2A6DF)
-            or (cp >= 0x2A700 and cp <= 0x2B73F)
-            or (cp >= 0x2B740 and cp <= 0x2B81F)
-            or (cp >= 0x2B820 and cp <= 0x2CEAF)
-            or (cp >= 0xF900 and cp <= 0xFAFF)
-            or (cp >= 0x2F800 and cp <= 0x2FA1F)
-        ):
-            return True
-
-        return False
-
-    def _clean_text(self, text):
-        """Performs invalid character removal and whitespace cleanup on text."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if cp == 0 or cp == 0xFFFD or _is_control(char):
-                continue
-            if _is_whitespace(char):
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-
-# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
-class WordpieceTokenizer:
-    """Runs WordPiece tokenization."""
-
-    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
-        self.vocab = vocab
-        self.unk_token = unk_token
-        self.max_input_chars_per_word = max_input_chars_per_word
-
-    def tokenize(self, text):
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
-        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
-        tokenization using the given vocabulary.
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A BERT sequence has the following format:
 
-        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
         Args:
-            text: A single token or whitespace separated tokens. This should have
-                already been passed through *BasicTokenizer*.
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
 
         Returns:
-            A list of wordpiece tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
+        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
-        output_tokens = []
-        for token in whitespace_tokenize(text):
-            chars = list(token)
-            if len(chars) > self.max_input_chars_per_word:
-                output_tokens.append(self.unk_token)
-                continue
-
-            is_bad = False
-            start = 0
-            sub_tokens = []
-            while start < len(chars):
-                end = len(chars)
-                cur_substr = None
-                while start < end:
-                    substr = "".join(chars[start:end])
-                    if start > 0:
-                        substr = "##" + substr
-                    if substr in self.vocab:
-                        cur_substr = substr
-                        break
-                    end -= 1
-                if cur_substr is None:
-                    is_bad = True
-                    break
-                sub_tokens.append(cur_substr)
-                start = end
-
-            if is_bad:
-                output_tokens.append(self.unk_token)
-            else:
-                output_tokens.extend(sub_tokens)
-        return output_tokens
+        if token_ids_1:
+            output += token_ids_1 + [self.sep_token_id]
+
+        return output
 
 
 __all__ = ["LayoutLMv2Tokenizer"]
+
+# Backward alias
+LayoutLMv2TokenizerFast = LayoutLMv2Tokenizer
diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
deleted file mode 100644
index 8e324ee0b8fe..000000000000
--- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
+++ /dev/null
@@ -1,789 +0,0 @@
-# coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Fast tokenization class for LayoutLMv2. It overwrites 2 methods of the slow tokenizer class, namely _batch_encode_plus
-and _encode_plus, in which the Rust tokenizer is used.
-"""
-
-import json
-from typing import Optional, Union
-
-from tokenizers import normalizers
-
-from ...tokenization_utils_base import (
-    BatchEncoding,
-    EncodedInput,
-    PaddingStrategy,
-    PreTokenizedInput,
-    TensorType,
-    TextInput,
-    TextInputPair,
-    TruncationStrategy,
-)
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import add_end_docstrings, logging
-from .tokenization_layoutlmv2 import (
-    LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING,
-    LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
-    LayoutLMv2Tokenizer,
-)
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
-
-
-class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
-    r"""
-    Construct a "fast" LayoutLMv2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        cls_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
-            The bounding box to use for the special [CLS] token.
-        sep_token_box (`List[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`):
-            The bounding box to use for the special [SEP] token.
-        pad_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
-            The bounding box to use for the special [PAD] token.
-        pad_token_label (`int`, *optional*, defaults to -100):
-            The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
-            CrossEntropyLoss.
-        only_label_first_subword (`bool`, *optional*, defaults to `True`):
-            Whether or not to only label the first subword, in case word labels are provided.
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
-            issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original LayoutLMv2).
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = LayoutLMv2Tokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        do_lower_case=True,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        cls_token_box=[0, 0, 0, 0],
-        sep_token_box=[1000, 1000, 1000, 1000],
-        pad_token_box=[0, 0, 0, 0],
-        pad_token_label=-100,
-        only_label_first_subword=True,
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            do_lower_case=do_lower_case,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            cls_token_box=cls_token_box,
-            sep_token_box=sep_token_box,
-            pad_token_box=pad_token_box,
-            pad_token_label=pad_token_label,
-            only_label_first_subword=only_label_first_subword,
-            tokenize_chinese_chars=tokenize_chinese_chars,
-            strip_accents=strip_accents,
-            **kwargs,
-        )
-
-        pre_tok_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
-        if (
-            pre_tok_state.get("lowercase", do_lower_case) != do_lower_case
-            or pre_tok_state.get("strip_accents", strip_accents) != strip_accents
-        ):
-            pre_tok_class = getattr(normalizers, pre_tok_state.pop("type"))
-            pre_tok_state["lowercase"] = do_lower_case
-            pre_tok_state["strip_accents"] = strip_accents
-            self.backend_tokenizer.normalizer = pre_tok_class(**pre_tok_state)
-
-        self.do_lower_case = do_lower_case
-
-        # additional properties
-        self.cls_token_box = cls_token_box
-        self.sep_token_box = sep_token_box
-        self.pad_token_box = pad_token_box
-        self.pad_token_label = pad_token_label
-        self.only_label_first_subword = only_label_first_subword
-
-    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    def __call__(
-        self,
-        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
-        text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
-        boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
-        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        """
-        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
-        sequences with word-level normalized bounding boxes and optional labels.
-
-        Args:
-            text (`str`, `List[str]`, `List[List[str]]`):
-                The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
-                (words of a single example or questions of a batch of examples) or a list of list of strings (batch of
-                words).
-            text_pair (`List[str]`, `List[List[str]]`):
-                The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
-                (pretokenized string).
-            boxes (`List[List[int]]`, `List[List[List[int]]]`):
-                Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
-            word_labels (`List[int]`, `List[List[int]]`, *optional*):
-                Word-level integer labels (for token classification tasks such as FUNSD, CORD).
-        """
-
-        # Input type checking for clearer error
-        def _is_valid_text_input(t):
-            if isinstance(t, str):
-                # Strings are fine
-                return True
-            elif isinstance(t, (list, tuple)):
-                # List are fine as long as they are...
-                if len(t) == 0:
-                    # ... empty
-                    return True
-                elif isinstance(t[0], str):
-                    # ... list of strings
-                    return True
-                elif isinstance(t[0], (list, tuple)):
-                    # ... list with an empty list or with a list of strings
-                    return len(t[0]) == 0 or isinstance(t[0][0], str)
-                else:
-                    return False
-            else:
-                return False
-
-        if text_pair is not None:
-            # in case text + text_pair are provided, text = questions, text_pair = words
-            if not _is_valid_text_input(text):
-                raise ValueError("text input must of type `str` (single example) or `List[str]` (batch of examples). ")
-            if not isinstance(text_pair, (list, tuple)):
-                raise ValueError(
-                    "Words must be of type `List[str]` (single pretokenized example), "
-                    "or `List[List[str]]` (batch of pretokenized examples)."
-                )
-        else:
-            # in case only text is provided => must be words
-            if not isinstance(text, (list, tuple)):
-                raise ValueError(
-                    "Words must be of type `List[str]` (single pretokenized example), "
-                    "or `List[List[str]]` (batch of pretokenized examples)."
-                )
-
-        if text_pair is not None:
-            is_batched = isinstance(text, (list, tuple))
-        else:
-            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
-
-        words = text if text_pair is None else text_pair
-        if boxes is None:
-            raise ValueError("You must provide corresponding bounding boxes")
-        if is_batched:
-            if len(words) != len(boxes):
-                raise ValueError("You must provide words and boxes for an equal amount of examples")
-            for words_example, boxes_example in zip(words, boxes):
-                if len(words_example) != len(boxes_example):
-                    raise ValueError("You must provide as many words as there are bounding boxes")
-        else:
-            if len(words) != len(boxes):
-                raise ValueError("You must provide as many words as there are bounding boxes")
-
-        if is_batched:
-            if text_pair is not None and len(text) != len(text_pair):
-                raise ValueError(
-                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
-                    f" {len(text_pair)}."
-                )
-            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
-            is_pair = bool(text_pair is not None)
-            return self.batch_encode_plus(
-                batch_text_or_text_pairs=batch_text_or_text_pairs,
-                is_pair=is_pair,
-                boxes=boxes,
-                word_labels=word_labels,
-                add_special_tokens=add_special_tokens,
-                padding=padding,
-                truncation=truncation,
-                max_length=max_length,
-                stride=stride,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_tensors=return_tensors,
-                return_token_type_ids=return_token_type_ids,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_offsets_mapping=return_offsets_mapping,
-                return_length=return_length,
-                verbose=verbose,
-                **kwargs,
-            )
-        else:
-            return self.encode_plus(
-                text=text,
-                text_pair=text_pair,
-                boxes=boxes,
-                word_labels=word_labels,
-                add_special_tokens=add_special_tokens,
-                padding=padding,
-                truncation=truncation,
-                max_length=max_length,
-                stride=stride,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_tensors=return_tensors,
-                return_token_type_ids=return_token_type_ids,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_offsets_mapping=return_offsets_mapping,
-                return_length=return_length,
-                verbose=verbose,
-                **kwargs,
-            )
-
-    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    def batch_encode_plus(
-        self,
-        batch_text_or_text_pairs: Union[
-            list[TextInput],
-            list[TextInputPair],
-            list[PreTokenizedInput],
-        ],
-        is_pair: Optional[bool] = None,
-        boxes: Optional[list[list[list[int]]]] = None,
-        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
-        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            pad_to_multiple_of=pad_to_multiple_of,
-            verbose=verbose,
-            **kwargs,
-        )
-
-        return self._batch_encode_plus(
-            batch_text_or_text_pairs=batch_text_or_text_pairs,
-            is_pair=is_pair,
-            boxes=boxes,
-            word_labels=word_labels,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            **kwargs,
-        )
-
-    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
-        batched_input = [(text, pair)] if pair else [text]
-        encodings = self._tokenizer.encode_batch(
-            batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
-        )
-
-        return encodings[0].tokens
-
-    @add_end_docstrings(LAYOUTLMV2_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    def encode_plus(
-        self,
-        text: Union[TextInput, PreTokenizedInput],
-        text_pair: Optional[PreTokenizedInput] = None,
-        boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[int]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        """
-        Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
-        `__call__` should be used instead.
-
-        Args:
-            text (`str`, `List[str]`, `List[List[str]]`):
-                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
-            text_pair (`List[str]` or `List[int]`, *optional*):
-                Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
-                list of list of strings (words of a batch of examples).
-        """
-
-        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
-        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            pad_to_multiple_of=pad_to_multiple_of,
-            verbose=verbose,
-            **kwargs,
-        )
-
-        return self._encode_plus(
-            text=text,
-            boxes=boxes,
-            text_pair=text_pair,
-            word_labels=word_labels,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            **kwargs,
-        )
-
-    def _batch_encode_plus(
-        self,
-        batch_text_or_text_pairs: Union[
-            list[TextInput],
-            list[TextInputPair],
-            list[PreTokenizedInput],
-        ],
-        is_pair: Optional[bool] = None,
-        boxes: Optional[list[list[list[int]]]] = None,
-        word_labels: Optional[list[list[int]]] = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[str] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-    ) -> BatchEncoding:
-        if not isinstance(batch_text_or_text_pairs, list):
-            raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")
-
-        # Set the truncation and padding strategy and restore the initial configuration
-        self.set_truncation_and_padding(
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-        )
-
-        if is_pair:
-            batch_text_or_text_pairs = [(text.split(), text_pair) for text, text_pair in batch_text_or_text_pairs]
-
-        encodings = self._tokenizer.encode_batch(
-            batch_text_or_text_pairs,
-            add_special_tokens=add_special_tokens,
-            is_pretokenized=True,  # we set this to True as LayoutLMv2 always expects pretokenized inputs
-        )
-
-        # Convert encoding to dict
-        # `Tokens` has type: Tuple[
-        #                       List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]],
-        #                       List[EncodingFast]
-        #                    ]
-        # with nested dimensions corresponding to batch, overflows, sequence length
-        tokens_and_encodings = [
-            self._convert_encoding(
-                encoding=encoding,
-                return_token_type_ids=return_token_type_ids,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_offsets_mapping=True
-                if word_labels is not None
-                else return_offsets_mapping,  # we use offsets to create the labels
-                return_length=return_length,
-                verbose=verbose,
-            )
-            for encoding in encodings
-        ]
-
-        # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
-        # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
-        # (we say ~ because the number of overflow varies with the example in the batch)
-        #
-        # To match each overflowing sample with the original sample in the batch
-        # we add an overflow_to_sample_mapping array (see below)
-        sanitized_tokens = {}
-        for key in tokens_and_encodings[0][0]:
-            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
-            sanitized_tokens[key] = stack
-        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
-
-        # If returning overflowing tokens, we need to return a mapping
-        # from the batch idx to the original sample
-        if return_overflowing_tokens:
-            overflow_to_sample_mapping = []
-            for i, (toks, _) in enumerate(tokens_and_encodings):
-                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
-            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping
-
-        for input_ids in sanitized_tokens["input_ids"]:
-            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
-
-        # create the token boxes
-        token_boxes = []
-        for batch_index in range(len(sanitized_tokens["input_ids"])):
-            if return_overflowing_tokens:
-                original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
-            else:
-                original_index = batch_index
-            token_boxes_example = []
-            for id, sequence_id, word_id in zip(
-                sanitized_tokens["input_ids"][batch_index],
-                sanitized_encodings[batch_index].sequence_ids,
-                sanitized_encodings[batch_index].word_ids,
-            ):
-                if word_id is not None:
-                    if is_pair and sequence_id == 0:
-                        token_boxes_example.append(self.pad_token_box)
-                    else:
-                        token_boxes_example.append(boxes[original_index][word_id])
-                else:
-                    if id == self.cls_token_id:
-                        token_boxes_example.append(self.cls_token_box)
-                    elif id == self.sep_token_id:
-                        token_boxes_example.append(self.sep_token_box)
-                    elif id == self.pad_token_id:
-                        token_boxes_example.append(self.pad_token_box)
-                    else:
-                        raise ValueError("Id not recognized")
-            token_boxes.append(token_boxes_example)
-
-        sanitized_tokens["bbox"] = token_boxes
-
-        # optionally, create the labels
-        if word_labels is not None:
-            labels = []
-            for batch_index in range(len(sanitized_tokens["input_ids"])):
-                if return_overflowing_tokens:
-                    original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
-                else:
-                    original_index = batch_index
-                labels_example = []
-                for id, offset, word_id in zip(
-                    sanitized_tokens["input_ids"][batch_index],
-                    sanitized_tokens["offset_mapping"][batch_index],
-                    sanitized_encodings[batch_index].word_ids,
-                ):
-                    if word_id is not None:
-                        if self.only_label_first_subword:
-                            if offset[0] == 0:
-                                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
-                                labels_example.append(word_labels[original_index][word_id])
-                            else:
-                                labels_example.append(self.pad_token_label)
-                        else:
-                            labels_example.append(word_labels[original_index][word_id])
-                    else:
-                        labels_example.append(self.pad_token_label)
-                labels.append(labels_example)
-
-            sanitized_tokens["labels"] = labels
-            # finally, remove offsets if the user didn't want them
-            if not return_offsets_mapping:
-                del sanitized_tokens["offset_mapping"]
-
-        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
-
-    def _encode_plus(
-        self,
-        text: Union[TextInput, PreTokenizedInput],
-        text_pair: Optional[PreTokenizedInput] = None,
-        boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[int]] = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[bool] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        # make it a batched input
-        # 2 options:
-        # 1) only text, in case text must be a list of str
-        # 2) text + text_pair, in which case text = str and text_pair a list of str
-        batched_input = [(text, text_pair)] if text_pair else [text]
-        batched_boxes = [boxes]
-        batched_word_labels = [word_labels] if word_labels is not None else None
-        batched_output = self._batch_encode_plus(
-            batched_input,
-            is_pair=bool(text_pair is not None),
-            boxes=batched_boxes,
-            word_labels=batched_word_labels,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            **kwargs,
-        )
-
-        # Return tensor is None, then we can remove the leading batch axis
-        # Overflowing tokens are returned as a batch of output so we keep them in this case
-        if return_tensors is None and not return_overflowing_tokens:
-            batched_output = BatchEncoding(
-                {
-                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
-                    for key, value in batched_output.items()
-                },
-                batched_output.encodings,
-            )
-
-        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
-
-        return batched_output
-
-    def _pad(
-        self,
-        encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
-        max_length: Optional[int] = None,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """
-        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
-
-        Args:
-            encoded_inputs:
-                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
-            max_length: maximum length of the returned list and optionally padding length (see below).
-                Will truncate by taking into account the special tokens.
-            padding_strategy: PaddingStrategy to use for padding.
-
-                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
-                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
-                - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
-
-                    - 'left': pads on the left of the sequences
-                    - 'right': pads on the right of the sequences
-            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
-                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
-                `>= 7.5` (Volta).
-            padding_side:
-                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
-                Default value is picked from the class attribute of the same name.
-            return_attention_mask:
-                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
-        """
-        # Load from model defaults
-        if return_attention_mask is None:
-            return_attention_mask = "attention_mask" in self.model_input_names
-
-        required_input = encoded_inputs[self.model_input_names[0]]
-
-        if padding_strategy == PaddingStrategy.LONGEST:
-            max_length = len(required_input)
-
-        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
-            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
-
-        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-
-        # Initialize attention mask if not present.
-        if return_attention_mask and "attention_mask" not in encoded_inputs:
-            encoded_inputs["attention_mask"] = [1] * len(required_input)
-
-        if needs_to_be_padded:
-            difference = max_length - len(required_input)
-            padding_side = padding_side if padding_side is not None else self.padding_side
-            if padding_side == "right":
-                if return_attention_mask:
-                    encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
-                if "token_type_ids" in encoded_inputs:
-                    encoded_inputs["token_type_ids"] = (
-                        encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
-                    )
-                if "bbox" in encoded_inputs:
-                    encoded_inputs["bbox"] = encoded_inputs["bbox"] + [self.pad_token_box] * difference
-                if "labels" in encoded_inputs:
-                    encoded_inputs["labels"] = encoded_inputs["labels"] + [self.pad_token_label] * difference
-                if "special_tokens_mask" in encoded_inputs:
-                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
-                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
-            elif padding_side == "left":
-                if return_attention_mask:
-                    encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
-                if "token_type_ids" in encoded_inputs:
-                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
-                        "token_type_ids"
-                    ]
-                if "bbox" in encoded_inputs:
-                    encoded_inputs["bbox"] = [self.pad_token_box] * difference + encoded_inputs["bbox"]
-                if "labels" in encoded_inputs:
-                    encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["labels"]
-                if "special_tokens_mask" in encoded_inputs:
-                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
-                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
-            else:
-                raise ValueError("Invalid padding strategy:" + str(padding_side))
-
-        return encoded_inputs
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A BERT sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-
-        if token_ids_1:
-            output += token_ids_1 + [self.sep_token_id]
-
-        return output
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-
-__all__ = ["LayoutLMv2TokenizerFast"]
diff --git a/src/transformers/models/layoutlmv3/__init__.py b/src/transformers/models/layoutlmv3/__init__.py
index c87afd9c58e7..0a68bf55a15d 100644
--- a/src/transformers/models/layoutlmv3/__init__.py
+++ b/src/transformers/models/layoutlmv3/__init__.py
@@ -25,7 +25,6 @@
     from .modeling_layoutlmv3 import *
     from .processing_layoutlmv3 import *
     from .tokenization_layoutlmv3 import *
-    from .tokenization_layoutlmv3_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py
index 7877c5b4668d..c612d66c68d0 100644
--- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py
+++ b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3.py
@@ -15,22 +15,22 @@
 """Tokenization class for LayoutLMv3. Same as LayoutLMv2, but RoBERTa-like BPE tokenization instead of WordPiece."""
 
 import json
-import os
-from functools import lru_cache
 from typing import Optional, Union
 
-import regex as re
+from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors
 
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
 from ...tokenization_utils_base import (
     BatchEncoding,
     EncodedInput,
+    PaddingStrategy,
     PreTokenizedInput,
+    TensorType,
     TextInput,
     TextInputPair,
     TruncationStrategy,
 )
-from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging
+from ...tokenization_utils_tokenizers import TokenizersBackend
+from ...utils import add_end_docstrings, logging
 
 
 logger = logging.get_logger(__name__)
@@ -38,9 +38,10 @@
 VOCAB_FILES_NAMES = {
     "vocab_file": "vocab.json",
     "merges_file": "merges.txt",
+    "tokenizer_file": "tokenizer.json",
 }
 
-
+# Docstring constants for encode methods
 LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING = r"""
             add_special_tokens (`bool`, *optional*, defaults to `True`):
                 Whether or not to encode the sequences with the special tokens relative to their model.
@@ -82,144 +83,53 @@
             pad_to_multiple_of (`int`, *optional*):
                 If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                 the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
-            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
-                If set, will return tensors instead of list of python integers. Acceptable values are:
-
-                - `'pt'`: Return PyTorch `torch.Tensor` objects.
-                - `'np'`: Return Numpy `np.ndarray` objects.
 """
 
-
 LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
-            add_special_tokens (`bool`, *optional*, defaults to `True`):
-                Whether or not to encode the sequences with the special tokens relative to their model.
-            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
-                Activates and controls padding. Accepts the following values:
-
-                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
-                  sequence if provided).
-                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-                  acceptable input length for the model if that argument is not provided.
-                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
-                  lengths).
-            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
-                Activates and controls truncation. Accepts the following values:
-
-                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
-                  to the maximum acceptable input length for the model if that argument is not provided. This will
-                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
-                  sequences (or a batch of pairs) is provided.
-                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
-                  greater than the model maximum admissible input size).
-            max_length (`int`, *optional*):
-                Controls the maximum length to use by one of the truncation/padding parameters. If left unset or set to
-                `None`, this will use the predefined model maximum length if a maximum length is required by one of the
-                truncation/padding parameters. If the model has no specific maximum input length (like XLNet)
-                truncation/padding to a maximum length will be deactivated.
-            stride (`int`, *optional*, defaults to 0):
-                If set to a number along with `max_length`, the overflowing tokens returned when
-                `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
-                returned to provide some overlap between truncated and overflowing sequences. The value of this
-                argument defines the number of overlapping tokens.
-            pad_to_multiple_of (`int`, *optional*):
-                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
-                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
-            return_tensors (`str` or [`~utils.TensorType`], *optional*):
-                If set, will return tensors instead of list of python integers. Acceptable values are:
-
-                - `'pt'`: Return PyTorch `torch.Tensor` objects.
-                - `'np'`: Return Numpy `np.ndarray` objects.
+            return_token_type_ids (`bool`, *optional*):
+                Whether to return token type IDs. If left to the default, will return the token type IDs according to
+                the specific tokenizer's default, defined by the `return_outputs` attribute.
+
+                [What are token type IDs?](../glossary#token-type-ids)
+            return_attention_mask (`bool`, *optional*):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific tokenizer's default, defined by the `return_outputs` attribute.
+
+                [What are attention masks?](../glossary#attention-mask)
+            return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
+                of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead
+                of returning overflowing tokens.
+            return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
+                Whether or not to return special tokens mask information.
+            return_offsets_mapping (`bool`, *optional*, defaults to `False`):
+                Whether or not to return `(char_start, char_end)` for each token.
+
+                This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using
+                Python's tokenizer, this method will raise `NotImplementedError`.
+            return_length  (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the lengths of the encoded inputs.
+            verbose (`bool`, *optional*, defaults to `True`):
+                Whether or not to print more information and warnings.
+            **kwargs: passed to the `self.tokenize()` method
 """
 
 
-@lru_cache
-# Copied from transformers.models.roberta.tokenization_roberta.bytes_to_unicode
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
-    characters the bpe code barfs on.
-
-    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
-    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
-    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
-    tables between utf-8 bytes and unicode strings.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-# Copied from transformers.models.roberta.tokenization_roberta.get_pairs
-def get_pairs(word):
-    """
-    Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-
-class LayoutLMv3Tokenizer(PreTrainedTokenizer):
+class LayoutLMv3Tokenizer(TokenizersBackend):
     r"""
-    Construct a LayoutLMv3 tokenizer. Based on [`RoBERTatokenizer`] (Byte Pair Encoding or BPE).
-    [`LayoutLMv3Tokenizer`] can be used to turn words, word-level bounding boxes and optional word labels to
-    token-level `input_ids`, `attention_mask`, `token_type_ids`, `bbox`, and optional `labels` (for token
-    classification).
+    Construct a LayoutLMv3 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level BPE.
 
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
-
-    [`LayoutLMv3Tokenizer`] runs end-to-end tokenization: punctuation splitting and wordpiece. It also turns the
-    word-level bounding boxes into token-level bounding boxes.
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        merges_file (`str`):
-            Path to the merges file.
         errors (`str`, *optional*, defaults to `"replace"`):
             Paradigm to follow when decoding bytes to UTF-8. See
             [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
         bos_token (`str`, *optional*, defaults to `""`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
         eos_token (`str`, *optional*, defaults to `""`):
             The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
         sep_token (`str`, *optional*, defaults to `""`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
@@ -237,7 +147,7 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
             modeling. This is the token which the model will try to predict.
         add_prefix_space (`bool`, *optional*, defaults to `True`):
             Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word. (RoBERTa tokenizer detect beginning of words by the preceding space).
+            other word.
         cls_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
             The bounding box to use for the special [CLS] token.
         sep_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
@@ -249,15 +159,18 @@ class LayoutLMv3Tokenizer(PreTrainedTokenizer):
             CrossEntropyLoss.
         only_label_first_subword (`bool`, *optional*, defaults to `True`):
             Whether or not to only label the first subword, in case word labels are provided.
+        vocab (`dict`, *optional*):
+            Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file when using from_pretrained.
+        merges (`list`, *optional*):
+            Custom merges list. If not provided, merges are loaded from merges_file when using from_pretrained.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask", "bbox"]
+    slow_tokenizer_class = None
 
     def __init__(
         self,
-        vocab_file,
-        merges_file,
         errors="replace",
         bos_token="",
         eos_token="",
@@ -272,33 +185,59 @@ def __init__(
         pad_token_box=[0, 0, 0, 0],
         pad_token_label=-100,
         only_label_first_subword=True,
+        vocab: Optional[dict] = None,
+        merges: Optional[list] = None,
+        vocab_file: Optional[str] = None,
+        merges_file: Optional[str] = None,
         **kwargs,
     ):
-        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
-        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
-        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
-        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
-
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            bpe_merges = merges_handle.read().split("\n")[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        self.cache = {}
         self.add_prefix_space = add_prefix_space
 
-        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
-        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+        # Build vocab and merges for BPE
+        # Priority: 1) vocab/merges dicts/lists, 2) vocab_file/merges_file paths, 3) empty
+        if vocab is not None:
+            _vocab = vocab
+        elif vocab_file is not None:
+            with open(vocab_file, encoding="utf-8") as f:
+                _vocab = json.load(f)
+        else:
+            _vocab = {}
+
+        if merges is not None:
+            _merges = merges
+        elif merges_file is not None:
+            _merges = []
+            with open(merges_file, encoding="utf-8") as f:
+                for line in f:
+                    line = line.strip()
+                    if line and not line.startswith("#"):
+                        _merges.append(tuple(line.split()))
+        else:
+            _merges = []
+
+        # Initialize BPE tokenizer
+        self._tokenizer = Tokenizer(
+            models.BPE(
+                vocab=_vocab,
+                merges=_merges,
+                dropout=None,
+                continuing_subword_prefix="",
+                end_of_word_suffix="",
+                fuse_unk=False,
+            )
+        )
+
+        # Set pre_tokenizer (ByteLevel)
+        self._tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
+
+        # Set decoder
+        self._tokenizer.decoder = decoders.ByteLevel()
+
+        # Set post_processor (will be set after super().__init__ when we have token IDs)
+        # Temporarily set to None, will be configured after parent init
+        self._tokenizer.post_processor = None
+
+        tokenizer_object = self._tokenizer
 
         # additional properties
         self.cls_token_box = cls_token_box
@@ -308,12 +247,13 @@ def __init__(
         self.only_label_first_subword = only_label_first_subword
 
         super().__init__(
+            tokenizer_object=tokenizer_object,
             errors=errors,
             bos_token=bos_token,
             eos_token=eos_token,
-            unk_token=unk_token,
             sep_token=sep_token,
             cls_token=cls_token,
+            unk_token=unk_token,
             pad_token=pad_token,
             mask_token=mask_token,
             add_prefix_space=add_prefix_space,
@@ -325,210 +265,31 @@ def __init__(
             **kwargs,
         )
 
-    @property
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.vocab_size
-    def vocab_size(self):
-        return len(self.encoder)
-
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_vocab
-    def get_vocab(self):
-        vocab = dict(self.encoder).copy()
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.bpe
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                except ValueError:
-                    new_word.extend(word[i:])
-                    break
-                else:
-                    new_word.extend(word[i:j])
-                    i = j
+        # Now set post_processor with actual token IDs (RoBERTa-style)
+        cls = str(self.cls_token)
+        sep = str(self.sep_token)
+        cls_token_id = self.cls_token_id
+        sep_token_id = self.sep_token_id
 
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        self.cache[token] = word
-        return word
-
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._tokenize
-    def _tokenize(self, text):
-        """Tokenize a string."""
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            token = "".join(
-                self.byte_encoder[b] for b in token.encode("utf-8")
-            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
-            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
-        return bpe_tokens
-
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._convert_token_to_id
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._convert_id_to_token
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index)
-
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.convert_tokens_to_string
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        text = "".join(tokens)
-        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
-        return text
-
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-        merge_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+        self._tokenizer.post_processor = processors.RobertaProcessing(
+            sep=(sep, sep_token_id),
+            cls=(cls, cls_token_id),
+            add_prefix_space=add_prefix_space,
+            trim_offsets=True,
         )
 
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
-
-        return vocab_file, merge_file
-
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.build_inputs_with_special_tokens
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A RoBERTa sequence has the following format:
-
-        - single sequence: ` X `
-        - pair of sequences: ` A  B `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_special_tokens_mask
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.create_token_type_ids_from_sequences
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
-        make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
+        # additional properties
+        self.cls_token_box = cls_token_box
+        self.sep_token_box = sep_token_box
+        self.pad_token_box = pad_token_box
+        self.pad_token_label = pad_token_label
+        self.only_label_first_subword = only_label_first_subword
 
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
-        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
-        # If the text starts with a token that should not be split, no space is added before the text in any case.
-        # It's necessary to match the fast tokenization
-        if (
-            (is_split_into_words or add_prefix_space)
-            and (len(text) > 0 and not text[0].isspace())
-            and sum(text.startswith(no_split_token) for no_split_token in self.added_tokens_encoder) == 0
-        ):
-            text = " " + text
-        return (text, kwargs)
+        # Call _post_init for tokenizers created directly (not from_pretrained)
+        # For from_pretrained, this will be called again after loading the tokenizer from file
+        self._post_init()
 
     @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer.__call__
     def __call__(
         self,
         text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
@@ -681,7 +442,6 @@ def _is_valid_text_input(t):
             )
 
     @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer.batch_encode_plus
     def batch_encode_plus(
         self,
         batch_text_or_text_pairs: Union[
@@ -742,191 +502,15 @@ def batch_encode_plus(
             **kwargs,
         )
 
-    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer._batch_encode_plus
-    def _batch_encode_plus(
-        self,
-        batch_text_or_text_pairs: Union[
-            list[TextInput],
-            list[TextInputPair],
-            list[PreTokenizedInput],
-        ],
-        is_pair: Optional[bool] = None,
-        boxes: Optional[list[list[list[int]]]] = None,
-        word_labels: Optional[list[list[int]]] = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        if return_offsets_mapping:
-            raise NotImplementedError(
-                "return_offset_mapping is not available when using Python tokenizers. "
-                "To use this feature, change your tokenizer to one deriving from "
-                "transformers.PreTrainedTokenizerFast."
-            )
-
-        batch_outputs = self._batch_prepare_for_model(
-            batch_text_or_text_pairs=batch_text_or_text_pairs,
-            is_pair=is_pair,
-            boxes=boxes,
-            word_labels=word_labels,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_attention_mask=return_attention_mask,
-            return_token_type_ids=return_token_type_ids,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_length=return_length,
-            return_tensors=return_tensors,
-            verbose=verbose,
+    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
+        batched_input = [(text, pair)] if pair else [text]
+        encodings = self._tokenizer.encode_batch(
+            batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
         )
 
-        return BatchEncoding(batch_outputs)
+        return encodings[0].tokens if encodings else []
 
     @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer._batch_prepare_for_model
-    def _batch_prepare_for_model(
-        self,
-        batch_text_or_text_pairs,
-        is_pair: Optional[bool] = None,
-        boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[list[int]]] = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[str] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-    ) -> BatchEncoding:
-        """
-        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
-        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
-        manages a moving window (with user defined stride) for overflowing tokens.
-
-        Args:
-            batch_ids_pairs: list of tokenized input ids or input ids pairs
-        """
-
-        batch_outputs = {}
-        for idx, example in enumerate(zip(batch_text_or_text_pairs, boxes)):
-            batch_text_or_text_pair, boxes_example = example
-            outputs = self.prepare_for_model(
-                batch_text_or_text_pair[0] if is_pair else batch_text_or_text_pair,
-                batch_text_or_text_pair[1] if is_pair else None,
-                boxes_example,
-                word_labels=word_labels[idx] if word_labels is not None else None,
-                add_special_tokens=add_special_tokens,
-                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
-                truncation=truncation_strategy.value,
-                max_length=max_length,
-                stride=stride,
-                pad_to_multiple_of=None,  # we pad in batch afterward
-                padding_side=None,  # we pad in batch afterward
-                return_attention_mask=False,  # we pad in batch afterward
-                return_token_type_ids=return_token_type_ids,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_length=return_length,
-                return_tensors=None,  # We convert the whole batch to tensors at the end
-                prepend_batch_axis=False,
-                verbose=verbose,
-            )
-
-            for key, value in outputs.items():
-                if key not in batch_outputs:
-                    batch_outputs[key] = []
-                batch_outputs[key].append(value)
-
-        batch_outputs = self.pad(
-            batch_outputs,
-            padding=padding_strategy.value,
-            max_length=max_length,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_attention_mask=return_attention_mask,
-        )
-
-        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
-
-        return batch_outputs
-
-    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING)
-    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer.encode
-    def encode(
-        self,
-        text: Union[TextInput, PreTokenizedInput],
-        text_pair: Optional[PreTokenizedInput] = None,
-        boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[int]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> list[int]:
-        encoded_inputs = self.encode_plus(
-            text=text,
-            text_pair=text_pair,
-            boxes=boxes,
-            word_labels=word_labels,
-            add_special_tokens=add_special_tokens,
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            **kwargs,
-        )
-
-        return encoded_inputs["input_ids"]
-
-    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer.encode_plus
     def encode_plus(
         self,
         text: Union[TextInput, PreTokenizedInput],
@@ -995,13 +579,16 @@ def encode_plus(
             **kwargs,
         )
 
-    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer._encode_plus
-    def _encode_plus(
+    def _batch_encode_plus(
         self,
-        text: Union[TextInput, PreTokenizedInput],
-        text_pair: Optional[PreTokenizedInput] = None,
-        boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[int]] = None,
+        batch_text_or_text_pairs: Union[
+            list[TextInput],
+            list[TextInputPair],
+            list[PreTokenizedInput],
+        ],
+        is_pair: Optional[bool] = None,
+        boxes: Optional[list[list[list[int]]]] = None,
+        word_labels: Optional[list[list[int]]] = None,
         add_special_tokens: bool = True,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
@@ -1009,7 +596,7 @@ def _encode_plus(
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
         padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
         return_overflowing_tokens: bool = False,
@@ -1017,54 +604,151 @@ def _encode_plus(
         return_offsets_mapping: bool = False,
         return_length: bool = False,
         verbose: bool = True,
-        **kwargs,
     ) -> BatchEncoding:
-        if return_offsets_mapping:
-            raise NotImplementedError(
-                "return_offset_mapping is not available when using Python tokenizers. "
-                "To use this feature, change your tokenizer to one deriving from "
-                "transformers.PreTrainedTokenizerFast. "
-                "More information on available tokenizers at "
-                "https://github.com/huggingface/transformers/pull/2674"
-            )
+        if not isinstance(batch_text_or_text_pairs, list):
+            raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")
 
-        return self.prepare_for_model(
-            text=text,
-            text_pair=text_pair,
-            boxes=boxes,
-            word_labels=word_labels,
-            add_special_tokens=add_special_tokens,
-            padding=padding_strategy.value,
-            truncation=truncation_strategy.value,
+        # Set the truncation and padding strategy and restore the initial configuration
+        self.set_truncation_and_padding(
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
             padding_side=padding_side,
-            return_tensors=return_tensors,
-            prepend_batch_axis=True,
-            return_attention_mask=return_attention_mask,
-            return_token_type_ids=return_token_type_ids,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_length=return_length,
-            verbose=verbose,
         )
 
-    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    def prepare_for_model(
+        if is_pair:
+            batch_text_or_text_pairs = [
+                (text.split() if isinstance(text, str) else text, text_pair)
+                for text, text_pair in batch_text_or_text_pairs
+            ]
+
+        encodings = self._tokenizer.encode_batch(
+            batch_text_or_text_pairs,
+            add_special_tokens=add_special_tokens,
+            is_pretokenized=True,  # we set this to True as LayoutLMv3 always expects pretokenized inputs
+        )
+
+        # Convert encoding to dict
+        tokens_and_encodings = [
+            self._convert_encoding(
+                encoding=encoding,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=True
+                if word_labels is not None
+                else return_offsets_mapping,  # we use offsets to create the labels
+                return_length=return_length,
+                verbose=verbose,
+            )
+            for encoding in encodings
+        ]
+
+        # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
+        sanitized_tokens = {}
+        for key in tokens_and_encodings[0][0]:
+            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
+            sanitized_tokens[key] = stack
+        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
+
+        # If returning overflowing tokens, we need to return a mapping
+        # from the batch idx to the original sample
+        if return_overflowing_tokens:
+            overflow_to_sample_mapping = []
+            for i, (toks, _) in enumerate(tokens_and_encodings):
+                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
+            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping
+
+        for input_ids in sanitized_tokens["input_ids"]:
+            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
+
+        # create the token boxes
+        token_boxes = []
+        for batch_index in range(len(sanitized_tokens["input_ids"])):
+            if return_overflowing_tokens:
+                original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
+            else:
+                original_index = batch_index
+            token_boxes_example = []
+            for id, sequence_id, word_id in zip(
+                sanitized_tokens["input_ids"][batch_index],
+                sanitized_encodings[batch_index].sequence_ids,
+                sanitized_encodings[batch_index].word_ids,
+            ):
+                if word_id is not None:
+                    if is_pair and sequence_id == 0:
+                        token_boxes_example.append(self.pad_token_box)
+                    else:
+                        token_boxes_example.append(boxes[original_index][word_id])
+                else:
+                    if id == self.cls_token_id:
+                        token_boxes_example.append(self.cls_token_box)
+                    elif id == self.sep_token_id:
+                        token_boxes_example.append(self.sep_token_box)
+                    elif id == self.pad_token_id:
+                        token_boxes_example.append(self.pad_token_box)
+                    else:
+                        # For RoBERTa, there might be additional sep tokens
+                        token_boxes_example.append(self.sep_token_box)
+            token_boxes.append(token_boxes_example)
+
+        sanitized_tokens["bbox"] = token_boxes
+
+        # optionally, create the labels
+        if word_labels is not None:
+            labels = []
+            for batch_index in range(len(sanitized_tokens["input_ids"])):
+                if return_overflowing_tokens:
+                    original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
+                else:
+                    original_index = batch_index
+                labels_example = []
+                previous_word_id = None
+                for id, offset, word_id in zip(
+                    sanitized_tokens["input_ids"][batch_index],
+                    sanitized_tokens["offset_mapping"][batch_index],
+                    sanitized_encodings[batch_index].word_ids,
+                ):
+                    if word_id is not None:
+                        if self.only_label_first_subword:
+                            # Check if this is the first token of the word (word_id changed or is first occurrence)
+                            if word_id != previous_word_id:
+                                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
+                                labels_example.append(word_labels[original_index][word_id])
+                            else:
+                                labels_example.append(self.pad_token_label)
+                        else:
+                            labels_example.append(word_labels[original_index][word_id])
+                        previous_word_id = word_id
+                    else:
+                        labels_example.append(self.pad_token_label)
+                        previous_word_id = None
+                labels.append(labels_example)
+
+            sanitized_tokens["labels"] = labels
+            # finally, remove offsets if the user didn't want them
+            if not return_offsets_mapping:
+                del sanitized_tokens["offset_mapping"]
+
+        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
+
+    def _encode_plus(
         self,
         text: Union[TextInput, PreTokenizedInput],
         text_pair: Optional[PreTokenizedInput] = None,
         boxes: Optional[list[list[int]]] = None,
         word_labels: Optional[list[int]] = None,
         add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
         padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_tensors: Optional[bool] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
         return_overflowing_tokens: bool = False,
@@ -1072,326 +756,53 @@ def prepare_for_model(
         return_offsets_mapping: bool = False,
         return_length: bool = False,
         verbose: bool = True,
-        prepend_batch_axis: bool = False,
         **kwargs,
     ) -> BatchEncoding:
-        """
-        Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens,
-        truncates sequences if overflowing while taking into account the special tokens and manages a moving window
-        (with user defined stride) for overflowing tokens. Please Note, for *text_pair* different than `None` and
-        *truncation_strategy = longest_first* or `True`, it is not possible to return overflowing tokens. Such a
-        combination of arguments will raise an error.
-
-        Word-level `boxes` are turned into token-level `bbox`. If provided, word-level `word_labels` are turned into
-        token-level `labels`. The word label is used for the first token of the word, while remaining tokens are
-        labeled with -100, such that they will be ignored by the loss function.
-
-        Args:
-            text (`str`, `list[str]`, `list[list[str]]`):
-                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
-            text_pair (`list[str]` or `list[int]`, *optional*):
-                Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
-                list of list of strings (words of a batch of examples).
-        """
-
-        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
-        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
-            padding=padding,
-            truncation=truncation,
+        # make it a batched input
+        # 2 options:
+        # 1) only text, in case text must be a list of str
+        # 2) text + text_pair, in which case text = str and text_pair a list of str
+        batched_input = [(text, text_pair)] if text_pair else [text]
+        batched_boxes = [boxes]
+        batched_word_labels = [word_labels] if word_labels is not None else None
+        batched_output = self._batch_encode_plus(
+            batched_input,
+            is_pair=bool(text_pair is not None),
+            boxes=batched_boxes,
+            word_labels=batched_word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
             max_length=max_length,
+            stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
             verbose=verbose,
             **kwargs,
         )
 
-        tokens = []
-        pair_tokens = []
-        token_boxes = []
-        pair_token_boxes = []
-        labels = []
-
-        if text_pair is None:
-            if word_labels is None:
-                # CASE 1: document image classification (training + inference) + CASE 2: token classification (inference)
-                for word, box in zip(text, boxes):
-                    if len(word) < 1:  # skip empty words
-                        continue
-                    word_tokens = self.tokenize(word)
-                    tokens.extend(word_tokens)
-                    token_boxes.extend([box] * len(word_tokens))
-            else:
-                # CASE 2: token classification (training)
-                for word, box, label in zip(text, boxes, word_labels):
-                    if len(word) < 1:  # skip empty words
-                        continue
-                    word_tokens = self.tokenize(word)
-                    tokens.extend(word_tokens)
-                    token_boxes.extend([box] * len(word_tokens))
-                    if self.only_label_first_subword:
-                        # Use the real label id for the first token of the word, and padding ids for the remaining tokens
-                        labels.extend([label] + [self.pad_token_label] * (len(word_tokens) - 1))
-                    else:
-                        labels.extend([label] * len(word_tokens))
-        else:
-            # CASE 3: document visual question answering (inference)
-            # text = question
-            # text_pair = words
-            tokens = self.tokenize(text)
-            token_boxes = [self.pad_token_box for _ in range(len(tokens))]
-
-            for word, box in zip(text_pair, boxes):
-                if len(word) < 1:  # skip empty words
-                    continue
-                word_tokens = self.tokenize(word)
-                pair_tokens.extend(word_tokens)
-                pair_token_boxes.extend([box] * len(word_tokens))
-
-        # Create ids + pair_ids
-        ids = self.convert_tokens_to_ids(tokens)
-        pair_ids = self.convert_tokens_to_ids(pair_tokens) if pair_tokens else None
-
-        if (
-            return_overflowing_tokens
-            and truncation_strategy == TruncationStrategy.LONGEST_FIRST
-            and pair_ids is not None
-        ):
-            raise ValueError(
-                "Not possible to return overflowing tokens for pair of sequences with the "
-                "`longest_first`. Please select another truncation strategy than `longest_first`, "
-                "for instance `only_second` or `only_first`."
-            )
-
-        # Compute the total size of the returned encodings
-        pair = bool(pair_ids is not None)
-        len_ids = len(ids)
-        len_pair_ids = len(pair_ids) if pair else 0
-        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)
-
-        # Truncation: Handle max sequence length
-        overflowing_tokens = []
-        overflowing_token_boxes = []
-        overflowing_labels = []
-        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
-            (
-                ids,
-                token_boxes,
-                pair_ids,
-                pair_token_boxes,
-                labels,
-                overflowing_tokens,
-                overflowing_token_boxes,
-                overflowing_labels,
-            ) = self.truncate_sequences(
-                ids,
-                token_boxes,
-                pair_ids=pair_ids,
-                pair_token_boxes=pair_token_boxes,
-                labels=labels,
-                num_tokens_to_remove=total_len - max_length,
-                truncation_strategy=truncation_strategy,
-                stride=stride,
-            )
-
-        if return_token_type_ids and not add_special_tokens:
-            raise ValueError(
-                "Asking to return token_type_ids while setting add_special_tokens to False "
-                "results in an undefined behavior. Please set add_special_tokens to True or "
-                "set return_token_type_ids to None."
+        # Return tensor is None, then we can remove the leading batch axis
+        # Overflowing tokens are returned as a batch of output so we keep them in this case
+        if return_tensors is None and not return_overflowing_tokens:
+            batched_output = BatchEncoding(
+                {
+                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
+                    for key, value in batched_output.items()
+                },
+                batched_output.encodings,
             )
 
-        # Load from model defaults
-        if return_token_type_ids is None:
-            return_token_type_ids = "token_type_ids" in self.model_input_names
-        if return_attention_mask is None:
-            return_attention_mask = "attention_mask" in self.model_input_names
-
-        encoded_inputs = {}
+        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
 
-        if return_overflowing_tokens:
-            encoded_inputs["overflowing_tokens"] = overflowing_tokens
-            encoded_inputs["overflowing_token_boxes"] = overflowing_token_boxes
-            encoded_inputs["overflowing_labels"] = overflowing_labels
-            encoded_inputs["num_truncated_tokens"] = total_len - max_length
-
-        # Add special tokens
-        if add_special_tokens:
-            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
-            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
-            token_boxes = [self.cls_token_box] + token_boxes + [self.sep_token_box]
-            if pair_token_boxes:
-                pair_token_boxes = [self.sep_token_box] + pair_token_boxes + [self.sep_token_box]
-            token_boxes = token_boxes + pair_token_boxes if pair else token_boxes
-            if labels:
-                labels = [self.pad_token_label] + labels + [self.pad_token_label]
-        else:
-            sequence = ids + pair_ids if pair else ids
-            token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
-            token_boxes = token_boxes + pair_token_boxes if pair else token_boxes
-
-        # Build output dictionary
-        encoded_inputs["input_ids"] = sequence
-        encoded_inputs["bbox"] = token_boxes
-        if return_token_type_ids:
-            encoded_inputs["token_type_ids"] = token_type_ids
-        if return_special_tokens_mask:
-            if add_special_tokens:
-                encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
-            else:
-                encoded_inputs["special_tokens_mask"] = [0] * len(sequence)
+        return batched_output
 
-        if labels:
-            encoded_inputs["labels"] = labels
-
-        # Check lengths
-        self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)
-
-        # Padding
-        if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
-            encoded_inputs = self.pad(
-                encoded_inputs,
-                max_length=max_length,
-                padding=padding_strategy.value,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_attention_mask=return_attention_mask,
-            )
-
-        if return_length:
-            encoded_inputs["length"] = len(encoded_inputs["input_ids"])
-
-        batch_outputs = BatchEncoding(
-            encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
-        )
-
-        return batch_outputs
-
-    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer.truncate_sequences
-    def truncate_sequences(
-        self,
-        ids: list[int],
-        token_boxes: list[list[int]],
-        pair_ids: Optional[list[int]] = None,
-        pair_token_boxes: Optional[list[list[int]]] = None,
-        labels: Optional[list[int]] = None,
-        num_tokens_to_remove: int = 0,
-        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
-        stride: int = 0,
-    ) -> tuple[list[int], list[int], list[int]]:
-        """
-        Truncates a sequence pair in-place following the strategy.
-
-        Args:
-            ids (`List[int]`):
-                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and
-                `convert_tokens_to_ids` methods.
-            token_boxes (`List[List[int]]`):
-                Bounding boxes of the first sequence.
-            pair_ids (`List[int]`, *optional*):
-                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
-                and `convert_tokens_to_ids` methods.
-            pair_token_boxes (`List[List[int]]`, *optional*):
-                Bounding boxes of the second sequence.
-            labels (`List[int]`, *optional*):
-                Labels of the first sequence (for token classification tasks).
-            num_tokens_to_remove (`int`, *optional*, defaults to 0):
-                Number of tokens to remove using the truncation strategy.
-            truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
-                The strategy to follow for truncation. Can be:
-
-                - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will truncate
-                  token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a
-                  batch of pairs) is provided.
-                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater
-                  than the model maximum admissible input size).
-            stride (`int`, *optional*, defaults to 0):
-                If set to a positive number, the overflowing tokens returned will contain some tokens from the main
-                sequence returned. The value of this argument defines the number of additional tokens.
-
-        Returns:
-            `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of
-            overflowing tokens. Note: The *longest_first* strategy returns empty list of overflowing tokens if a pair
-            of sequences (or a batch of pairs) is provided.
-        """
-        if num_tokens_to_remove <= 0:
-            return ids, token_boxes, pair_ids, pair_token_boxes, labels, [], [], []
-
-        if not isinstance(truncation_strategy, TruncationStrategy):
-            truncation_strategy = TruncationStrategy(truncation_strategy)
-
-        overflowing_tokens = []
-        overflowing_token_boxes = []
-        overflowing_labels = []
-        if truncation_strategy == TruncationStrategy.ONLY_FIRST or (
-            truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None
-        ):
-            if len(ids) > num_tokens_to_remove:
-                window_len = min(len(ids), stride + num_tokens_to_remove)
-                overflowing_tokens = ids[-window_len:]
-                overflowing_token_boxes = token_boxes[-window_len:]
-                overflowing_labels = labels[-window_len:]
-                ids = ids[:-num_tokens_to_remove]
-                token_boxes = token_boxes[:-num_tokens_to_remove]
-                labels = labels[:-num_tokens_to_remove]
-            else:
-                error_msg = (
-                    f"We need to remove {num_tokens_to_remove} to truncate the input "
-                    f"but the first sequence has a length {len(ids)}. "
-                )
-                if truncation_strategy == TruncationStrategy.ONLY_FIRST:
-                    error_msg = (
-                        error_msg + "Please select another truncation strategy than "
-                        f"{truncation_strategy}, for instance 'longest_first' or 'only_second'."
-                    )
-                logger.error(error_msg)
-        elif truncation_strategy == TruncationStrategy.LONGEST_FIRST:
-            logger.warning(
-                "Be aware, overflowing tokens are not returned for the setting you have chosen,"
-                f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' "
-                "truncation strategy. So the returned list will always be empty even if some "
-                "tokens have been removed."
-            )
-            for _ in range(num_tokens_to_remove):
-                if pair_ids is None or len(ids) > len(pair_ids):
-                    ids = ids[:-1]
-                    token_boxes = token_boxes[:-1]
-                    labels = labels[:-1]
-                else:
-                    pair_ids = pair_ids[:-1]
-                    pair_token_boxes = pair_token_boxes[:-1]
-        elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
-            if len(pair_ids) > num_tokens_to_remove:
-                window_len = min(len(pair_ids), stride + num_tokens_to_remove)
-                overflowing_tokens = pair_ids[-window_len:]
-                overflowing_token_boxes = pair_token_boxes[-window_len:]
-                pair_ids = pair_ids[:-num_tokens_to_remove]
-                pair_token_boxes = pair_token_boxes[:-num_tokens_to_remove]
-            else:
-                logger.error(
-                    f"We need to remove {num_tokens_to_remove} to truncate the input "
-                    f"but the second sequence has a length {len(pair_ids)}. "
-                    f"Please select another truncation strategy than {truncation_strategy}, "
-                    "for instance 'longest_first' or 'only_first'."
-                )
-
-        return (
-            ids,
-            token_boxes,
-            pair_ids,
-            pair_token_boxes,
-            labels,
-            overflowing_tokens,
-            overflowing_token_boxes,
-            overflowing_labels,
-        )
-
-    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer._pad
     def _pad(
         self,
         encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
@@ -1481,5 +892,31 @@ def _pad(
 
         return encoded_inputs
 
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A RoBERTa sequence has the following format:
+
+        - single sequence: ` X `
+        - pair of sequences: ` A  B `
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+
+__all__ = ["LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast"]
 
-__all__ = ["LayoutLMv3Tokenizer"]
+# Backward alias
+LayoutLMv3TokenizerFast = LayoutLMv3Tokenizer
diff --git a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py b/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py
deleted file mode 100644
index d0407638595d..000000000000
--- a/src/transformers/models/layoutlmv3/tokenization_layoutlmv3_fast.py
+++ /dev/null
@@ -1,848 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Fast tokenization class for LayoutLMv3. It overwrites 2 methods of the slow tokenizer class, namely _batch_encode_plus
-and _encode_plus, in which the Rust tokenizer is used.
-"""
-
-import json
-from typing import Optional, Union
-
-from tokenizers import processors
-
-from ...tokenization_utils_base import (
-    BatchEncoding,
-    EncodedInput,
-    PaddingStrategy,
-    PreTokenizedInput,
-    TensorType,
-    TextInput,
-    TextInputPair,
-    TruncationStrategy,
-)
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import add_end_docstrings, logging
-from .tokenization_layoutlmv3 import (
-    LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING,
-    LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
-    LayoutLMv3Tokenizer,
-)
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
-
-
-class LayoutLMv3TokenizerFast(PreTrainedTokenizerFast):
-    r"""
-    Construct a "fast" LayoutLMv3 tokenizer (backed by HuggingFace's *tokenizers* library). Based on BPE.
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        merges_file (`str`):
-            Path to the merges file.
-        errors (`str`, *optional*, defaults to `"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See
-            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
-        bos_token (`str`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        mask_token (`str`, *optional*, defaults to `""`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        add_prefix_space (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word. (RoBERTa tokenizer detect beginning of words by the preceding space).
-        trim_offsets (`bool`, *optional*, defaults to `True`):
-            Whether the post processing step should trim offsets to avoid including whitespaces.
-        cls_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
-            The bounding box to use for the special [CLS] token.
-        sep_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
-            The bounding box to use for the special [SEP] token.
-        pad_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
-            The bounding box to use for the special [PAD] token.
-        pad_token_label (`int`, *optional*, defaults to -100):
-            The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
-            CrossEntropyLoss.
-        only_label_first_subword (`bool`, *optional*, defaults to `True`):
-            Whether or not to only label the first subword, in case word labels are provided.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = LayoutLMv3Tokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        merges_file=None,
-        tokenizer_file=None,
-        errors="replace",
-        bos_token="",
-        eos_token="",
-        sep_token="",
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        mask_token="",
-        add_prefix_space=True,
-        trim_offsets=True,
-        cls_token_box=[0, 0, 0, 0],
-        sep_token_box=[0, 0, 0, 0],
-        pad_token_box=[0, 0, 0, 0],
-        pad_token_label=-100,
-        only_label_first_subword=True,
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file,
-            merges_file,
-            tokenizer_file=tokenizer_file,
-            errors=errors,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            mask_token=mask_token,
-            add_prefix_space=add_prefix_space,
-            trim_offsets=trim_offsets,
-            cls_token_box=cls_token_box,
-            sep_token_box=sep_token_box,
-            pad_token_box=pad_token_box,
-            pad_token_label=pad_token_label,
-            only_label_first_subword=only_label_first_subword,
-            **kwargs,
-        )
-
-        tokenizer_component = "post_processor"
-        tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
-        if tokenizer_component_instance:
-            state = json.loads(tokenizer_component_instance.__getstate__())
-
-            # The lists 'sep' and 'cls' must be cased in tuples for the object `post_processor_class`
-            if "sep" in state:
-                state["sep"] = tuple(state["sep"])
-            if "cls" in state:
-                state["cls"] = tuple(state["cls"])
-
-            changes_to_apply = False
-
-            if state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-                state["add_prefix_space"] = add_prefix_space
-                changes_to_apply = True
-
-            if state.get("trim_offsets", trim_offsets) != trim_offsets:
-                state["trim_offsets"] = trim_offsets
-                changes_to_apply = True
-
-            if changes_to_apply:
-                component_class = getattr(processors, state.pop("type"))
-                new_value = component_class(**state)
-                setattr(self.backend_tokenizer, tokenizer_component, new_value)
-
-        # additional properties
-        self.cls_token_box = cls_token_box
-        self.sep_token_box = sep_token_box
-        self.pad_token_box = pad_token_box
-        self.pad_token_label = pad_token_label
-        self.only_label_first_subword = only_label_first_subword
-
-    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast.__call__
-    def __call__(
-        self,
-        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
-        text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
-        boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
-        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        """
-        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
-        sequences with word-level normalized bounding boxes and optional labels.
-
-        Args:
-            text (`str`, `List[str]`, `List[List[str]]`):
-                The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
-                (words of a single example or questions of a batch of examples) or a list of list of strings (batch of
-                words).
-            text_pair (`List[str]`, `List[List[str]]`):
-                The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
-                (pretokenized string).
-            boxes (`List[List[int]]`, `List[List[List[int]]]`):
-                Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
-            word_labels (`List[int]`, `List[List[int]]`, *optional*):
-                Word-level integer labels (for token classification tasks such as FUNSD, CORD).
-        """
-
-        # Input type checking for clearer error
-        def _is_valid_text_input(t):
-            if isinstance(t, str):
-                # Strings are fine
-                return True
-            elif isinstance(t, (list, tuple)):
-                # List are fine as long as they are...
-                if len(t) == 0:
-                    # ... empty
-                    return True
-                elif isinstance(t[0], str):
-                    # ... list of strings
-                    return True
-                elif isinstance(t[0], (list, tuple)):
-                    # ... list with an empty list or with a list of strings
-                    return len(t[0]) == 0 or isinstance(t[0][0], str)
-                else:
-                    return False
-            else:
-                return False
-
-        if text_pair is not None:
-            # in case text + text_pair are provided, text = questions, text_pair = words
-            if not _is_valid_text_input(text):
-                raise ValueError("text input must of type `str` (single example) or `List[str]` (batch of examples). ")
-            if not isinstance(text_pair, (list, tuple)):
-                raise ValueError(
-                    "Words must be of type `List[str]` (single pretokenized example), "
-                    "or `List[List[str]]` (batch of pretokenized examples)."
-                )
-        else:
-            # in case only text is provided => must be words
-            if not isinstance(text, (list, tuple)):
-                raise ValueError(
-                    "Words must be of type `List[str]` (single pretokenized example), "
-                    "or `List[List[str]]` (batch of pretokenized examples)."
-                )
-
-        if text_pair is not None:
-            is_batched = isinstance(text, (list, tuple))
-        else:
-            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
-
-        words = text if text_pair is None else text_pair
-        if boxes is None:
-            raise ValueError("You must provide corresponding bounding boxes")
-        if is_batched:
-            if len(words) != len(boxes):
-                raise ValueError("You must provide words and boxes for an equal amount of examples")
-            for words_example, boxes_example in zip(words, boxes):
-                if len(words_example) != len(boxes_example):
-                    raise ValueError("You must provide as many words as there are bounding boxes")
-        else:
-            if len(words) != len(boxes):
-                raise ValueError("You must provide as many words as there are bounding boxes")
-
-        if is_batched:
-            if text_pair is not None and len(text) != len(text_pair):
-                raise ValueError(
-                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
-                    f" {len(text_pair)}."
-                )
-            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
-            is_pair = bool(text_pair is not None)
-            return self.batch_encode_plus(
-                batch_text_or_text_pairs=batch_text_or_text_pairs,
-                is_pair=is_pair,
-                boxes=boxes,
-                word_labels=word_labels,
-                add_special_tokens=add_special_tokens,
-                padding=padding,
-                truncation=truncation,
-                max_length=max_length,
-                stride=stride,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_tensors=return_tensors,
-                return_token_type_ids=return_token_type_ids,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_offsets_mapping=return_offsets_mapping,
-                return_length=return_length,
-                verbose=verbose,
-                **kwargs,
-            )
-        else:
-            return self.encode_plus(
-                text=text,
-                text_pair=text_pair,
-                boxes=boxes,
-                word_labels=word_labels,
-                add_special_tokens=add_special_tokens,
-                padding=padding,
-                truncation=truncation,
-                max_length=max_length,
-                stride=stride,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_tensors=return_tensors,
-                return_token_type_ids=return_token_type_ids,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_offsets_mapping=return_offsets_mapping,
-                return_length=return_length,
-                verbose=verbose,
-                **kwargs,
-            )
-
-    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast.batch_encode_plus
-    def batch_encode_plus(
-        self,
-        batch_text_or_text_pairs: Union[
-            list[TextInput],
-            list[TextInputPair],
-            list[PreTokenizedInput],
-        ],
-        is_pair: Optional[bool] = None,
-        boxes: Optional[list[list[list[int]]]] = None,
-        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
-        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            pad_to_multiple_of=pad_to_multiple_of,
-            verbose=verbose,
-            **kwargs,
-        )
-
-        return self._batch_encode_plus(
-            batch_text_or_text_pairs=batch_text_or_text_pairs,
-            is_pair=is_pair,
-            boxes=boxes,
-            word_labels=word_labels,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            **kwargs,
-        )
-
-    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast.tokenize
-    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
-        batched_input = [(text, pair)] if pair else [text]
-        encodings = self._tokenizer.encode_batch(
-            batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
-        )
-
-        return encodings[0].tokens
-
-    @add_end_docstrings(LAYOUTLMV3_ENCODE_KWARGS_DOCSTRING, LAYOUTLMV3_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast.encode_plus
-    def encode_plus(
-        self,
-        text: Union[TextInput, PreTokenizedInput],
-        text_pair: Optional[PreTokenizedInput] = None,
-        boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[int]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        """
-        Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
-        `__call__` should be used instead.
-
-        Args:
-            text (`str`, `List[str]`, `List[List[str]]`):
-                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
-            text_pair (`List[str]` or `List[int]`, *optional*):
-                Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
-                list of list of strings (words of a batch of examples).
-        """
-
-        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
-        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            pad_to_multiple_of=pad_to_multiple_of,
-            verbose=verbose,
-            **kwargs,
-        )
-
-        return self._encode_plus(
-            text=text,
-            boxes=boxes,
-            text_pair=text_pair,
-            word_labels=word_labels,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            **kwargs,
-        )
-
-    def _batch_encode_plus(
-        self,
-        batch_text_or_text_pairs: Union[
-            list[TextInput],
-            list[TextInputPair],
-            list[PreTokenizedInput],
-        ],
-        is_pair: Optional[bool] = None,
-        boxes: Optional[list[list[list[int]]]] = None,
-        word_labels: Optional[list[list[int]]] = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[str] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-    ) -> BatchEncoding:
-        if not isinstance(batch_text_or_text_pairs, list):
-            raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")
-
-        # Set the truncation and padding strategy and restore the initial configuration
-        self.set_truncation_and_padding(
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-        )
-
-        if is_pair:
-            batch_text_or_text_pairs = [(text.split(), text_pair) for text, text_pair in batch_text_or_text_pairs]
-
-        encodings = self._tokenizer.encode_batch(
-            batch_text_or_text_pairs,
-            add_special_tokens=add_special_tokens,
-            is_pretokenized=True,  # we set this to True as LayoutLMv3 always expects pretokenized inputs
-        )
-
-        # Convert encoding to dict
-        # `Tokens` has type: tuple[
-        #                       list[dict[str, list[list[int]]]] or list[dict[str, 2D-Tensor]],
-        #                       list[EncodingFast]
-        #                    ]
-        # with nested dimensions corresponding to batch, overflows, sequence length
-        tokens_and_encodings = [
-            self._convert_encoding(
-                encoding=encoding,
-                return_token_type_ids=return_token_type_ids,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_offsets_mapping=True
-                if word_labels is not None
-                else return_offsets_mapping,  # we use offsets to create the labels
-                return_length=return_length,
-                verbose=verbose,
-            )
-            for encoding in encodings
-        ]
-
-        # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
-        # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
-        # (we say ~ because the number of overflow varies with the example in the batch)
-        #
-        # To match each overflowing sample with the original sample in the batch
-        # we add an overflow_to_sample_mapping array (see below)
-        sanitized_tokens = {}
-        for key in tokens_and_encodings[0][0]:
-            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
-            sanitized_tokens[key] = stack
-        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
-
-        # If returning overflowing tokens, we need to return a mapping
-        # from the batch idx to the original sample
-        if return_overflowing_tokens:
-            overflow_to_sample_mapping = []
-            for i, (toks, _) in enumerate(tokens_and_encodings):
-                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
-            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping
-
-        for input_ids in sanitized_tokens["input_ids"]:
-            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
-
-        # create the token boxes
-        token_boxes = []
-        for batch_index in range(len(sanitized_tokens["input_ids"])):
-            if return_overflowing_tokens:
-                original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
-            else:
-                original_index = batch_index
-            token_boxes_example = []
-            for id, sequence_id, word_id in zip(
-                sanitized_tokens["input_ids"][batch_index],
-                sanitized_encodings[batch_index].sequence_ids,
-                sanitized_encodings[batch_index].word_ids,
-            ):
-                if word_id is not None:
-                    if is_pair and sequence_id == 0:
-                        token_boxes_example.append(self.pad_token_box)
-                    else:
-                        token_boxes_example.append(boxes[original_index][word_id])
-                else:
-                    if id == self.cls_token_id:
-                        token_boxes_example.append(self.cls_token_box)
-                    elif id == self.sep_token_id:
-                        token_boxes_example.append(self.sep_token_box)
-                    elif id == self.pad_token_id:
-                        token_boxes_example.append(self.pad_token_box)
-                    else:
-                        raise ValueError("Id not recognized")
-            token_boxes.append(token_boxes_example)
-
-        sanitized_tokens["bbox"] = token_boxes
-
-        # optionally, create the labels
-        if word_labels is not None:
-            labels = []
-            for batch_index in range(len(sanitized_tokens["input_ids"])):
-                if return_overflowing_tokens:
-                    original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
-                else:
-                    original_index = batch_index
-                labels_example = []
-                previous_token_empty = False
-                for id, offset, word_id in zip(
-                    sanitized_tokens["input_ids"][batch_index],
-                    sanitized_tokens["offset_mapping"][batch_index],
-                    sanitized_encodings[batch_index].word_ids,
-                ):
-                    if word_id is not None:
-                        if self.only_label_first_subword:
-                            if offset[0] == 0 and not previous_token_empty:
-                                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
-                                labels_example.append(word_labels[original_index][word_id])
-                            else:
-                                labels_example.append(self.pad_token_label)
-                            if offset == (0, 0):
-                                previous_token_empty = True
-                            else:
-                                previous_token_empty = False
-                        else:
-                            labels_example.append(word_labels[original_index][word_id])
-                    else:
-                        labels_example.append(self.pad_token_label)
-                labels.append(labels_example)
-
-            sanitized_tokens["labels"] = labels
-            # finally, remove offsets if the user didn't want them
-            if not return_offsets_mapping:
-                del sanitized_tokens["offset_mapping"]
-
-        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
-
-    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast._encode_plus
-    def _encode_plus(
-        self,
-        text: Union[TextInput, PreTokenizedInput],
-        text_pair: Optional[PreTokenizedInput] = None,
-        boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[int]] = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[bool] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        # make it a batched input
-        # 2 options:
-        # 1) only text, in case text must be a list of str
-        # 2) text + text_pair, in which case text = str and text_pair a list of str
-        batched_input = [(text, text_pair)] if text_pair else [text]
-        batched_boxes = [boxes]
-        batched_word_labels = [word_labels] if word_labels is not None else None
-        batched_output = self._batch_encode_plus(
-            batched_input,
-            is_pair=bool(text_pair is not None),
-            boxes=batched_boxes,
-            word_labels=batched_word_labels,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            **kwargs,
-        )
-
-        # Return tensor is None, then we can remove the leading batch axis
-        # Overflowing tokens are returned as a batch of output so we keep them in this case
-        if return_tensors is None and not return_overflowing_tokens:
-            batched_output = BatchEncoding(
-                {
-                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
-                    for key, value in batched_output.items()
-                },
-                batched_output.encodings,
-            )
-
-        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
-
-        return batched_output
-
-    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast._pad
-    def _pad(
-        self,
-        encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
-        max_length: Optional[int] = None,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """
-        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
-
-        Args:
-            encoded_inputs:
-                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
-            max_length: maximum length of the returned list and optionally padding length (see below).
-                Will truncate by taking into account the special tokens.
-            padding_strategy: PaddingStrategy to use for padding.
-
-                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
-                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
-                - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
-
-                    - 'left': pads on the left of the sequences
-                    - 'right': pads on the right of the sequences
-            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
-                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
-                `>= 7.5` (Volta).
-            padding_side:
-                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
-                Default value is picked from the class attribute of the same name.
-            return_attention_mask:
-                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
-        """
-        # Load from model defaults
-        if return_attention_mask is None:
-            return_attention_mask = "attention_mask" in self.model_input_names
-
-        required_input = encoded_inputs[self.model_input_names[0]]
-
-        if padding_strategy == PaddingStrategy.LONGEST:
-            max_length = len(required_input)
-
-        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
-            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
-
-        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-
-        # Initialize attention mask if not present.
-        if return_attention_mask and "attention_mask" not in encoded_inputs:
-            encoded_inputs["attention_mask"] = [1] * len(required_input)
-
-        if needs_to_be_padded:
-            difference = max_length - len(required_input)
-            padding_side = padding_side if padding_side is not None else self.padding_side
-            if padding_side == "right":
-                if return_attention_mask:
-                    encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
-                if "token_type_ids" in encoded_inputs:
-                    encoded_inputs["token_type_ids"] = (
-                        encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
-                    )
-                if "bbox" in encoded_inputs:
-                    encoded_inputs["bbox"] = encoded_inputs["bbox"] + [self.pad_token_box] * difference
-                if "labels" in encoded_inputs:
-                    encoded_inputs["labels"] = encoded_inputs["labels"] + [self.pad_token_label] * difference
-                if "special_tokens_mask" in encoded_inputs:
-                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
-                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
-            elif padding_side == "left":
-                if return_attention_mask:
-                    encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
-                if "token_type_ids" in encoded_inputs:
-                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
-                        "token_type_ids"
-                    ]
-                if "bbox" in encoded_inputs:
-                    encoded_inputs["bbox"] = [self.pad_token_box] * difference + encoded_inputs["bbox"]
-                if "labels" in encoded_inputs:
-                    encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["labels"]
-                if "special_tokens_mask" in encoded_inputs:
-                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
-                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
-            else:
-                raise ValueError("Invalid padding strategy:" + str(padding_side))
-
-        return encoded_inputs
-
-    # Copied from transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
-        if token_ids_1 is None:
-            return output
-
-        return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Args:
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not:
-        make use of token type ids, therefore a list of zeros is returned.
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-        Returns:
-            `list[int]`: List of zeros.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-
-__all__ = ["LayoutLMv3TokenizerFast"]
diff --git a/src/transformers/models/layoutxlm/__init__.py b/src/transformers/models/layoutxlm/__init__.py
index 9b338ce14185..b841f63c70b9 100644
--- a/src/transformers/models/layoutxlm/__init__.py
+++ b/src/transformers/models/layoutxlm/__init__.py
@@ -21,7 +21,6 @@
     from .configuration_layoutxlm import *
     from .processing_layoutxlm import *
     from .tokenization_layoutxlm import *
-    from .tokenization_layoutxlm_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
index 9c1d5c05a9f9..09b9d1098c12 100644
--- a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
+++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
@@ -14,13 +14,12 @@
 # limitations under the License
 """Tokenization classes for LayoutXLM model."""
 
-import os
-from shutil import copyfile
-from typing import Any, Optional, Union
+from typing import Optional, Union
 
-import sentencepiece as spm
+from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
+from tokenizers.models import Unigram
 
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_python import AddedToken
 from ...tokenization_utils_base import (
     BatchEncoding,
     EncodedInput,
@@ -28,17 +27,15 @@
     TextInput,
     TextInputPair,
     TruncationStrategy,
+    _get_prepend_scheme,
 )
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging
-from ...utils.import_utils import requires
-from ..xlm_roberta.tokenization_xlm_roberta import (
-    SPIECE_UNDERLINE,
-    VOCAB_FILES_NAMES,
-)
 
 
 logger = logging.get_logger(__name__)
 
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
 
 LAYOUTXLM_ENCODE_KWARGS_DOCSTRING = r"""
             add_special_tokens (`bool`, *optional*, defaults to `True`):
@@ -143,18 +140,18 @@
 """
 
 
-@requires(backends=("sentencepiece",))
-class LayoutXLMTokenizer(PreTrainedTokenizer):
+class LayoutXLMTokenizer(TokenizersBackend):
     """
-    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
-    [SentencePiece](https://github.com/google/sentencepiece).
+    Construct a "fast" LayoutXLM tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
+    [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
+    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
 
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
+        vocab (`list[tuple[str, float]]`, *optional*):
+            Vocabulary for the tokenizer as a list of (token, score) tuples.
         bos_token (`str`, *optional*, defaults to `""`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
@@ -201,33 +198,20 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
             CrossEntropyLoss.
         only_label_first_subword (`bool`, *optional*, defaults to `True`):
             Whether or not to only label the first subword, in case word labels are provided.
-        sp_model_kwargs (`dict`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
-            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
-            to set:
-
-            - `enable_sampling`: Enable subword regularization.
-            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
-
-              - `nbest_size = {0,1}`: No sampling is performed.
-              - `nbest_size > 1`: samples from the nbest_size results.
-              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
-                using forward-filtering-and-backward-sampling algorithm.
-
-            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
-              BPE-dropout.
-
-    Attributes:
-        sp_model (`SentencePieceProcessor`):
-            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
+        add_prefix_space (`bool`, *optional*, defaults to `True`):
+            Whether or not to add an initial space to the input.
+        additional_special_tokens (`list[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`):
+            Additional special tokens used by the tokenizer.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
 
     def __init__(
         self,
-        vocab_file,
+        vocab_file=None,
+        vocab=None,
         bos_token="",
         eos_token="",
         sep_token="",
@@ -240,32 +224,65 @@ def __init__(
         pad_token_box=[0, 0, 0, 0],
         pad_token_label=-100,
         only_label_first_subword=True,
-        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        add_prefix_space=True,
         **kwargs,
-    ) -> None:
+    ):
         # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
-
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
 
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(str(vocab_file))
-        self.vocab_file = vocab_file
+        self.add_prefix_space = add_prefix_space
 
-        # Original fairseq vocab and spm vocab must be "aligned":
-        # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
-        # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
-        # fairseq  | ''   | '' | '' | '' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
-        # spm      | '' | ''   | '' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'
-
-        # Mimic fairseq token-to-id alignment for the first 4 token
-        self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3}
+        # Build vocab from list of tuples if provided, else use default
+        # Handle both list of tuples (when creating) and dict (when loading)
+        if vocab is not None:
+            if isinstance(vocab, dict):
+                # Convert dict to list of tuples
+                self._vocab = [(token, score) for token, score in vocab.items()]
+            else:
+                self._vocab = vocab
+        else:
+            self._vocab = [
+                ("", 0.0),
+                ("", 0.0),
+                ("", 0.0),
+                ("", 0.0),
+            ]
+            if mask_token not in [v[0] for v in self._vocab]:
+                self._vocab.append((str(mask_token), 0.0))
+
+        # Create the Unigram tokenizer
+        self._tokenizer = Tokenizer(Unigram(self._vocab, unk_id=3, byte_fallback=False))
+
+        # Set up normalizer (strip right, replace multiple spaces)
+        self._tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.Strip(left=False, right=True),
+                normalizers.Replace(Regex(" {2,}"), "▁"),
+            ]
+        )
 
-        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
-        self.fairseq_offset = 1
+        # Set up pre_tokenizer (Metaspace)
+        prepend_scheme = _get_prepend_scheme(add_prefix_space, self)
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)
+
+        # Set up decoder
+        self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)
+
+        # Set up post_processor for XLM-RoBERTa style
+        # Get token IDs
+        cls_token_id = self._get_token_id(str(cls_token))
+        sep_token_id = self._get_token_id(str(sep_token))
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=" $A ",
+            pair=" $A   $B ",
+            special_tokens=[
+                ("", cls_token_id),
+                ("", sep_token_id),
+            ],
+        )
 
-        self.fairseq_tokens_to_ids[""] = len(self.sp_model) + self.fairseq_offset
-        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+        tokenizer_object = self._tokenizer
 
         # additional properties
         self.cls_token_box = cls_token_box
@@ -275,165 +292,155 @@ def __init__(
         self.only_label_first_subword = only_label_first_subword
 
         super().__init__(
+            tokenizer_object=tokenizer_object,
             bos_token=bos_token,
             eos_token=eos_token,
-            unk_token=unk_token,
             sep_token=sep_token,
             cls_token=cls_token,
+            unk_token=unk_token,
             pad_token=pad_token,
             mask_token=mask_token,
+            vocab_file=vocab_file,
+            vocab=vocab,
+            add_prefix_space=add_prefix_space,
             cls_token_box=cls_token_box,
             sep_token_box=sep_token_box,
             pad_token_box=pad_token_box,
             pad_token_label=pad_token_label,
             only_label_first_subword=only_label_first_subword,
-            sp_model_kwargs=self.sp_model_kwargs,
             **kwargs,
         )
 
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__.update(d)
-
-        # for backward compatibility
-        if not hasattr(self, "sp_model_kwargs"):
-            self.sp_model_kwargs = {}
-
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An XLM-RoBERTa sequence has the following format:
-
-        - single sequence: ` X `
-        - pair of sequences: ` A  B `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
+        self.vocab_file = vocab_file
 
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
+    def _get_token_id(self, token: str) -> int:
+        """Helper to get token ID from vocab."""
+        for i, (t, _) in enumerate(self._vocab):
+            if t == token:
+                return i
+        return 3  # unk_id
 
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
+    def encode_plus(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[PreTokenizedInput] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[int]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
         """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        Tokenize and prepare for the model a sequence or a pair of sequences.
         """
+        # Get the padding and truncation strategies
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
 
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+        return self._encode_plus(
+            text=text,
+            text_pair=text_pair,
+            boxes=boxes,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
 
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
+    def batch_encode_plus(
+        self,
+        batch_text_or_text_pairs: Union[
+            list[TextInput],
+            list[TextInputPair],
+            list[PreTokenizedInput],
+        ],
+        is_pair: Optional[bool] = None,
+        boxes: Optional[list[list[list[int]]]] = None,
+        word_labels: Optional[list[list[int]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
         """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
-        not make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-
+        Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.
         """
-
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-    @property
-    def vocab_size(self):
-        return len(self.sp_model) + self.fairseq_offset + 1  # Add the  token
-
-    def get_vocab(self):
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def _tokenize(self, text: str) -> list[str]:
-        return self.sp_model.encode(text, out_type=str)
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        if token in self.fairseq_tokens_to_ids:
-            return self.fairseq_tokens_to_ids[token]
-        spm_id = self.sp_model.PieceToId(token)
-
-        # Need to return unknown token if the SP model returned 0
-        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        if index in self.fairseq_ids_to_tokens:
-            return self.fairseq_ids_to_tokens[index]
-        return self.sp_model.IdToPiece(index - self.fairseq_offset)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (strings for sub-words) in a single string."""
-        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
-        return out_string
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        # Get the padding and truncation strategies
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
         )
 
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (out_vocab_file,)
+        return self._batch_encode_plus(
+            batch_text_or_text_pairs=batch_text_or_text_pairs,
+            is_pair=is_pair,
+            boxes=boxes,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
 
     @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
     def __call__(
@@ -587,6 +594,21 @@ def _is_valid_text_input(t):
                 **kwargs,
             )
 
+    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
+        batched_input = [(text, pair)] if pair else [text]
+
+        # Handle split_special_tokens parameter
+        # If split_special_tokens=True, we want encode_special_tokens=True (split the special tokens)
+        # If split_special_tokens=False, we want encode_special_tokens=False (keep special tokens whole)
+        split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens)
+        self._tokenizer.encode_special_tokens = split_special_tokens
+
+        encodings = self._tokenizer.encode_batch(
+            batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
+        )
+
+        return encodings[0].tokens
+
     def _batch_encode_plus(
         self,
         batch_text_or_text_pairs: Union[
@@ -604,7 +626,7 @@ def _batch_encode_plus(
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
         padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
         return_overflowing_tokens: bool = False,
@@ -614,109 +636,137 @@ def _batch_encode_plus(
         verbose: bool = True,
         **kwargs,
     ) -> BatchEncoding:
-        if return_offsets_mapping:
-            raise NotImplementedError(
-                "return_offset_mapping is not available when using Python tokenizers. "
-                "To use this feature, change your tokenizer to one deriving from "
-                "transformers.PreTrainedTokenizerFast."
-            )
+        if not isinstance(batch_text_or_text_pairs, list):
+            raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")
 
-        batch_outputs = self._batch_prepare_for_model(
-            batch_text_or_text_pairs=batch_text_or_text_pairs,
-            is_pair=is_pair,
-            boxes=boxes,
-            word_labels=word_labels,
-            add_special_tokens=add_special_tokens,
+        # Set the truncation and padding strategy and restore the initial configuration
+        self.set_truncation_and_padding(
             padding_strategy=padding_strategy,
             truncation_strategy=truncation_strategy,
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
             padding_side=padding_side,
-            return_attention_mask=return_attention_mask,
-            return_token_type_ids=return_token_type_ids,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_length=return_length,
-            return_tensors=return_tensors,
-            verbose=verbose,
         )
 
-        return BatchEncoding(batch_outputs)
-
-    @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
-    def _batch_prepare_for_model(
-        self,
-        batch_text_or_text_pairs,
-        is_pair: Optional[bool] = None,
-        boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[list[int]]] = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[str] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-    ) -> BatchEncoding:
-        """
-        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
-        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
-        manages a moving window (with user defined stride) for overflowing tokens
+        if is_pair:
+            batch_text_or_text_pairs = [(text.split(), text_pair) for text, text_pair in batch_text_or_text_pairs]
 
-        Args:
-            batch_ids_pairs: list of tokenized input ids or input ids pairs
-        """
+        encodings = self._tokenizer.encode_batch(
+            batch_text_or_text_pairs,
+            add_special_tokens=add_special_tokens,
+            is_pretokenized=True,  # we set this to True as LayoutLMv2 always expects pretokenized inputs
+        )
 
-        batch_outputs = {}
-        for idx, example in enumerate(zip(batch_text_or_text_pairs, boxes)):
-            batch_text_or_text_pair, boxes_example = example
-            outputs = self.prepare_for_model(
-                batch_text_or_text_pair[0] if is_pair else batch_text_or_text_pair,
-                batch_text_or_text_pair[1] if is_pair else None,
-                boxes_example,
-                word_labels=word_labels[idx] if word_labels is not None else None,
-                add_special_tokens=add_special_tokens,
-                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
-                truncation=truncation_strategy.value,
-                max_length=max_length,
-                stride=stride,
-                pad_to_multiple_of=None,  # we pad in batch afterward
-                padding_side=None,  # we pad in batch afterward
-                return_attention_mask=False,  # we pad in batch afterward
+        # Convert encoding to dict
+        # `Tokens` has type: tuple[
+        #                       list[dict[str, list[list[int]]]] or list[dict[str, 2D-Tensor]],
+        #                       list[EncodingFast]
+        #                    ]
+        # with nested dimensions corresponding to batch, overflows, sequence length
+        tokens_and_encodings = [
+            self._convert_encoding(
+                encoding=encoding,
                 return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
                 return_overflowing_tokens=return_overflowing_tokens,
                 return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=True
+                if word_labels is not None
+                else return_offsets_mapping,  # we use offsets to create the labels
                 return_length=return_length,
-                return_tensors=None,  # We convert the whole batch to tensors at the end
-                prepend_batch_axis=False,
                 verbose=verbose,
             )
+            for encoding in encodings
+        ]
+
+        # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
+        # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
+        # (we say ~ because the number of overflow varies with the example in the batch)
+        #
+        # To match each overflowing sample with the original sample in the batch
+        # we add an overflow_to_sample_mapping array (see below)
+        sanitized_tokens = {}
+        for key in tokens_and_encodings[0][0]:
+            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
+            sanitized_tokens[key] = stack
+        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
+
+        # If returning overflowing tokens, we need to return a mapping
+        # from the batch idx to the original sample
+        if return_overflowing_tokens:
+            overflow_to_sample_mapping = []
+            for i, (toks, _) in enumerate(tokens_and_encodings):
+                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
+            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping
 
-            for key, value in outputs.items():
-                if key not in batch_outputs:
-                    batch_outputs[key] = []
-                batch_outputs[key].append(value)
+        for input_ids in sanitized_tokens["input_ids"]:
+            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
 
-        batch_outputs = self.pad(
-            batch_outputs,
-            padding=padding_strategy.value,
-            max_length=max_length,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_attention_mask=return_attention_mask,
-        )
+        # create the token boxes
+        token_boxes = []
+        for batch_index in range(len(sanitized_tokens["input_ids"])):
+            if return_overflowing_tokens:
+                original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
+            else:
+                original_index = batch_index
+            token_boxes_example = []
+            for id, sequence_id, word_id in zip(
+                sanitized_tokens["input_ids"][batch_index],
+                sanitized_encodings[batch_index].sequence_ids,
+                sanitized_encodings[batch_index].word_ids,
+            ):
+                if word_id is not None:
+                    if is_pair and sequence_id == 0:
+                        token_boxes_example.append(self.pad_token_box)
+                    else:
+                        token_boxes_example.append(boxes[original_index][word_id])
+                else:
+                    if id == self.cls_token_id:
+                        token_boxes_example.append(self.cls_token_box)
+                    elif id == self.sep_token_id:
+                        token_boxes_example.append(self.sep_token_box)
+                    elif id == self.pad_token_id:
+                        token_boxes_example.append(self.pad_token_box)
+                    else:
+                        raise ValueError("Id not recognized")
+            token_boxes.append(token_boxes_example)
 
-        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
+        sanitized_tokens["bbox"] = token_boxes
 
-        return batch_outputs
+        # optionally, create the labels
+        if word_labels is not None:
+            labels = []
+            for batch_index in range(len(sanitized_tokens["input_ids"])):
+                if return_overflowing_tokens:
+                    original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
+                else:
+                    original_index = batch_index
+                labels_example = []
+                for id, offset, word_id in zip(
+                    sanitized_tokens["input_ids"][batch_index],
+                    sanitized_tokens["offset_mapping"][batch_index],
+                    sanitized_encodings[batch_index].word_ids,
+                ):
+                    if word_id is not None:
+                        if self.only_label_first_subword:
+                            if offset[0] == 0:
+                                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
+                                labels_example.append(word_labels[original_index][word_id])
+                            else:
+                                labels_example.append(self.pad_token_label)
+                        else:
+                            labels_example.append(word_labels[original_index][word_id])
+                    else:
+                        labels_example.append(self.pad_token_label)
+                labels.append(labels_example)
+
+            sanitized_tokens["labels"] = labels
+            # finally, remove offsets if the user didn't want them
+            if not return_offsets_mapping:
+                del sanitized_tokens["offset_mapping"]
+
+        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
 
     def _encode_plus(
         self,
@@ -731,7 +781,7 @@ def _encode_plus(
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
         padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_tensors: Optional[bool] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
         return_overflowing_tokens: bool = False,
@@ -741,361 +791,50 @@ def _encode_plus(
         verbose: bool = True,
         **kwargs,
     ) -> BatchEncoding:
-        if return_offsets_mapping:
-            raise NotImplementedError(
-                "return_offset_mapping is not available when using Python tokenizers. "
-                "To use this feature, change your tokenizer to one deriving from "
-                "transformers.PreTrainedTokenizerFast. "
-                "More information on available tokenizers at "
-                "https://github.com/huggingface/transformers/pull/2674"
-            )
-
-        return self.prepare_for_model(
-            text=text,
-            text_pair=text_pair,
-            boxes=boxes,
-            word_labels=word_labels,
+        # make it a batched input
+        # 2 options:
+        # 1) only text, in case text must be a list of str
+        # 2) text + text_pair, in which case text = str and text_pair a list of str
+        batched_input = [(text, text_pair)] if text_pair else [text]
+        batched_boxes = [boxes]
+        batched_word_labels = [word_labels] if word_labels is not None else None
+        batched_output = self._batch_encode_plus(
+            batched_input,
+            is_pair=bool(text_pair is not None),
+            boxes=batched_boxes,
+            word_labels=batched_word_labels,
             add_special_tokens=add_special_tokens,
-            padding=padding_strategy.value,
-            truncation=truncation_strategy.value,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
             padding_side=padding_side,
             return_tensors=return_tensors,
-            prepend_batch_axis=True,
-            return_attention_mask=return_attention_mask,
             return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
             return_overflowing_tokens=return_overflowing_tokens,
             return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
             return_length=return_length,
             verbose=verbose,
-        )
-
-    @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
-    def prepare_for_model(
-        self,
-        text: Union[TextInput, PreTokenizedInput],
-        text_pair: Optional[PreTokenizedInput] = None,
-        boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[int]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        prepend_batch_axis: bool = False,
-        **kwargs,
-    ) -> BatchEncoding:
-        """
-        Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens,
-        truncates sequences if overflowing while taking into account the special tokens and manages a moving window
-        (with user defined stride) for overflowing tokens.
-
-        Word-level `boxes` are turned into token-level `bbox`. If provided, word-level `word_labels` are turned into
-        token-level `labels`. The word label is used for the first token of the word, while remaining tokens are
-        labeled with -100, such that they will be ignored by the loss function.
-
-        Args:
-            text (`str`, `list[str]`, `list[list[str]]`):
-                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
-            text_pair (`list[str]` or `list[int]`, *optional*):
-                Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
-                list of list of strings (words of a batch of examples).
-        """
-
-        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
-        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            pad_to_multiple_of=pad_to_multiple_of,
-            verbose=verbose,
             **kwargs,
         )
 
-        tokens = []
-        pair_tokens = []
-        token_boxes = []
-        pair_token_boxes = []
-        labels = []
-
-        if text_pair is None:
-            if word_labels is None:
-                # CASE 1: document image classification (training + inference) + CASE 2: token classification (inference)
-                for word, box in zip(text, boxes):
-                    if len(word) < 1:  # skip empty words
-                        continue
-                    word_tokens = self.tokenize(word)
-                    tokens.extend(word_tokens)
-                    token_boxes.extend([box] * len(word_tokens))
-            else:
-                # CASE 2: token classification (training)
-                for word, box, label in zip(text, boxes, word_labels):
-                    if len(word) < 1:  # skip empty words
-                        continue
-                    word_tokens = self.tokenize(word)
-                    tokens.extend(word_tokens)
-                    token_boxes.extend([box] * len(word_tokens))
-                    if self.only_label_first_subword:
-                        # Use the real label id for the first token of the word, and padding ids for the remaining tokens
-                        labels.extend([label] + [self.pad_token_label] * (len(word_tokens) - 1))
-                    else:
-                        labels.extend([label] * len(word_tokens))
-        else:
-            # CASE 3: document visual question answering (inference)
-            # text = question
-            # text_pair = words
-            tokens = self.tokenize(text)
-            token_boxes = [self.pad_token_box for _ in range(len(tokens))] + [self.sep_token_box]
-
-            for word, box in zip(text_pair, boxes):
-                if len(word) < 1:  # skip empty words
-                    continue
-                word_tokens = self.tokenize(word)
-                pair_tokens.extend(word_tokens)
-                pair_token_boxes.extend([box] * len(word_tokens))
-
-        # Create ids + pair_ids
-        ids = self.convert_tokens_to_ids(tokens)
-        pair_ids = self.convert_tokens_to_ids(pair_tokens) if pair_tokens else None
-
-        # Compute the total size of the returned encodings
-        pair = bool(pair_ids is not None)
-        len_ids = len(ids)
-        len_pair_ids = len(pair_ids) if pair else 0
-        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)
-
-        # Truncation: Handle max sequence length
-        overflowing_tokens = []
-        overflowing_token_boxes = []
-        overflowing_labels = []
-        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
-            (
-                ids,
-                token_boxes,
-                pair_ids,
-                pair_token_boxes,
-                labels,
-                overflowing_tokens,
-                overflowing_token_boxes,
-                overflowing_labels,
-            ) = self.truncate_sequences(
-                ids,
-                token_boxes,
-                pair_ids=pair_ids,
-                pair_token_boxes=pair_token_boxes,
-                labels=labels,
-                num_tokens_to_remove=total_len - max_length,
-                truncation_strategy=truncation_strategy,
-                stride=stride,
-            )
-
-        if return_token_type_ids and not add_special_tokens:
-            raise ValueError(
-                "Asking to return token_type_ids while setting add_special_tokens to False "
-                "results in an undefined behavior. Please set add_special_tokens to True or "
-                "set return_token_type_ids to None."
+        # Return tensor is None, then we can remove the leading batch axis
+        # Overflowing tokens are returned as a batch of output so we keep them in this case
+        if return_tensors is None and not return_overflowing_tokens:
+            batched_output = BatchEncoding(
+                {
+                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
+                    for key, value in batched_output.items()
+                },
+                batched_output.encodings,
             )
 
-        # Load from model defaults
-        if return_token_type_ids is None:
-            return_token_type_ids = "token_type_ids" in self.model_input_names
-        if return_attention_mask is None:
-            return_attention_mask = "attention_mask" in self.model_input_names
-
-        encoded_inputs = {}
-
-        if return_overflowing_tokens:
-            encoded_inputs["overflowing_tokens"] = overflowing_tokens
-            encoded_inputs["overflowing_token_boxes"] = overflowing_token_boxes
-            encoded_inputs["overflowing_labels"] = overflowing_labels
-            encoded_inputs["num_truncated_tokens"] = total_len - max_length
-
-        # Add special tokens
-        if add_special_tokens:
-            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
-            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
-            token_boxes = [self.cls_token_box] + token_boxes + [self.sep_token_box]
-            if pair_token_boxes:
-                pair_token_boxes = pair_token_boxes + [self.sep_token_box]
-            if labels:
-                labels = [self.pad_token_label] + labels + [self.pad_token_label]
-        else:
-            sequence = ids + pair_ids if pair else ids
-            token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
-
-        # Build output dictionary
-        encoded_inputs["input_ids"] = sequence
-        encoded_inputs["bbox"] = token_boxes + pair_token_boxes
-        if return_token_type_ids:
-            encoded_inputs["token_type_ids"] = token_type_ids
-        if return_special_tokens_mask:
-            if add_special_tokens:
-                encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
-            else:
-                encoded_inputs["special_tokens_mask"] = [0] * len(sequence)
-
-        if labels:
-            encoded_inputs["labels"] = labels
-
-        # Check lengths
-        self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)
-
-        # Padding
-        if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
-            encoded_inputs = self.pad(
-                encoded_inputs,
-                max_length=max_length,
-                padding=padding_strategy.value,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_attention_mask=return_attention_mask,
-            )
-
-        if return_length:
-            encoded_inputs["length"] = len(encoded_inputs["input_ids"])
-
-        batch_outputs = BatchEncoding(
-            encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
-        )
-
-        return batch_outputs
-
-    def truncate_sequences(
-        self,
-        ids: list[int],
-        token_boxes: list[list[int]],
-        pair_ids: Optional[list[int]] = None,
-        pair_token_boxes: Optional[list[list[int]]] = None,
-        labels: Optional[list[int]] = None,
-        num_tokens_to_remove: int = 0,
-        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
-        stride: int = 0,
-    ) -> tuple[list[int], list[int], list[int]]:
-        """
-        Truncates a sequence pair in-place following the strategy.
+        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
 
-        Args:
-            ids (`list[int]`):
-                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and
-                `convert_tokens_to_ids` methods.
-            token_boxes (`list[list[int]]`):
-                Bounding boxes of the first sequence.
-            pair_ids (`list[int]`, *optional*):
-                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
-                and `convert_tokens_to_ids` methods.
-            pair_token_boxes (`list[list[int]]`, *optional*):
-                Bounding boxes of the second sequence.
-            labels (`list[int]`, *optional*):
-                Labels of the first sequence (for token classification tasks).
-            num_tokens_to_remove (`int`, *optional*, defaults to 0):
-                Number of tokens to remove using the truncation strategy.
-            truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
-                The strategy to follow for truncation. Can be:
-
-                - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will truncate
-                  token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a
-                  batch of pairs) is provided.
-                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater
-                  than the model maximum admissible input size).
-            stride (`int`, *optional*, defaults to 0):
-                If set to a positive number, the overflowing tokens returned will contain some tokens from the main
-                sequence returned. The value of this argument defines the number of additional tokens.
-
-        Returns:
-            `tuple[list[int], list[int], list[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of
-            overflowing tokens.
-        """
-        if num_tokens_to_remove <= 0:
-            return ids, token_boxes, pair_ids, pair_token_boxes, labels, [], [], []
-
-        if not isinstance(truncation_strategy, TruncationStrategy):
-            truncation_strategy = TruncationStrategy(truncation_strategy)
-
-        overflowing_tokens = []
-        overflowing_token_boxes = []
-        overflowing_labels = []
-        if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
-            for _ in range(num_tokens_to_remove):
-                if pair_ids is None or len(ids) > len(pair_ids):
-                    if not overflowing_tokens:
-                        window_len = min(len(ids), stride + 1)
-                    else:
-                        window_len = 1
-                    overflowing_tokens.extend(ids[-window_len:])
-                    overflowing_token_boxes.extend(token_boxes[-window_len:])
-                    overflowing_labels.extend(labels[-window_len:])
-                    ids = ids[:-1]
-                    token_boxes = token_boxes[:-1]
-                    labels = labels[:-1]
-                else:
-                    if not overflowing_tokens:
-                        window_len = min(len(pair_ids), stride + 1)
-                    else:
-                        window_len = 1
-                    overflowing_tokens.extend(pair_ids[-window_len:])
-                    overflowing_token_boxes.extend(pair_token_boxes[-window_len:])
-                    pair_ids = pair_ids[:-1]
-                    pair_token_boxes = pair_token_boxes[:-1]
-        elif truncation_strategy == TruncationStrategy.ONLY_FIRST:
-            if len(ids) > num_tokens_to_remove:
-                window_len = min(len(ids), stride + num_tokens_to_remove)
-                overflowing_tokens = ids[-window_len:]
-                overflowing_token_boxes = token_boxes[-window_len:]
-                overflowing_labels = labels[-window_len:]
-                ids = ids[:-num_tokens_to_remove]
-                token_boxes = token_boxes[:-num_tokens_to_remove]
-                labels = labels[:-num_tokens_to_remove]
-            else:
-                logger.error(
-                    f"We need to remove {num_tokens_to_remove} to truncate the input "
-                    f"but the first sequence has a length {len(ids)}. "
-                    f"Please select another truncation strategy than {truncation_strategy}, "
-                    "for instance 'longest_first' or 'only_second'."
-                )
-        elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
-            if len(pair_ids) > num_tokens_to_remove:
-                window_len = min(len(pair_ids), stride + num_tokens_to_remove)
-                overflowing_tokens = pair_ids[-window_len:]
-                overflowing_token_boxes = pair_token_boxes[-window_len:]
-                pair_ids = pair_ids[:-num_tokens_to_remove]
-                pair_token_boxes = pair_token_boxes[:-num_tokens_to_remove]
-            else:
-                logger.error(
-                    f"We need to remove {num_tokens_to_remove} to truncate the input "
-                    f"but the second sequence has a length {len(pair_ids)}. "
-                    f"Please select another truncation strategy than {truncation_strategy}, "
-                    "for instance 'longest_first' or 'only_first'."
-                )
-
-        return (
-            ids,
-            token_boxes,
-            pair_ids,
-            pair_token_boxes,
-            labels,
-            overflowing_tokens,
-            overflowing_token_boxes,
-            overflowing_labels,
-        )
+        return batched_output
 
     def _pad(
         self,
@@ -1186,5 +925,56 @@ def _pad(
 
         return encoded_inputs
 
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. An XLM-RoBERTa sequence has the following format:
+
+        - single sequence: ` X `
+        - pair of sequences: ` A  B `
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
+        not make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of zeros.
+
+        """
+
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
 
 __all__ = ["LayoutXLMTokenizer"]
diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
deleted file mode 100644
index 7b08a3aa5f0e..000000000000
--- a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
+++ /dev/null
@@ -1,814 +0,0 @@
-# coding=utf-8
-# Copyright 2021 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License
-"""Tokenization classes for LayoutXLM model."""
-
-import os
-from shutil import copyfile
-from typing import Optional, Union
-
-from ...tokenization_utils import AddedToken
-from ...tokenization_utils_base import (
-    BatchEncoding,
-    EncodedInput,
-    PreTokenizedInput,
-    TextInput,
-    TextInputPair,
-    TruncationStrategy,
-)
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import PaddingStrategy, TensorType, add_end_docstrings, is_sentencepiece_available, logging
-from ..xlm_roberta.tokenization_xlm_roberta_fast import (
-    VOCAB_FILES_NAMES,
-)
-
-
-if is_sentencepiece_available():
-    from .tokenization_layoutxlm import LayoutXLMTokenizer
-else:
-    LayoutXLMTokenizer = None
-
-
-logger = logging.get_logger(__name__)
-
-LAYOUTXLM_ENCODE_KWARGS_DOCSTRING = r"""
-            add_special_tokens (`bool`, *optional*, defaults to `True`):
-                Whether or not to encode the sequences with the special tokens relative to their model.
-            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
-                Activates and controls padding. Accepts the following values:
-
-                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
-                  sequence if provided).
-                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-                  acceptable input length for the model if that argument is not provided.
-                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
-                  lengths).
-            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
-                Activates and controls truncation. Accepts the following values:
-
-                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
-                  to the maximum acceptable input length for the model if that argument is not provided. This will
-                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
-                  sequences (or a batch of pairs) is provided.
-                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
-                  greater than the model maximum admissible input size).
-            max_length (`int`, *optional*):
-                Controls the maximum length to use by one of the truncation/padding parameters.
-
-                If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
-                is required by one of the truncation/padding parameters. If the model has no specific maximum input
-                length (like XLNet) truncation/padding to a maximum length will be deactivated.
-            stride (`int`, *optional*, defaults to 0):
-                If set to a number along with `max_length`, the overflowing tokens returned when
-                `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
-                returned to provide some overlap between truncated and overflowing sequences. The value of this
-                argument defines the number of overlapping tokens.
-            pad_to_multiple_of (`int`, *optional*):
-                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
-                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
-            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
-                If set, will return tensors instead of list of python integers. Acceptable values are:
-
-                - `'pt'`: Return PyTorch `torch.Tensor` objects.
-                - `'np'`: Return Numpy `np.ndarray` objects.
-            return_token_type_ids (`bool`, *optional*):
-                Whether to return token type IDs. If left to the default, will return the token type IDs according to
-                the specific tokenizer's default, defined by the `return_outputs` attribute.
-
-                [What are token type IDs?](../glossary#token-type-ids)
-            return_attention_mask (`bool`, *optional*):
-                Whether to return the attention mask. If left to the default, will return the attention mask according
-                to the specific tokenizer's default, defined by the `return_outputs` attribute.
-
-                [What are attention masks?](../glossary#attention-mask)
-            return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
-                of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead
-                of returning overflowing tokens.
-            return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
-                Whether or not to return special tokens mask information.
-            return_offsets_mapping (`bool`, *optional*, defaults to `False`):
-                Whether or not to return `(char_start, char_end)` for each token.
-
-                This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using
-                Python's tokenizer, this method will raise `NotImplementedError`.
-            return_length  (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the lengths of the encoded inputs.
-            verbose (`bool`, *optional*, defaults to `True`):
-                Whether or not to print more information and warnings.
-            **kwargs: passed to the `self.tokenize()` method
-
-        Return:
-            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
-
-            - **input_ids** -- List of token ids to be fed to a model.
-
-              [What are input IDs?](../glossary#input-ids)
-
-            - **bbox** -- List of bounding boxes to be fed to a model.
-
-            - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True` or
-              if *"token_type_ids"* is in `self.model_input_names`).
-
-              [What are token type IDs?](../glossary#token-type-ids)
-
-            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
-              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
-
-              [What are attention masks?](../glossary#attention-mask)
-
-            - **labels** -- List of labels to be fed to a model. (when `word_labels` is specified).
-            - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
-              `return_overflowing_tokens=True`).
-            - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
-              `return_overflowing_tokens=True`).
-            - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
-              regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
-            - **length** -- The length of the inputs (when `return_length=True`).
-"""
-
-
-class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" LayoutXLM tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
-    [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
-    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        bos_token (`str`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        mask_token (`str`, *optional*, defaults to `""`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        cls_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
-            The bounding box to use for the special [CLS] token.
-        sep_token_box (`list[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`):
-            The bounding box to use for the special [SEP] token.
-        pad_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
-            The bounding box to use for the special [PAD] token.
-        pad_token_label (`int`, *optional*, defaults to -100):
-            The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
-            CrossEntropyLoss.
-        only_label_first_subword (`bool`, *optional*, defaults to `True`):
-            Whether or not to only label the first subword, in case word labels are provided.
-        additional_special_tokens (`list[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`):
-            Additional special tokens used by the tokenizer.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = LayoutXLMTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        bos_token="",
-        eos_token="",
-        sep_token="",
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        mask_token="",
-        cls_token_box=[0, 0, 0, 0],
-        sep_token_box=[1000, 1000, 1000, 1000],
-        pad_token_box=[0, 0, 0, 0],
-        pad_token_label=-100,
-        only_label_first_subword=True,
-        **kwargs,
-    ):
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            mask_token=mask_token,
-            cls_token_box=cls_token_box,
-            sep_token_box=sep_token_box,
-            pad_token_box=pad_token_box,
-            pad_token_label=pad_token_label,
-            only_label_first_subword=only_label_first_subword,
-            **kwargs,
-        )
-
-        self.vocab_file = vocab_file
-
-        # additional properties
-        self.cls_token_box = cls_token_box
-        self.sep_token_box = sep_token_box
-        self.pad_token_box = pad_token_box
-        self.pad_token_label = pad_token_label
-        self.only_label_first_subword = only_label_first_subword
-
-    @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
-    def __call__(
-        self,
-        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
-        text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
-        boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
-        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        """
-        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
-        sequences with word-level normalized bounding boxes and optional labels.
-
-        Args:
-            text (`str`, `list[str]`, `list[list[str]]`):
-                The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
-                (words of a single example or questions of a batch of examples) or a list of list of strings (batch of
-                words).
-            text_pair (`list[str]`, `list[list[str]]`):
-                The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
-                (pretokenized string).
-            boxes (`list[list[int]]`, `list[list[list[int]]]`):
-                Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
-            word_labels (`list[int]`, `list[list[int]]`, *optional*):
-                Word-level integer labels (for token classification tasks such as FUNSD, CORD).
-        """
-
-        # Input type checking for clearer error
-        def _is_valid_text_input(t):
-            if isinstance(t, str):
-                # Strings are fine
-                return True
-            elif isinstance(t, (list, tuple)):
-                # List are fine as long as they are...
-                if len(t) == 0:
-                    # ... empty
-                    return True
-                elif isinstance(t[0], str):
-                    # ... list of strings
-                    return True
-                elif isinstance(t[0], (list, tuple)):
-                    # ... list with an empty list or with a list of strings
-                    return len(t[0]) == 0 or isinstance(t[0][0], str)
-                else:
-                    return False
-            else:
-                return False
-
-        if text_pair is not None:
-            # in case text + text_pair are provided, text = questions, text_pair = words
-            if not _is_valid_text_input(text):
-                raise ValueError("text input must of type `str` (single example) or `list[str]` (batch of examples). ")
-            if not isinstance(text_pair, (list, tuple)):
-                raise ValueError(
-                    "words must of type `list[str]` (single pretokenized example), "
-                    "or `list[list[str]]` (batch of pretokenized examples)."
-                )
-        else:
-            # in case only text is provided => must be words
-            if not isinstance(text, (list, tuple)):
-                raise ValueError(
-                    "Words must of type `list[str]` (single pretokenized example), "
-                    "or `list[list[str]]` (batch of pretokenized examples)."
-                )
-
-        if text_pair is not None:
-            is_batched = isinstance(text, (list, tuple))
-        else:
-            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
-
-        words = text if text_pair is None else text_pair
-        if boxes is None:
-            raise ValueError("You must provide corresponding bounding boxes")
-        if is_batched:
-            if len(words) != len(boxes):
-                raise ValueError("You must provide words and boxes for an equal amount of examples")
-            for words_example, boxes_example in zip(words, boxes):
-                if len(words_example) != len(boxes_example):
-                    raise ValueError("You must provide as many words as there are bounding boxes")
-        else:
-            if len(words) != len(boxes):
-                raise ValueError("You must provide as many words as there are bounding boxes")
-
-        if is_batched:
-            if text_pair is not None and len(text) != len(text_pair):
-                raise ValueError(
-                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
-                    f" {len(text_pair)}."
-                )
-            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
-            is_pair = bool(text_pair is not None)
-            return self.batch_encode_plus(
-                batch_text_or_text_pairs=batch_text_or_text_pairs,
-                is_pair=is_pair,
-                boxes=boxes,
-                word_labels=word_labels,
-                add_special_tokens=add_special_tokens,
-                padding=padding,
-                truncation=truncation,
-                max_length=max_length,
-                stride=stride,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_tensors=return_tensors,
-                return_token_type_ids=return_token_type_ids,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_offsets_mapping=return_offsets_mapping,
-                return_length=return_length,
-                verbose=verbose,
-                **kwargs,
-            )
-        else:
-            return self.encode_plus(
-                text=text,
-                text_pair=text_pair,
-                boxes=boxes,
-                word_labels=word_labels,
-                add_special_tokens=add_special_tokens,
-                padding=padding,
-                truncation=truncation,
-                max_length=max_length,
-                stride=stride,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_tensors=return_tensors,
-                return_token_type_ids=return_token_type_ids,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_offsets_mapping=return_offsets_mapping,
-                return_length=return_length,
-                verbose=verbose,
-                **kwargs,
-            )
-
-    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
-        batched_input = [(text, pair)] if pair else [text]
-
-        self._tokenizer.encode_special_tokens = kwargs.pop(
-            "split_special_tokens", self._tokenizer.encode_special_tokens
-        )
-
-        encodings = self._tokenizer.encode_batch(
-            batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
-        )
-
-        return encodings[0].tokens
-
-    def _batch_encode_plus(
-        self,
-        batch_text_or_text_pairs: Union[
-            list[TextInput],
-            list[TextInputPair],
-            list[PreTokenizedInput],
-        ],
-        is_pair: Optional[bool] = None,
-        boxes: Optional[list[list[list[int]]]] = None,
-        word_labels: Optional[list[list[int]]] = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[str] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        if not isinstance(batch_text_or_text_pairs, list):
-            raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")
-
-        # Set the truncation and padding strategy and restore the initial configuration
-        self.set_truncation_and_padding(
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-        )
-
-        if is_pair:
-            batch_text_or_text_pairs = [(text.split(), text_pair) for text, text_pair in batch_text_or_text_pairs]
-
-        encodings = self._tokenizer.encode_batch(
-            batch_text_or_text_pairs,
-            add_special_tokens=add_special_tokens,
-            is_pretokenized=True,  # we set this to True as LayoutLMv2 always expects pretokenized inputs
-        )
-
-        # Convert encoding to dict
-        # `Tokens` has type: tuple[
-        #                       list[dict[str, list[list[int]]]] or list[dict[str, 2D-Tensor]],
-        #                       list[EncodingFast]
-        #                    ]
-        # with nested dimensions corresponding to batch, overflows, sequence length
-        tokens_and_encodings = [
-            self._convert_encoding(
-                encoding=encoding,
-                return_token_type_ids=return_token_type_ids,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_offsets_mapping=True
-                if word_labels is not None
-                else return_offsets_mapping,  # we use offsets to create the labels
-                return_length=return_length,
-                verbose=verbose,
-            )
-            for encoding in encodings
-        ]
-
-        # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
-        # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
-        # (we say ~ because the number of overflow varies with the example in the batch)
-        #
-        # To match each overflowing sample with the original sample in the batch
-        # we add an overflow_to_sample_mapping array (see below)
-        sanitized_tokens = {}
-        for key in tokens_and_encodings[0][0]:
-            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
-            sanitized_tokens[key] = stack
-        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
-
-        # If returning overflowing tokens, we need to return a mapping
-        # from the batch idx to the original sample
-        if return_overflowing_tokens:
-            overflow_to_sample_mapping = []
-            for i, (toks, _) in enumerate(tokens_and_encodings):
-                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
-            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping
-
-        for input_ids in sanitized_tokens["input_ids"]:
-            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
-
-        # create the token boxes
-        token_boxes = []
-        for batch_index in range(len(sanitized_tokens["input_ids"])):
-            if return_overflowing_tokens:
-                original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
-            else:
-                original_index = batch_index
-            token_boxes_example = []
-            for id, sequence_id, word_id in zip(
-                sanitized_tokens["input_ids"][batch_index],
-                sanitized_encodings[batch_index].sequence_ids,
-                sanitized_encodings[batch_index].word_ids,
-            ):
-                if word_id is not None:
-                    if is_pair and sequence_id == 0:
-                        token_boxes_example.append(self.pad_token_box)
-                    else:
-                        token_boxes_example.append(boxes[original_index][word_id])
-                else:
-                    if id == self.cls_token_id:
-                        token_boxes_example.append(self.cls_token_box)
-                    elif id == self.sep_token_id:
-                        token_boxes_example.append(self.sep_token_box)
-                    elif id == self.pad_token_id:
-                        token_boxes_example.append(self.pad_token_box)
-                    else:
-                        raise ValueError("Id not recognized")
-            token_boxes.append(token_boxes_example)
-
-        sanitized_tokens["bbox"] = token_boxes
-
-        # optionally, create the labels
-        if word_labels is not None:
-            labels = []
-            for batch_index in range(len(sanitized_tokens["input_ids"])):
-                if return_overflowing_tokens:
-                    original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
-                else:
-                    original_index = batch_index
-                labels_example = []
-                for id, offset, word_id in zip(
-                    sanitized_tokens["input_ids"][batch_index],
-                    sanitized_tokens["offset_mapping"][batch_index],
-                    sanitized_encodings[batch_index].word_ids,
-                ):
-                    if word_id is not None:
-                        if self.only_label_first_subword:
-                            if offset[0] == 0:
-                                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
-                                labels_example.append(word_labels[original_index][word_id])
-                            else:
-                                labels_example.append(self.pad_token_label)
-                        else:
-                            labels_example.append(word_labels[original_index][word_id])
-                    else:
-                        labels_example.append(self.pad_token_label)
-                labels.append(labels_example)
-
-            sanitized_tokens["labels"] = labels
-            # finally, remove offsets if the user didn't want them
-            if not return_offsets_mapping:
-                del sanitized_tokens["offset_mapping"]
-
-        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
-
-    def _encode_plus(
-        self,
-        text: Union[TextInput, PreTokenizedInput],
-        text_pair: Optional[PreTokenizedInput] = None,
-        boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[int]] = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[bool] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        # make it a batched input
-        # 2 options:
-        # 1) only text, in case text must be a list of str
-        # 2) text + text_pair, in which case text = str and text_pair a list of str
-        batched_input = [(text, text_pair)] if text_pair else [text]
-        batched_boxes = [boxes]
-        batched_word_labels = [word_labels] if word_labels is not None else None
-        batched_output = self._batch_encode_plus(
-            batched_input,
-            is_pair=bool(text_pair is not None),
-            boxes=batched_boxes,
-            word_labels=batched_word_labels,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            **kwargs,
-        )
-
-        # Return tensor is None, then we can remove the leading batch axis
-        # Overflowing tokens are returned as a batch of output so we keep them in this case
-        if return_tensors is None and not return_overflowing_tokens:
-            batched_output = BatchEncoding(
-                {
-                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
-                    for key, value in batched_output.items()
-                },
-                batched_output.encodings,
-            )
-
-        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
-
-        return batched_output
-
-    def _pad(
-        self,
-        encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
-        max_length: Optional[int] = None,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """
-        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
-
-        Args:
-            encoded_inputs:
-                Dictionary of tokenized inputs (`list[int]`) or batch of tokenized inputs (`list[list[int]]`).
-            max_length: maximum length of the returned list and optionally padding length (see below).
-                Will truncate by taking into account the special tokens.
-            padding_strategy: PaddingStrategy to use for padding.
-
-                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
-                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
-                - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
-
-                    - 'left': pads on the left of the sequences
-                    - 'right': pads on the right of the sequences
-            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
-                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
-                `>= 7.5` (Volta).
-            padding_side (`str`, *optional*):
-                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
-                Default value is picked from the class attribute of the same name.
-            return_attention_mask:
-                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
-        """
-        # Load from model defaults
-        if return_attention_mask is None:
-            return_attention_mask = "attention_mask" in self.model_input_names
-
-        required_input = encoded_inputs[self.model_input_names[0]]
-
-        if padding_strategy == PaddingStrategy.LONGEST:
-            max_length = len(required_input)
-
-        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
-            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
-
-        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-
-        # Initialize attention mask if not present.
-        if return_attention_mask and "attention_mask" not in encoded_inputs:
-            encoded_inputs["attention_mask"] = [1] * len(required_input)
-
-        if needs_to_be_padded:
-            difference = max_length - len(required_input)
-            padding_side = padding_side if padding_side is not None else self.padding_side
-            if padding_side == "right":
-                if return_attention_mask:
-                    encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
-                if "token_type_ids" in encoded_inputs:
-                    encoded_inputs["token_type_ids"] = (
-                        encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
-                    )
-                if "bbox" in encoded_inputs:
-                    encoded_inputs["bbox"] = encoded_inputs["bbox"] + [self.pad_token_box] * difference
-                if "labels" in encoded_inputs:
-                    encoded_inputs["labels"] = encoded_inputs["labels"] + [self.pad_token_label] * difference
-                if "special_tokens_mask" in encoded_inputs:
-                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
-                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
-            elif padding_side == "left":
-                if return_attention_mask:
-                    encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
-                if "token_type_ids" in encoded_inputs:
-                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
-                        "token_type_ids"
-                    ]
-                if "bbox" in encoded_inputs:
-                    encoded_inputs["bbox"] = [self.pad_token_box] * difference + encoded_inputs["bbox"]
-                if "labels" in encoded_inputs:
-                    encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["labels"]
-                if "special_tokens_mask" in encoded_inputs:
-                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
-                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
-            else:
-                raise ValueError("Invalid padding strategy:" + str(padding_side))
-
-        return encoded_inputs
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An XLM-RoBERTa sequence has the following format:
-
-        - single sequence: ` X `
-        - pair of sequences: ` A  B `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
-        not make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-
-        """
-
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not self.can_save_slow_tokenizer:
-            raise ValueError(
-                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
-                "tokenizer."
-            )
-
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
-
-
-__all__ = ["LayoutXLMTokenizerFast"]
diff --git a/src/transformers/models/led/__init__.py b/src/transformers/models/led/__init__.py
index 678b3af02aaf..a9609d25e71c 100644
--- a/src/transformers/models/led/__init__.py
+++ b/src/transformers/models/led/__init__.py
@@ -18,10 +18,9 @@
 
 
 if TYPE_CHECKING:
+    from ..roberta.tokenization_roberta import RobertaTokenizer as LEDTokenizer
     from .configuration_led import *
     from .modeling_led import *
-    from .tokenization_led import *
-    from .tokenization_led_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/led/tokenization_led.py b/src/transformers/models/led/tokenization_led.py
deleted file mode 100644
index d110ac30d969..000000000000
--- a/src/transformers/models/led/tokenization_led.py
+++ /dev/null
@@ -1,454 +0,0 @@
-# coding=utf-8
-# Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for LED."""
-
-import json
-import os
-from functools import lru_cache
-from typing import Optional, Union
-
-import regex as re
-
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
-from ...tokenization_utils_base import BatchEncoding, EncodedInput
-from ...utils import PaddingStrategy, logging
-
-
-logger = logging.get_logger(__name__)
-
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
-
-# See all LED models at https://huggingface.co/models?filter=LED
-
-
-@lru_cache
-# Copied from transformers.models.bart.tokenization_bart.bytes_to_unicode
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
-    characters the bpe code barfs on.
-
-    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
-    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
-    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
-    tables between utf-8 bytes and unicode strings.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-# Copied from transformers.models.bart.tokenization_bart.get_pairs
-def get_pairs(word):
-    """
-    Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-
-class LEDTokenizer(PreTrainedTokenizer):
-    """
-    Constructs a LED tokenizer, which is smilar to the ROBERTa tokenizer, using byte-level Byte-Pair-Encoding.
-
-    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
-    be encoded differently whether it is at the beginning of the sentence (without space) or not:
-
-    ```python
-    >>> from transformers import LEDTokenizer
-
-    >>> tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")
-    >>> tokenizer("Hello world")["input_ids"]
-    [0, 31414, 232, 2]
-
-    >>> tokenizer(" Hello world")["input_ids"]
-    [0, 20920, 232, 2]
-    ```
-
-    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
-    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
-
-    
-
-    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
-
-    
-
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        merges_file (`str`):
-            Path to the merges file.
-        errors (`str`, *optional*, defaults to `"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See
-            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
-        bos_token (`str`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        mask_token (`str`, *optional*, defaults to `""`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        add_prefix_space (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word. (BART tokenizer detect beginning of words by the preceding space).
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-
-    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.__init__
-    def __init__(
-        self,
-        vocab_file,
-        merges_file,
-        errors="replace",
-        bos_token="",
-        eos_token="",
-        sep_token="",
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        mask_token="",
-        add_prefix_space=False,
-        **kwargs,
-    ):
-        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
-        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
-        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
-        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
-
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            bpe_merges = merges_handle.read().split("\n")[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        self.cache = {}
-        self.add_prefix_space = add_prefix_space
-
-        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
-        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
-
-        super().__init__(
-            errors=errors,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            pad_token=pad_token,
-            mask_token=mask_token,
-            add_prefix_space=add_prefix_space,
-            **kwargs,
-        )
-
-    @property
-    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.vocab_size
-    def vocab_size(self):
-        return len(self.encoder)
-
-    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.get_vocab
-    def get_vocab(self):
-        return dict(self.encoder, **self.added_tokens_encoder)
-
-    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.bpe
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                except ValueError:
-                    new_word.extend(word[i:])
-                    break
-                else:
-                    new_word.extend(word[i:j])
-                    i = j
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        self.cache[token] = word
-        return word
-
-    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer._tokenize
-    def _tokenize(self, text):
-        """Tokenize a string."""
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            token = "".join(
-                self.byte_encoder[b] for b in token.encode("utf-8")
-            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
-            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
-        return bpe_tokens
-
-    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer._convert_token_to_id
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer._convert_id_to_token
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index)
-
-    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.convert_tokens_to_string
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        text = "".join(tokens)
-        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
-        return text
-
-    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-        merge_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
-        )
-
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
-
-        return vocab_file, merge_file
-
-    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.build_inputs_with_special_tokens with BART->LED
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A LED sequence has the following format:
-
-        - single sequence: ` X `
-        - pair of sequences: ` A  B `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-
-    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.get_special_tokens_mask
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
-    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.create_token_type_ids_from_sequences with BART->LED
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. LED does not
-        make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-    # Copied from transformers.models.bart.tokenization_bart.BartTokenizer.prepare_for_tokenization
-    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
-        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
-        if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
-            text = " " + text
-        return (text, kwargs)
-
-    def _pad(
-        self,
-        encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
-        max_length: Optional[int] = None,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        encoded_inputs = super()._pad(
-            encoded_inputs=encoded_inputs,
-            max_length=max_length,
-            padding_strategy=padding_strategy,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_attention_mask=return_attention_mask,
-        )
-
-        # Load from model defaults
-        if return_attention_mask is None:
-            return_attention_mask = "attention_mask" in self.model_input_names
-
-        if return_attention_mask and "global_attention_mask" in encoded_inputs:
-            required_input = encoded_inputs[self.model_input_names[0]]
-            # `global_attention_mask` need to have the same length as other (sequential) inputs.
-            needs_to_be_padded = len(encoded_inputs["global_attention_mask"]) != len(required_input)
-
-            if needs_to_be_padded:
-                difference = len(required_input) - len(encoded_inputs["global_attention_mask"])
-
-                if self.padding_side == "right":
-                    # Use `-1` since `0` in `global_attention_mask` means `local attention` instead of `not to attend`
-                    encoded_inputs["global_attention_mask"] = (
-                        encoded_inputs["global_attention_mask"] + [-1] * difference
-                    )
-                elif self.padding_side == "left":
-                    encoded_inputs["global_attention_mask"] = [-1] * difference + encoded_inputs[
-                        "global_attention_mask"
-                    ]
-                else:
-                    raise ValueError("Invalid padding strategy:" + str(self.padding_side))
-
-        return encoded_inputs
-
-
-__all__ = ["LEDTokenizer"]
diff --git a/src/transformers/models/led/tokenization_led_fast.py b/src/transformers/models/led/tokenization_led_fast.py
deleted file mode 100644
index baea10f23516..000000000000
--- a/src/transformers/models/led/tokenization_led_fast.py
+++ /dev/null
@@ -1,322 +0,0 @@
-# coding=utf-8
-# Copyright 2021 Iz Beltagy, Matthew E. Peters, Arman Cohan and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for LED."""
-
-import json
-from typing import Optional, Union
-
-from tokenizers import processors
-
-from ...tokenization_utils_base import AddedToken, BatchEncoding, EncodedInput
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import PaddingStrategy, logging
-from .tokenization_led import LEDTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
-
-
-class LEDTokenizerFast(PreTrainedTokenizerFast):
-    r"""
-    Construct a "fast" LED tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2 tokenizer,
-    using byte-level Byte-Pair-Encoding.
-
-    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
-    be encoded differently whether it is at the beginning of the sentence (without space) or not:
-
-    ```python
-    >>> from transformers import LEDTokenizerFast
-
-    >>> tokenizer = LEDTokenizerFast.from_pretrained("allenai/led-base-16384")
-    >>> tokenizer("Hello world")["input_ids"]
-    [0, 31414, 232, 2]
-
-    >>> tokenizer(" Hello world")["input_ids"]
-    [0, 20920, 232, 2]
-    ```
-
-    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
-    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
-
-    
-
-    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
-
-    
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        merges_file (`str`):
-            Path to the merges file.
-        errors (`str`, *optional*, defaults to `"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See
-            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
-        bos_token (`str`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        mask_token (`str`, *optional*, defaults to `""`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        add_prefix_space (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word. (LED tokenizer detect beginning of words by the preceding space).
-        trim_offsets (`bool`, *optional*, defaults to `True`):
-            Whether the post processing step should trim offsets to avoid including whitespaces.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = LEDTokenizer
-    model_input_names = ["input_ids", "attention_mask"]
-
-    # Copied from transformers.models.bart.tokenization_bart_fast.BartTokenizerFast.__init__
-    def __init__(
-        self,
-        vocab_file=None,
-        merges_file=None,
-        tokenizer_file=None,
-        errors="replace",
-        bos_token="",
-        eos_token="",
-        sep_token="",
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        mask_token="",
-        add_prefix_space=False,
-        trim_offsets=True,
-        **kwargs,
-    ):
-        # we have to specify that this tokens is special otherwise adding it will reset the normalized flag to `False` in `add_special_tokens`
-        mask_token = (
-            AddedToken(mask_token, lstrip=True, normalized=True, special=True)
-            if isinstance(mask_token, str)
-            else mask_token
-        )
-        super().__init__(
-            vocab_file,
-            merges_file,
-            tokenizer_file=tokenizer_file,
-            errors=errors,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            mask_token=mask_token,
-            add_prefix_space=add_prefix_space,
-            trim_offsets=trim_offsets,
-            **kwargs,
-        )
-
-        # the pre_tokenizer is already updated in the GPT2TokenizerFast `__init__`
-        tokenizer_component = "post_processor"
-        tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
-        if tokenizer_component_instance:
-            state = json.loads(tokenizer_component_instance.__getstate__())
-
-            # The lists 'sep' and 'cls' must be cased in tuples for the object `post_processor_class`
-            if "sep" in state:
-                state["sep"] = tuple(state["sep"])
-            if "cls" in state:
-                state["cls"] = tuple(state["cls"])
-
-            changes_to_apply = False
-
-            if state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-                state["add_prefix_space"] = add_prefix_space
-                changes_to_apply = True
-
-            if state.get("trim_offsets", trim_offsets) != trim_offsets:
-                state["trim_offsets"] = trim_offsets
-                changes_to_apply = True
-
-            if changes_to_apply:
-                component_class = getattr(processors, state.pop("type"))
-                new_value = component_class(**state)
-                setattr(self.backend_tokenizer, tokenizer_component, new_value)
-
-    @property
-    # Copied from transformers.models.bart.tokenization_bart_fast.BartTokenizerFast.mask_token with BART->LED
-    def mask_token(self) -> str:
-        """
-        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
-        having been set.
-
-        LED tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
-        comprise the space before the **.
-        """
-        if self._mask_token is None:
-            if self.verbose:
-                logger.error("Using mask_token, but it is not set yet.")
-            return None
-        return str(self._mask_token)
-
-    @mask_token.setter
-    def mask_token(self, value):
-        """
-        Overriding the default behavior of the mask token to have it eat the space before it.
-
-        This is needed to preserve backward compatibility with all the previously used models based on LED.
-        """
-        # Mask token behave like a normal word, i.e. include the space before it
-        # So we set lstrip to True
-        value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
-        self._mask_token = value
-
-    # Copied from transformers.models.bart.tokenization_bart_fast.BartTokenizerFast._batch_encode_plus
-    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-
-        if is_split_into_words and not self.add_prefix_space:
-            raise ValueError(
-                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
-                "to use it with pretokenized inputs."
-            )
-
-        return super()._batch_encode_plus(*args, **kwargs)
-
-    # Copied from transformers.models.bart.tokenization_bart_fast.BartTokenizerFast._encode_plus
-    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-
-        if is_split_into_words and not self.add_prefix_space:
-            raise ValueError(
-                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
-                "to use it with pretokenized inputs."
-            )
-
-        return super()._encode_plus(*args, **kwargs)
-
-    # Copied from transformers.models.bart.tokenization_bart_fast.BartTokenizerFast.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-    # Copied from transformers.models.bart.tokenization_bart_fast.BartTokenizerFast.build_inputs_with_special_tokens
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
-        if token_ids_1 is None:
-            return output
-
-        return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
-
-    # Copied from transformers.models.bart.tokenization_bart_fast.BartTokenizerFast.create_token_type_ids_from_sequences with BART->LED
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. LED does not
-        make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-    # Copied from transformers.models.led.tokenization_led.LEDTokenizer._pad
-    def _pad(
-        self,
-        encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
-        max_length: Optional[int] = None,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        encoded_inputs = super()._pad(
-            encoded_inputs=encoded_inputs,
-            max_length=max_length,
-            padding_strategy=padding_strategy,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_attention_mask=return_attention_mask,
-        )
-
-        # Load from model defaults
-        if return_attention_mask is None:
-            return_attention_mask = "attention_mask" in self.model_input_names
-
-        if return_attention_mask and "global_attention_mask" in encoded_inputs:
-            required_input = encoded_inputs[self.model_input_names[0]]
-            # `global_attention_mask` need to have the same length as other (sequential) inputs.
-            needs_to_be_padded = len(encoded_inputs["global_attention_mask"]) != len(required_input)
-
-            if needs_to_be_padded:
-                difference = len(required_input) - len(encoded_inputs["global_attention_mask"])
-
-                if self.padding_side == "right":
-                    # Use `-1` since `0` in `global_attention_mask` means `local attention` instead of `not to attend`
-                    encoded_inputs["global_attention_mask"] = (
-                        encoded_inputs["global_attention_mask"] + [-1] * difference
-                    )
-                elif self.padding_side == "left":
-                    encoded_inputs["global_attention_mask"] = [-1] * difference + encoded_inputs[
-                        "global_attention_mask"
-                    ]
-                else:
-                    raise ValueError("Invalid padding strategy:" + str(self.padding_side))
-
-        return encoded_inputs
-
-
-__all__ = ["LEDTokenizerFast"]
diff --git a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py
index 73038b9f37aa..8adb0f264faf 100755
--- a/src/transformers/models/lfm2_vl/processing_lfm2_vl.py
+++ b/src/transformers/models/lfm2_vl/processing_lfm2_vl.py
@@ -71,11 +71,15 @@ def __init__(
         chat_template: Optional[str] = None,
         **kwargs,
     ):
-        self.image_token = tokenizer.image_token
-        self.image_token_id = tokenizer.image_token_id
-        self.image_start_token = tokenizer.image_start_token
-        self.image_end_token = tokenizer.image_end_token
-        self.image_thumbnail_token = tokenizer.image_thumbnail
+        self.image_token = getattr(tokenizer, "image_token", "")
+        self.image_token_id = (
+            tokenizer.image_token_id
+            if hasattr(tokenizer, "image_token_id")
+            else tokenizer.convert_tokens_to_ids(self.image_token)
+        )
+        self.image_start_token = getattr(tokenizer, "image_start_token", "<|image_start|>")
+        self.image_end_token = getattr(tokenizer, "image_end_token", "<|image_end|>")
+        self.image_thumbnail_token = getattr(tokenizer, "image_thumbnail_token", "<|img_thumbnail|>")
         super().__init__(image_processor, tokenizer, chat_template=chat_template, **kwargs)
 
     def __call__(
diff --git a/src/transformers/models/llama/__init__.py b/src/transformers/models/llama/__init__.py
index 3166111744a1..8c15b58a0ae4 100644
--- a/src/transformers/models/llama/__init__.py
+++ b/src/transformers/models/llama/__init__.py
@@ -21,7 +21,6 @@
     from .configuration_llama import *
     from .modeling_llama import *
     from .tokenization_llama import *
-    from .tokenization_llama_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py
index 971c7c49ddaf..ac5b7b4bae7f 100644
--- a/src/transformers/models/llama/tokenization_llama.py
+++ b/src/transformers/models/llama/tokenization_llama.py
@@ -1,10 +1,5 @@
 # coding=utf-8
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
-#
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+# Copyright 2020 The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,49 +13,57 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Tokenization classes for LLaMA."""
-
-import os
-from shutil import copyfile
-from typing import TYPE_CHECKING, Any, Optional
+from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers
+from tokenizers.models import BPE
 
-import sentencepiece as spm
-
-from ...convert_slow_tokenizer import import_protobuf
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_utils_base import _get_prepend_scheme, generate_merges
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
-from ...utils.import_utils import requires
-
 
-if TYPE_CHECKING:
-    from ...tokenization_utils_base import TextInput
 
 logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
-
-SPIECE_UNDERLINE = "▁"
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}
 
 B_INST, E_INST = "[INST]", "[/INST]"
 B_SYS, E_SYS = "<>\n", "\n<>\n\n"
 
+# fmt: off
 DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \
 answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
  that your responses are socially unbiased and positive in nature.
 
 If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
-correct. If you don't know the answer to a question, please don't share false information."""  # fmt: skip
+correct. If you don't know the answer to a question, please don't share false information."""
+# fmt: on
 
 
-@requires(backends=("sentencepiece",))
-class LlamaTokenizer(PreTrainedTokenizer):
+class LlamaTokenizer(TokenizersBackend):
     """
-    Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
-    no padding token in the original model.
+    Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+    This uses notably ByteFallback and no normalization.
+
+    ```python
+    >>> from transformers import LlamaTokenizer
+
+    >>> tokenizer = LlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
+    >>> tokenizer.encode("Hello this is a test")
+    [1, 15043, 445, 338, 263, 1243]
+    ```
+
+    If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
+    call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
+    values of the first token and final token of an encoded sequence will not be correct). For more details, checkout
+    [post-processors] (https://huggingface.co/docs/tokenizers/api/post-processors) documentation.
+
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
+        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
+            extra spaces.
         unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
@@ -68,348 +71,118 @@ class LlamaTokenizer(PreTrainedTokenizer):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
         eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`):
             The end of sequence token.
-        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
-            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
-            attention mechanisms or loss computation.
-        sp_model_kwargs (`dict[str, Any]`, `Optional`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
-            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
-            to set:
-
-            - `enable_sampling`: Enable subword regularization.
-            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
-
-              - `nbest_size = {0,1}`: No sampling is performed.
-              - `nbest_size > 1`: samples from the nbest_size results.
-              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
-                using forward-filtering-and-backward-sampling algorithm.
-
-            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
-              BPE-dropout.
-
         add_bos_token (`bool`, *optional*, defaults to `True`):
             Whether or not to add an `bos_token` at the start of sequences.
         add_eos_token (`bool`, *optional*, defaults to `False`):
             Whether or not to add an `eos_token` at the end of sequences.
-        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
-            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
-            extra spaces.
         use_default_system_prompt (`bool`, *optional*, defaults to `False`):
-            Whether or not the default system prompt for Llama should be used.
-        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
-            Whether or not to add spaces between special tokens.
-        legacy (`bool`, *optional*):
-            Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
-            and #25224 which includes fixes to properly handle tokens that appear after special tokens.
-            Make sure to also set `from_slow` to `True`.
+            Whether or not the default system prompt for Llama should be used
             A simple example:
-
-            - `legacy=True`:
             ```python
-            >>> from transformers import LlamaTokenizerFast
+    >>> from transformers import LlamaTokenizer
 
-            >>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=True, from_slow=True)
+            >>> tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b")
             >>> tokenizer.encode("Hello .") # 869 is '▁.'
             [1, 15043, 29871, 1, 869]
             ```
-            - `legacy=False`:
-            ```python
-            >>> from transformers import LlamaTokenizerFast
-
-            >>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=False, from_slow=True)
-            >>> tokenizer.encode("Hello .")  # 29889 is '.'
-            [1, 15043, 29871, 1, 29889]
-            ```
             Checkout the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
-        add_prefix_space (`bool`, *optional*, defaults to `True`):
-            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word. Again, this should be set with `from_slow=True` to make sure it's taken into account.
+        add_prefix_space (`bool`, *optional*):
+            Whether or not the tokenizer should automatically add a prefix space
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
+    padding_side = "left"
     model_input_names = ["input_ids", "attention_mask"]
 
     def __init__(
         self,
-        vocab_file,
+        clean_up_tokenization_spaces=False,
         unk_token="",
         bos_token="",
         eos_token="",
-        pad_token=None,
-        sp_model_kwargs: Optional[dict[str, Any]] = None,
         add_bos_token=True,
         add_eos_token=False,
-        clean_up_tokenization_spaces=False,
         use_default_system_prompt=False,
-        spaces_between_special_tokens=False,
-        legacy=None,
-        add_prefix_space=True,
+        legacy=False,
+        add_prefix_space=None,
+        vocab=None,
+        merges=None,
         **kwargs,
     ):
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
-        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token
-
-        if legacy is None:
-            logger.warning_once(
-                f"You are using the default legacy behaviour of the {self.__class__}. This is"
-                " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
-                " If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
-                " means, and thoroughly read the reason why this was added as explained in"
-                " https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file"
-                " you can ignore this message"
+        self.add_prefix_space = add_prefix_space if add_prefix_space is not None else True
+
+        if vocab is not None:
+            self._vocab = (
+                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
             )
-            legacy = True
+        else:
+            self._vocab = {
+                str(unk_token): 0,
+                str(bos_token): 1,
+                str(eos_token): 2,
+            }
+
+        special_tokens = {str(eos_token), str(bos_token), str(unk_token)}
+
+        filtered_vocab = {t: i for t, i in self._vocab.items() if t not in special_tokens}
+        if merges is not None:
+            self._merges = [tuple(merge) if isinstance(merge, list) else merge for merge in merges]
+        else:
+            self._merges = generate_merges(filtered_vocab)
+        self._tokenizer = Tokenizer(
+            BPE(vocab=self._vocab, merges=self._merges, fuse_unk=True, byte_fallback=True, dropout=None)
+        )
+        self._tokenizer.normalizer = None
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
+            replacement="▁", prepend_scheme=_get_prepend_scheme(self.add_prefix_space, self), split=False
+        )
 
-        self.legacy = legacy
-        self.vocab_file = vocab_file
-        self.add_bos_token = add_bos_token
-        self.add_eos_token = add_eos_token
-        self.use_default_system_prompt = use_default_system_prompt
-        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
-        self.add_prefix_space = add_prefix_space
+        sequence = [
+            decoders.Replace("▁", " "),
+            decoders.ByteFallback(),
+            decoders.Fuse(),
+        ]
+
+        if self.add_prefix_space:
+            sequence += [decoders.Strip(content=" ", left=1)]
+
+        self._tokenizer.decoder = decoders.Sequence(sequence)
+        tokenizer_object = self._tokenizer
 
         super().__init__(
+            tokenizer_object=tokenizer_object,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            unk_token=unk_token,
             bos_token=bos_token,
             eos_token=eos_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
             add_bos_token=add_bos_token,
             add_eos_token=add_eos_token,
-            sp_model_kwargs=self.sp_model_kwargs,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             use_default_system_prompt=use_default_system_prompt,
-            spaces_between_special_tokens=spaces_between_special_tokens,
-            legacy=legacy,
             add_prefix_space=add_prefix_space,
             **kwargs,
         )
 
-    @property
-    def unk_token_length(self):
-        return len(self.sp_model.encode(str(self.unk_token)))
-
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor
-    def get_spm_processor(self, from_slow=False):
-        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        if self.legacy or from_slow:  # no dependency on protobuf
-            tokenizer.Load(self.vocab_file)
-            return tokenizer
-
-        with open(self.vocab_file, "rb") as f:
-            sp_model = f.read()
-            model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
-            model = model_pb2.ModelProto.FromString(sp_model)
-            normalizer_spec = model_pb2.NormalizerSpec()
-            normalizer_spec.add_dummy_prefix = False
-            model.normalizer_spec.MergeFrom(normalizer_spec)
-            sp_model = model.SerializeToString()
-            tokenizer.LoadFromSerializedProto(sp_model)
-        return tokenizer
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__.update(d)
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
-
-    @property
-    def vocab_size(self):
-        """Returns vocab size"""
-        return self.sp_model.get_piece_size()
-
-    def get_vocab(self):
-        """Returns vocab as a dict"""
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
-    def tokenize(self, text: "TextInput", **kwargs) -> list[str]:
-        """
-        Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
-        first token is special.
-        """
-        if self.legacy or len(text) == 0:
-            return super().tokenize(text, **kwargs)
-
-        text = text.replace(SPIECE_UNDERLINE, " ")
-        if self.add_prefix_space:
-            text = SPIECE_UNDERLINE + text
-
-        tokens = super().tokenize(text, **kwargs)
-
-        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
-            tokens = tokens[1:]
-        return tokens
-
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
-    def _tokenize(self, text, **kwargs):
-        """
-        Returns a tokenized string.
-
-        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
-        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
-        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
-        `unk_token`. Here is an example with `unk_token = ""` and `unk_token_length = 4`.
-        `self.tokenizer.sp_model.encode(" Hey", out_type = str)[4:]`.
-        """
-        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
-            return self.sp_model.encode(text, out_type=str)
-
-        # 1. Encode string + prefix ex: " Hey"
-        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
-        # 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
-        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.sp_model.piece_to_id(token)
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
-        return token
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        # since we manually add the prefix space, we have to remove it when decoding
-        if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space:
-            tokens[0] = tokens[0][1:]
-
-        current_sub_tokens = []
-        out_string = ""
-        prev_is_special = False
-        for i, token in enumerate(tokens):
-            # make sure that special tokens are not decoded using sentencepiece model
-            if token in self.all_special_tokens:
-                if not prev_is_special and i != 0 and self.legacy:
-                    out_string += " "
-                out_string += self.sp_model.decode(current_sub_tokens) + token
-                prev_is_special = True
-                current_sub_tokens = []
-            else:
-                if prev_is_special and i == 1 and self.add_prefix_space and not token.startswith(SPIECE_UNDERLINE):
-                    out_string += " "
-                current_sub_tokens.append(token)
-                prev_is_special = False
-        out_string += self.sp_model.decode(current_sub_tokens)
-        return out_string
-
-    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> tuple[str]:
-        """
-        Save the vocabulary and special tokens file to a directory.
-
-        Args:
-            save_directory (`str`):
-                The directory in which to save the vocabulary.
-
-        Returns:
-            `Tuple(str)`: Paths to the files saved.
-        """
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (out_vocab_file,)
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
-        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
-
-        output = bos_token_id + token_ids_0 + eos_token_id
-
-        if token_ids_1 is not None:
-            output = output + bos_token_id + token_ids_1 + eos_token_id
-
-        return output
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        bos_token_id = [1] if self.add_bos_token else []
-        eos_token_id = [1] if self.add_eos_token else []
-
-        if token_ids_1 is None:
-            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
-        return (
-            bos_token_id
-            + ([0] * len(token_ids_0))
-            + eos_token_id
-            + bos_token_id
-            + ([0] * len(token_ids_1))
-            + eos_token_id
-        )
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
-        sequence pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence |
-        ```
-
-        if token_ids_1 is None, only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of ids.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
-        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+        self._add_bos_token = add_bos_token
+        self._add_eos_token = add_eos_token
+        self.use_default_system_prompt = use_default_system_prompt
 
-        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
+        self._post_init()
 
-        if token_ids_1 is not None:
-            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
+    def _post_init(self):
+        """Post-initialization setup that needs to run after _tokenizer is set."""
+        # Only set pre_tokenizer/normalizer for Llama-3 style tokenizers (use Sequence)
+        pre_tok = self._tokenizer.pre_tokenizer
+        if pre_tok is None or type(pre_tok).__name__ != "Sequence":
+            self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
+                replacement="▁", prepend_scheme="first", split=False
+            )
+            self._tokenizer.normalizer = None
+            self.add_tokens([AddedToken(token, special=True) for token in self.all_special_tokens])
+        super()._post_init()
+        self.update_post_processor()
 
-        return output
 
+__all__ = ["LlamaTokenizer", "LlamaTokenizerFast"]
 
-__all__ = ["LlamaTokenizer"]
+# Backward alias
+LlamaTokenizerFast = LlamaTokenizer
diff --git a/src/transformers/models/llama/tokenization_llama_fast.py b/src/transformers/models/llama/tokenization_llama_fast.py
deleted file mode 100644
index 212e65404e82..000000000000
--- a/src/transformers/models/llama/tokenization_llama_fast.py
+++ /dev/null
@@ -1,251 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-from shutil import copyfile
-from typing import Optional
-
-from tokenizers import processors
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import is_sentencepiece_available, logging
-
-
-if is_sentencepiece_available():
-    from .tokenization_llama import LlamaTokenizer
-else:
-    LlamaTokenizer = None
-
-logger = logging.get_logger(__name__)
-VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}
-
-B_INST, E_INST = "[INST]", "[/INST]"
-B_SYS, E_SYS = "<>\n", "\n<>\n\n"
-
-# fmt: off
-DEFAULT_SYSTEM_PROMPT = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your \
-answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure\
- that your responses are socially unbiased and positive in nature.
-
-If a question does not make any sense, or is not factually coherent, explain why instead of answering something not \
-correct. If you don't know the answer to a question, please don't share false information."""
-# fmt: on
-
-
-class LlamaTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a Llama tokenizer. Based on byte-level Byte-Pair-Encoding.
-
-    This uses notably ByteFallback and no normalization.
-
-    ```python
-    >>> from transformers import LlamaTokenizerFast
-
-    >>> tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")
-    >>> tokenizer.encode("Hello this is a test")
-    [1, 15043, 445, 338, 263, 1243]
-    ```
-
-    If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
-    call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
-    values of the first token and final token of an encoded sequence will not be correct). For more details, checkout
-    [post-processors] (https://huggingface.co/docs/tokenizers/api/post-processors) documentation.
-
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`, *optional*):
-            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
-            contains the vocabulary necessary to instantiate a tokenizer.
-        tokenizer_file (`str`, *optional*):
-            [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
-            contains everything needed to load the tokenizer.
-        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
-            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
-            extra spaces.
-        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `""`):
-            The end of sequence token.
-        add_bos_token (`bool`, *optional*, defaults to `True`):
-            Whether or not to add an `bos_token` at the start of sequences.
-        add_eos_token (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an `eos_token` at the end of sequences.
-        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
-            Whether or not the default system prompt for Llama should be used
-        legacy (`bool`, *optional*):
-            Whether or not the `legacy` behavior of the tokenizer should be used. Legacy is before the merge of #24622
-            and #25224 which includes fixes to properly handle tokens that appear after special tokens.
-            Make sure to also set `from_slow` to `True`.
-            A simple example:
-
-            - `legacy=True`:
-            ```python
-            >>> from transformers import LlamaTokenizerFast
-
-            >>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=True, from_slow=True)
-            >>> tokenizer.encode("Hello .") # 869 is '▁.'
-            [1, 15043, 29871, 1, 869]
-            ```
-            - `legacy=False`:
-            ```python
-            >>> from transformers import LlamaTokenizerFast
-
-            >>> tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=False, from_slow=True)
-            >>> tokenizer.encode("Hello .")  # 29889 is '.'
-            [1, 15043, 29871, 1, 29889]
-            ```
-            Checkout the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
-        add_prefix_space (`bool`, *optional*):
-            Whether or not the tokenizer should automatically add a prefix space
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = LlamaTokenizer
-    padding_side = "left"
-    model_input_names = ["input_ids", "attention_mask"]
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        clean_up_tokenization_spaces=False,
-        unk_token="",
-        bos_token="",
-        eos_token="",
-        add_bos_token=True,
-        add_eos_token=False,
-        use_default_system_prompt=False,
-        legacy=None,
-        add_prefix_space=None,
-        **kwargs,
-    ):
-        if legacy is None:
-            logger.warning_once(
-                f"You are using the default legacy behaviour of the {self.__class__}. This is"
-                " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
-                " If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
-                " means, and thoroughly read the reason why this was added as explained in"
-                " https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file"
-                " you can ignore this message."
-            )
-            legacy = True
-        self.legacy = legacy
-
-        if add_prefix_space is not None:
-            kwargs["from_slow"] = True
-
-        super().__init__(
-            vocab_file=vocab_file,
-            tokenizer_file=tokenizer_file,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            unk_token=unk_token,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            add_bos_token=add_bos_token,
-            add_eos_token=add_eos_token,
-            use_default_system_prompt=use_default_system_prompt,
-            add_prefix_space=add_prefix_space,
-            legacy=legacy,
-            **kwargs,
-        )
-        self._add_bos_token = add_bos_token
-        self._add_eos_token = add_eos_token
-        self.update_post_processor()
-        self.use_default_system_prompt = use_default_system_prompt
-        self.vocab_file = vocab_file
-
-    def update_post_processor(self):
-        """
-        Updates the underlying post processor with the current `bos_token` and `eos_token`.
-        """
-        bos = self.bos_token
-        bos_token_id = self.bos_token_id
-        if bos is None and self.add_bos_token:
-            raise ValueError("add_bos_token = True but bos_token = None")
-
-        eos = self.eos_token
-        eos_token_id = self.eos_token_id
-        if eos is None and self.add_eos_token:
-            raise ValueError("add_eos_token = True but eos_token = None")
-
-        single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
-        pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"
-
-        special_tokens = []
-        if self.add_bos_token:
-            special_tokens.append((bos, bos_token_id))
-        if self.add_eos_token:
-            special_tokens.append((eos, eos_token_id))
-        self._tokenizer.post_processor = processors.TemplateProcessing(
-            single=single, pair=pair, special_tokens=special_tokens
-        )
-
-    @property
-    def add_eos_token(self):
-        return self._add_eos_token
-
-    @property
-    def add_bos_token(self):
-        return self._add_bos_token
-
-    @add_eos_token.setter
-    def add_eos_token(self, value):
-        self._add_eos_token = value
-        self.update_post_processor()
-
-    @add_bos_token.setter
-    def add_bos_token(self, value):
-        self._add_bos_token = value
-        self.update_post_processor()
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not self.can_save_slow_tokenizer:
-            raise ValueError(
-                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
-                "tokenizer."
-            )
-
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
-
-    # TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers
-    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
-        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
-
-        output = bos_token_id + token_ids_0 + eos_token_id
-
-        if token_ids_1 is not None:
-            output = output + bos_token_id + token_ids_1 + eos_token_id
-
-        return output
-
-
-__all__ = ["LlamaTokenizerFast"]
diff --git a/src/transformers/models/longformer/__init__.py b/src/transformers/models/longformer/__init__.py
index a0ee62087e76..1db4be60ed44 100644
--- a/src/transformers/models/longformer/__init__.py
+++ b/src/transformers/models/longformer/__init__.py
@@ -18,10 +18,9 @@
 
 
 if TYPE_CHECKING:
+    from ..roberta.tokenization_roberta import RobertaTokenizer as LongformerTokenizer
     from .configuration_longformer import *
     from .modeling_longformer import *
-    from .tokenization_longformer import *
-    from .tokenization_longformer_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/longformer/tokenization_longformer.py b/src/transformers/models/longformer/tokenization_longformer.py
deleted file mode 100644
index 104bdd7a9b99..000000000000
--- a/src/transformers/models/longformer/tokenization_longformer.py
+++ /dev/null
@@ -1,402 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import os
-from functools import lru_cache
-from typing import Optional
-
-import regex as re
-
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
-from ...utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
-
-
-@lru_cache
-# Copied from transformers.models.roberta.tokenization_roberta.bytes_to_unicode
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
-    characters the bpe code barfs on.
-
-    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
-    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
-    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
-    tables between utf-8 bytes and unicode strings.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-# Copied from transformers.models.roberta.tokenization_roberta.get_pairs
-def get_pairs(word):
-    """
-    Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-
-# Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer with FacebookAI/roberta-base->allenai/longformer-base-4096, RoBERTa->Longformer all-casing, RobertaTokenizer->LongformerTokenizer
-class LongformerTokenizer(PreTrainedTokenizer):
-    """
-    Constructs a Longformer tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding.
-
-    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
-    be encoded differently whether it is at the beginning of the sentence (without space) or not:
-
-    ```python
-    >>> from transformers import LongformerTokenizer
-
-    >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
-    >>> tokenizer("Hello world")["input_ids"]
-    [0, 31414, 232, 2]
-
-    >>> tokenizer(" Hello world")["input_ids"]
-    [0, 20920, 232, 2]
-    ```
-
-    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
-    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
-
-    
-
-    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
-
-    
-
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        merges_file (`str`):
-            Path to the merges file.
-        errors (`str`, *optional*, defaults to `"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See
-            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
-        bos_token (`str`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        mask_token (`str`, *optional*, defaults to `""`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        add_prefix_space (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word. (Longformer tokenizer detect beginning of words by the preceding space).
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-
-    def __init__(
-        self,
-        vocab_file,
-        merges_file,
-        errors="replace",
-        bos_token="",
-        eos_token="",
-        sep_token="",
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        mask_token="",
-        add_prefix_space=False,
-        **kwargs,
-    ):
-        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
-        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
-        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
-        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
-        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
-        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
-
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = (
-            AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
-            if isinstance(mask_token, str)
-            else mask_token
-        )
-
-        # these special tokens are not part of the vocab.json, let's add them in the correct order
-
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            bpe_merges = merges_handle.read().split("\n")[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        self.cache = {}
-        self.add_prefix_space = add_prefix_space
-
-        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
-        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
-
-        super().__init__(
-            errors=errors,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            pad_token=pad_token,
-            mask_token=mask_token,
-            add_prefix_space=add_prefix_space,
-            **kwargs,
-        )
-
-    @property
-    def vocab_size(self):
-        return len(self.encoder)
-
-    def get_vocab(self):
-        vocab = dict(self.encoder).copy()
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                except ValueError:
-                    new_word.extend(word[i:])
-                    break
-                else:
-                    new_word.extend(word[i:j])
-                    i = j
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        self.cache[token] = word
-        return word
-
-    def _tokenize(self, text):
-        """Tokenize a string."""
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            token = "".join(
-                self.byte_encoder[b] for b in token.encode("utf-8")
-            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
-            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
-        return bpe_tokens
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        text = "".join(tokens)
-        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
-        return text
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-        merge_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
-        )
-
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
-
-        return vocab_file, merge_file
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A Longformer sequence has the following format:
-
-        - single sequence: ` X `
-        - pair of sequences: ` A  B `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. Longformer does not
-        make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
-        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
-        if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
-            text = " " + text
-        return (text, kwargs)
-
-
-__all__ = ["LongformerTokenizer"]
diff --git a/src/transformers/models/longformer/tokenization_longformer_fast.py b/src/transformers/models/longformer/tokenization_longformer_fast.py
deleted file mode 100644
index bde6bb55fec6..000000000000
--- a/src/transformers/models/longformer/tokenization_longformer_fast.py
+++ /dev/null
@@ -1,265 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Allen Institute for AI team and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Fast Tokenization classes for Longformer."""
-
-import json
-from typing import Optional
-
-from tokenizers import processors
-
-from ...tokenization_utils_base import AddedToken, BatchEncoding
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .tokenization_longformer import LongformerTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
-
-
-# Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast with FacebookAI/roberta-base->allenai/longformer-base-4096, RoBERTa->Longformer all-casing, Roberta->Longformer
-class LongformerTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" Longformer tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2
-    tokenizer, using byte-level Byte-Pair-Encoding.
-
-    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
-    be encoded differently whether it is at the beginning of the sentence (without space) or not:
-
-    ```python
-    >>> from transformers import LongformerTokenizerFast
-
-    >>> tokenizer = LongformerTokenizerFast.from_pretrained("allenai/longformer-base-4096")
-    >>> tokenizer("Hello world")["input_ids"]
-    [0, 31414, 232, 2]
-
-    >>> tokenizer(" Hello world")["input_ids"]
-    [0, 20920, 232, 2]
-    ```
-
-    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
-    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
-
-    
-
-    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
-
-    
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        merges_file (`str`):
-            Path to the merges file.
-        errors (`str`, *optional*, defaults to `"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See
-            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
-        bos_token (`str`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        mask_token (`str`, *optional*, defaults to `""`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        add_prefix_space (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word. (Longformer tokenizer detect beginning of words by the preceding space).
-        trim_offsets (`bool`, *optional*, defaults to `True`):
-            Whether the post processing step should trim offsets to avoid including whitespaces.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = LongformerTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        merges_file=None,
-        tokenizer_file=None,
-        errors="replace",
-        bos_token="",
-        eos_token="",
-        sep_token="",
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        mask_token="",
-        add_prefix_space=False,
-        trim_offsets=True,
-        **kwargs,
-    ):
-        mask_token = (
-            AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
-            if isinstance(mask_token, str)
-            else mask_token
-        )
-        super().__init__(
-            vocab_file,
-            merges_file,
-            tokenizer_file=tokenizer_file,
-            errors=errors,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            mask_token=mask_token,
-            add_prefix_space=add_prefix_space,
-            trim_offsets=trim_offsets,
-            **kwargs,
-        )
-
-        tokenizer_component = "post_processor"
-        tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
-        if tokenizer_component_instance:
-            state = json.loads(tokenizer_component_instance.__getstate__())
-
-            # The lists 'sep' and 'cls' must be cased in tuples for the object `post_processor_class`
-            if "sep" in state:
-                state["sep"] = tuple(state["sep"])
-            if "cls" in state:
-                state["cls"] = tuple(state["cls"])
-
-            changes_to_apply = False
-
-            if state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-                state["add_prefix_space"] = add_prefix_space
-                changes_to_apply = True
-
-            if state.get("trim_offsets", trim_offsets) != trim_offsets:
-                state["trim_offsets"] = trim_offsets
-                changes_to_apply = True
-
-            if changes_to_apply:
-                component_class = getattr(processors, state.pop("type"))
-                new_value = component_class(**state)
-                setattr(self.backend_tokenizer, tokenizer_component, new_value)
-
-    @property
-    def mask_token(self) -> str:
-        """
-        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
-        having been set.
-
-        Longformer tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
-        comprise the space before the **.
-        """
-        if self._mask_token is None:
-            if self.verbose:
-                logger.error("Using mask_token, but it is not set yet.")
-            return None
-        return str(self._mask_token)
-
-    @mask_token.setter
-    def mask_token(self, value):
-        """
-        Overriding the default behavior of the mask token to have it eat the space before it.
-
-        This is needed to preserve backward compatibility with all the previously used models based on Longformer.
-        """
-        # Mask token behave like a normal word, i.e. include the space before it
-        # So we set lstrip to True
-        value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
-        self._mask_token = value
-
-    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-        assert self.add_prefix_space or not is_split_into_words, (
-            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
-            "to use it with pretokenized inputs."
-        )
-
-        return super()._batch_encode_plus(*args, **kwargs)
-
-    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-
-        assert self.add_prefix_space or not is_split_into_words, (
-            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
-            "to use it with pretokenized inputs."
-        )
-
-        return super()._encode_plus(*args, **kwargs)
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
-        if token_ids_1 is None:
-            return output
-
-        return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. Longformer does not
-        make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-
-__all__ = ["LongformerTokenizerFast"]
diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py
index 99d3fdff9b32..c1dd5be77b6f 100644
--- a/src/transformers/models/luke/tokenization_luke.py
+++ b/src/transformers/models/luke/tokenization_luke.py
@@ -16,15 +16,14 @@
 
 import itertools
 import json
-import os
 from collections.abc import Mapping
-from functools import lru_cache
 from typing import Optional, Union
 
 import numpy as np
-import regex as re
+from tokenizers import Tokenizer, decoders, pre_tokenizers, processors
+from tokenizers.models import BPE
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_python import PreTrainedTokenizer
 from ...tokenization_utils_base import (
     ENCODE_KWARGS_DOCSTRING,
     AddedToken,
@@ -37,6 +36,7 @@
     TruncationStrategy,
     to_py_obj,
 )
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import add_end_docstrings, is_torch_tensor, logging
 
 
@@ -130,48 +130,7 @@
 """
 
 
-@lru_cache
-# Copied from transformers.models.roberta.tokenization_roberta.bytes_to_unicode
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
-    characters the bpe code barfs on.
-
-    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
-    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
-    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
-    tables between utf-8 bytes and unicode strings.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-# Copied from transformers.models.roberta.tokenization_roberta.get_pairs
-def get_pairs(word):
-    """
-    Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-
-class LukeTokenizer(PreTrainedTokenizer):
+class LukeTokenizer(TokenizersBackend):
     """
     Constructs a LUKE tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding.
 
@@ -269,21 +228,10 @@ class LukeTokenizer(PreTrainedTokenizer):
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
 
     def __init__(
         self,
-        vocab_file,
-        merges_file,
-        entity_vocab_file,
-        task=None,
-        max_entity_length=32,
-        max_mention_length=30,
-        entity_token_1="",
-        entity_token_2="",
-        entity_unk_token="[UNK]",
-        entity_pad_token="[PAD]",
-        entity_mask_token="[MASK]",
-        entity_mask2_token="[MASK2]",
         errors="replace",
         bos_token="",
         eos_token="",
@@ -293,62 +241,88 @@ def __init__(
         pad_token="",
         mask_token="",
         add_prefix_space=False,
+        task=None,
+        max_entity_length=32,
+        max_mention_length=30,
+        entity_token_1="",
+        entity_token_2="",
+        entity_unk_token="[UNK]",
+        entity_pad_token="[PAD]",
+        entity_mask_token="[MASK]",
+        entity_mask2_token="[MASK2]",
+        vocab: Optional[dict] = None,
+        merges: Optional[list] = None,
+        entity_vocab: Optional[dict] = None,
         **kwargs,
     ):
-        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
-        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
-        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
-        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
-
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            bpe_merges = merges_handle.read().split("\n")[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        self.cache = {}
         self.add_prefix_space = add_prefix_space
 
-        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
-        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+        # Handle entity vocab file for backward compatibility
+        entity_vocab_file = kwargs.pop("entity_vocab_file", None)
+
+        # Check if vocab/merges/entity_vocab are in kwargs
+        if vocab is None and "vocab" in kwargs:
+            vocab = kwargs.pop("vocab")
+        if merges is None and "merges" in kwargs:
+            merges = kwargs.pop("merges")
+        if entity_vocab is None and "entity_vocab" in kwargs:
+            entity_vocab = kwargs.pop("entity_vocab")
+
+        # Build vocab and merges (either from data or empty, like GPT2Tokenizer)
+        if vocab is not None:
+            self._vocab = (
+                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
+            )
+        else:
+            self._vocab = {}
 
-        # we add 2 special tokens for downstream tasks
-        # for more information about lstrip and rstrip, see https://github.com/huggingface/transformers/pull/2778
-        entity_token_1 = (
-            AddedToken(entity_token_1, lstrip=False, rstrip=False)
-            if isinstance(entity_token_1, str)
-            else entity_token_1
-        )
-        entity_token_2 = (
-            AddedToken(entity_token_2, lstrip=False, rstrip=False)
-            if isinstance(entity_token_2, str)
-            else entity_token_2
+        if merges is not None:
+            self._merges = [tuple(merge) if isinstance(merge, list) else merge for merge in merges]
+        else:
+            self._merges = []
+
+        self._tokenizer = Tokenizer(
+            BPE(
+                vocab=self._vocab,
+                merges=self._merges,
+                dropout=None,
+                continuing_subword_prefix="",
+                end_of_word_suffix="",
+                fuse_unk=False,
+            )
         )
-        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", [])
-        kwargs["additional_special_tokens"] += [entity_token_1, entity_token_2]
 
-        with open(entity_vocab_file, encoding="utf-8") as entity_vocab_handle:
-            self.entity_vocab = json.load(entity_vocab_handle)
+        self._tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
+        self._tokenizer.decoder = decoders.ByteLevel()
+
+        # Load entity vocab
+        if entity_vocab is not None:
+            self.entity_vocab = entity_vocab
+        elif entity_vocab_file is not None:
+            with open(entity_vocab_file, encoding="utf-8") as f:
+                self.entity_vocab = json.load(f)
+        else:
+            # If no entity vocab provided, create a minimal one with required special tokens
+            self.entity_vocab = {
+                entity_unk_token: 0,
+                entity_pad_token: 1,
+                entity_mask_token: 2,
+                entity_mask2_token: 3,
+            }
+
+        # Validate entity special tokens
         for entity_special_token in [entity_unk_token, entity_pad_token, entity_mask_token, entity_mask2_token]:
             if entity_special_token not in self.entity_vocab:
                 raise ValueError(
-                    f"Specified entity special token ``{entity_special_token}`` is not found in entity_vocab. "
-                    f"Probably an incorrect entity vocab file is loaded: {entity_vocab_file}."
+                    f"Specified entity special token `{entity_special_token}` is not found in entity_vocab."
                 )
+
         self.entity_unk_token_id = self.entity_vocab[entity_unk_token]
         self.entity_pad_token_id = self.entity_vocab[entity_pad_token]
         self.entity_mask_token_id = self.entity_vocab[entity_mask_token]
         self.entity_mask2_token_id = self.entity_vocab[entity_mask2_token]
 
+        # Setup task and max_entity_length
         self.task = task
         if task is None or task == "entity_span_classification":
             self.max_entity_length = max_entity_length
@@ -364,7 +338,48 @@ def __init__(
 
         self.max_mention_length = max_mention_length
 
+        # Add entity tokens to extra_special_tokens
+        entity_token_1 = (
+            AddedToken(entity_token_1, lstrip=False, rstrip=False)
+            if isinstance(entity_token_1, str)
+            else entity_token_1
+        )
+        entity_token_2 = (
+            AddedToken(entity_token_2, lstrip=False, rstrip=False)
+            if isinstance(entity_token_2, str)
+            else entity_token_2
+        )
+        # Handle extra/legacy special tokens (v4 hub files compat)
+        extra_tokens: list[AddedToken | str] = []
+        for key in ("extra_special_tokens", "additional_special_tokens"):
+            for token in kwargs.pop(key, []) or []:
+                extra_tokens.append(AddedToken(**token) if isinstance(token, dict) else token)
+
+        # Ensure LUKE entity tokens are present exactly once.
+        seen = {str(token) for token in extra_tokens}
+        for token in (entity_token_1, entity_token_2):
+            token_str = str(token)
+            if token_str not in seen:
+                extra_tokens.append(token)
+                seen.add(token_str)
+
+        kwargs["extra_special_tokens"] = extra_tokens
+
+        tokenizer_object = self._tokenizer
+
+        # Configure default special token behaviors to match LUKE formatting
+        token_type_ids_pattern = kwargs.setdefault("token_type_ids_pattern", "all_zeros")
+        special_tokens_pattern = kwargs.setdefault("special_tokens_pattern", "cls_double_sep")
+        token_type_ids_include_special_tokens = kwargs.setdefault("token_type_ids_include_special_tokens", True)
+        self.token_type_ids_pattern = token_type_ids_pattern
+        self.special_tokens_pattern = special_tokens_pattern
+        self.token_type_ids_include_special_tokens = token_type_ids_include_special_tokens
+
+        # Set clean_up_tokenization_spaces=True by default to match old Python tokenizer behavior
+        kwargs.setdefault("clean_up_tokenization_spaces", True)
+
         super().__init__(
+            tokenizer_object=tokenizer_object,
             errors=errors,
             bos_token=bos_token,
             eos_token=eos_token,
@@ -375,183 +390,77 @@ def __init__(
             mask_token=mask_token,
             add_prefix_space=add_prefix_space,
             task=task,
-            max_entity_length=32,
-            max_mention_length=30,
-            entity_token_1="",
-            entity_token_2="",
+            max_entity_length=max_entity_length,
+            max_mention_length=max_mention_length,
+            entity_token_1=str(entity_token_1),
+            entity_token_2=str(entity_token_2),
             entity_unk_token=entity_unk_token,
             entity_pad_token=entity_pad_token,
             entity_mask_token=entity_mask_token,
             entity_mask2_token=entity_mask2_token,
+            entity_vocab=entity_vocab if entity_vocab_file is None else None,  # Only store if it was passed as data
             **kwargs,
         )
+        self._post_init()
+
+    def _post_init(self):
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"{self.cls_token}:0 $A:0 {self.sep_token}:0",
+            pair=f"{self.cls_token}:0 $A:0 {self.sep_token}:0 {self.sep_token}:0 $B:1 {self.sep_token}:1",
+            special_tokens=[
+                (self.cls_token, self.cls_token_id),
+                (self.sep_token, self.sep_token_id),
+            ],
+        )
 
-    @property
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.vocab_size with Roberta->Luke, RoBERTa->LUKE
-    def vocab_size(self):
-        return len(self.encoder)
-
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_vocab with Roberta->Luke, RoBERTa->LUKE
-    def get_vocab(self):
-        vocab = dict(self.encoder).copy()
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.bpe with Roberta->Luke, RoBERTa->LUKE
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                except ValueError:
-                    new_word.extend(word[i:])
-                    break
-                else:
-                    new_word.extend(word[i:j])
-                    i = j
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        self.cache[token] = word
-        return word
-
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._tokenize with Roberta->Luke, RoBERTa->LUKE
-    def _tokenize(self, text):
-        """Tokenize a string."""
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            token = "".join(
-                self.byte_encoder[b] for b in token.encode("utf-8")
-            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
-            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
-        return bpe_tokens
-
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._convert_token_to_id with Roberta->Luke, RoBERTa->LUKE
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._convert_id_to_token with Roberta->Luke, RoBERTa->LUKE
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index)
-
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.convert_tokens_to_string with Roberta->Luke, RoBERTa->LUKE
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        text = "".join(tokens)
-        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
-        return text
-
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.build_inputs_with_special_tokens with Roberta->Luke, RoBERTa->LUKE
     def build_inputs_with_special_tokens(
         self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
     ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A LUKE sequence has the following format:
-
-        - single sequence: ` X `
-        - pair of sequences: ` A  B `
+        return PreTrainedTokenizer.build_inputs_with_special_tokens(self, token_ids_0, token_ids_1)
 
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_special_tokens_mask with Roberta->Luke, RoBERTa->LUKE
     def get_special_tokens_mask(
         self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
     ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+        return PreTrainedTokenizer.get_special_tokens_mask(
+            self, token_ids_0, token_ids_1, already_has_special_tokens=already_has_special_tokens
+        )
 
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.create_token_type_ids_from_sequences with Roberta->Luke, RoBERTa->LUKE
     def create_token_type_ids_from_sequences(
         self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
     ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. LUKE does not
-        make use of token type ids, therefore a list of zeros is returned.
+        return PreTrainedTokenizer.create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1)
 
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
+    def _decode(
+        self,
+        token_ids: Union[int, list[int]],
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        **kwargs,
+    ) -> str:
+        text = super()._decode(
+            token_ids, skip_special_tokens=skip_special_tokens, clean_up_tokenization_spaces=False, **kwargs
+        )
 
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+        clean_up_tokenization_spaces = (
+            clean_up_tokenization_spaces
+            if clean_up_tokenization_spaces is not None
+            else self.clean_up_tokenization_spaces
+        )
+        if clean_up_tokenization_spaces:
+            text = (
+                text.replace(" .", ".")
+                .replace(" ?", "?")
+                .replace(" !", "!")
+                .replace(" ,", ",")
+                .replace(" ' ", "'")
+                .replace(" n't", "n't")
+                .replace(" 'm", "'m")
+                .replace(" 's", "'s")
+                .replace(" 've", "'ve")
+                .replace(" 're", "'re")
+            )
 
-    # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.prepare_for_tokenization with Roberta->Luke, RoBERTa->LUKE
-    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
-        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
-        if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
-            text = " " + text
-        return (text, kwargs)
+        return text
 
     @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def __call__(
@@ -581,6 +490,35 @@ def __call__(
         verbose: bool = True,
         **kwargs,
     ) -> BatchEncoding:
+        # Check for seq2seq parameters that are not supported with entity-aware encoding
+        if kwargs.get("text_target") is not None or kwargs.get("text_pair_target") is not None:
+            if entity_spans is not None or entities is not None or self.task is not None:
+                raise NotImplementedError(
+                    "text_target and text_pair_target are not supported when using entity-aware encoding. "
+                    "Please use the tokenizer without entities for seq2seq tasks."
+                )
+            # Delegate to parent for seq2seq encoding
+            return super().__call__(
+                text=text,
+                text_pair=text_pair,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                is_split_into_words=is_split_into_words,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
         """
         Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
         sequences, depending on the task you want to prepare them for.
@@ -622,9 +560,13 @@ def __call__(
         """
         # Input type checking for clearer error
         is_valid_single_text = isinstance(text, str)
-        is_valid_batch_text = isinstance(text, (list, tuple)) and (len(text) == 0 or (isinstance(text[0], str)))
+        is_valid_batch_text = isinstance(text, (list, tuple)) and (
+            len(text) == 0 or isinstance(text[0], (str, list, tuple))
+        )
         if not (is_valid_single_text or is_valid_batch_text):
-            raise ValueError("text input must be of type `str` (single example) or `list[str]` (batch).")
+            raise ValueError(
+                "text input must be of type `str` (single example), `list[str]` (batch), or `list[tuple]` (batch pairs)."
+            )
 
         is_valid_single_text_pair = isinstance(text_pair, str)
         is_valid_batch_text_pair = isinstance(text_pair, (list, tuple)) and (
@@ -635,6 +577,16 @@ def __call__(
 
         is_batched = bool(isinstance(text, (list, tuple)))
 
+        # Convert padding and truncation to strategies
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
         if is_batched:
             batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
             if entities is None:
@@ -651,13 +603,13 @@ def __call__(
                     list(zip(entity_spans, entity_spans_pair)) if entity_spans_pair is not None else entity_spans
                 )
 
-            return self.batch_encode_plus(
+            return self._batch_encode_plus(
                 batch_text_or_text_pairs=batch_text_or_text_pairs,
                 batch_entity_spans_or_entity_spans_pairs=batch_entity_spans_or_entity_spans_pairs,
                 batch_entities_or_entities_pairs=batch_entities_or_entities_pairs,
                 add_special_tokens=add_special_tokens,
-                padding=padding,
-                truncation=truncation,
+                padding_strategy=padding_strategy,
+                truncation_strategy=truncation_strategy,
                 max_length=max_length,
                 max_entity_length=max_entity_length,
                 stride=stride,
@@ -675,7 +627,7 @@ def __call__(
                 **kwargs,
             )
         else:
-            return self.encode_plus(
+            return self._encode_plus(
                 text=text,
                 text_pair=text_pair,
                 entity_spans=entity_spans,
@@ -683,8 +635,8 @@ def __call__(
                 entities=entities,
                 entities_pair=entities_pair,
                 add_special_tokens=add_special_tokens,
-                padding=padding,
-                truncation=truncation,
+                padding_strategy=padding_strategy,
+                truncation_strategy=truncation_strategy,
                 max_length=max_length,
                 max_entity_length=max_entity_length,
                 stride=stride,
@@ -729,15 +681,40 @@ def _encode_plus(
         verbose: bool = True,
         **kwargs,
     ) -> BatchEncoding:
-        if return_offsets_mapping:
-            raise NotImplementedError(
-                "return_offset_mapping is not available when using Python tokenizers. "
-                "To use this feature, change your tokenizer to one deriving from "
-                "transformers.PreTrainedTokenizerFast. "
-                "More information on available tokenizers at "
-                "https://github.com/huggingface/transformers/pull/2674"
+        # If no entities are provided and task doesn't require them, delegate to parent for proper Encoding support
+        if (
+            entity_spans is None
+            and entity_spans_pair is None
+            and entities is None
+            and entities_pair is None
+            and self.task is None
+        ):
+            # Delegate to parent TokenizersBackend which properly handles Encoding objects
+            return super()._encode_plus(
+                text=text,
+                text_pair=text_pair,
+                add_special_tokens=add_special_tokens,
+                padding_strategy=padding_strategy,
+                truncation_strategy=truncation_strategy,
+                max_length=max_length,
+                stride=stride,
+                is_split_into_words=is_split_into_words,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
             )
 
+        if return_offsets_mapping:
+            raise NotImplementedError("return_offset_mapping is not available when using entity-aware encoding.")
+
         if is_split_into_words:
             raise NotImplementedError("is_split_into_words is not supported in this tokenizer.")
 
@@ -812,13 +789,52 @@ def _batch_encode_plus(
         verbose: bool = True,
         **kwargs,
     ) -> BatchEncoding:
-        if return_offsets_mapping:
-            raise NotImplementedError(
-                "return_offset_mapping is not available when using Python tokenizers. "
-                "To use this feature, change your tokenizer to one deriving from "
-                "transformers.PreTrainedTokenizerFast."
+        # If no entities are provided and task doesn't require them, delegate to parent for proper Encoding support
+        if (
+            batch_entity_spans_or_entity_spans_pairs is None
+            and batch_entities_or_entities_pairs is None
+            and self.task is None
+        ):
+            # Parent's _encode_plus handles batching internally, so we reconstruct text/text_pair
+            # from batch_text_or_text_pairs and pass to parent's _encode_plus
+            # Detect if we have pairs
+            if batch_text_or_text_pairs and isinstance(batch_text_or_text_pairs[0], (tuple, list)):
+                # We have pairs
+                texts, text_pairs = zip(*batch_text_or_text_pairs)
+                texts = list(texts)
+                text_pairs = list(text_pairs)
+            else:
+                # Just texts
+                texts = batch_text_or_text_pairs
+                text_pairs = None
+
+            # Delegate to parent TokenizersBackend which properly handles Encoding objects for batches
+            return super()._encode_plus(
+                text=texts,
+                text_pair=text_pairs,
+                add_special_tokens=add_special_tokens,
+                padding_strategy=padding_strategy,
+                truncation_strategy=truncation_strategy,
+                max_length=max_length,
+                stride=stride,
+                is_split_into_words=is_split_into_words,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                split_special_tokens=kwargs.get("split_special_tokens", self.split_special_tokens),
+                **kwargs,
             )
 
+        if return_offsets_mapping:
+            raise NotImplementedError("return_offset_mapping is not available when using entity-aware encoding.")
+
         if is_split_into_words:
             raise NotImplementedError("is_split_into_words is not supported in this tokenizer.")
 
@@ -923,8 +939,9 @@ def _create_input_sequence(
         **kwargs,
     ) -> tuple[list, list, list, list, list, list]:
         def get_input_ids(text):
-            tokens = self.tokenize(text, **kwargs)
-            return self.convert_tokens_to_ids(tokens)
+            # Use the underlying tokenizer directly to avoid recursion
+            encoding = self._tokenizer.encode(text, add_special_tokens=False)
+            return encoding.ids
 
         def get_input_ids_and_entity_token_spans(text, entity_spans):
             if entity_spans is None:
@@ -1000,12 +1017,10 @@ def get_input_ids_and_entity_token_spans(text, entity_spans):
             # add special tokens to input ids
             entity_token_start, entity_token_end = first_entity_token_spans[0]
             first_ids = (
-                first_ids[:entity_token_end] + [self.additional_special_tokens_ids[0]] + first_ids[entity_token_end:]
+                first_ids[:entity_token_end] + [self.extra_special_tokens_ids[0]] + first_ids[entity_token_end:]
             )
             first_ids = (
-                first_ids[:entity_token_start]
-                + [self.additional_special_tokens_ids[0]]
-                + first_ids[entity_token_start:]
+                first_ids[:entity_token_start] + [self.extra_special_tokens_ids[0]] + first_ids[entity_token_start:]
             )
             first_entity_token_spans = [(entity_token_start, entity_token_end + 2)]
 
@@ -1027,8 +1042,8 @@ def get_input_ids_and_entity_token_spans(text, entity_spans):
 
             head_token_span, tail_token_span = first_entity_token_spans
             token_span_with_special_token_ids = [
-                (head_token_span, self.additional_special_tokens_ids[0]),
-                (tail_token_span, self.additional_special_tokens_ids[1]),
+                (head_token_span, self.extra_special_tokens_ids[0]),
+                (tail_token_span, self.extra_special_tokens_ids[1]),
             ]
             if head_token_span[0] < tail_token_span[0]:
                 first_entity_token_spans[0] = (head_token_span[0], head_token_span[1] + 2)
@@ -1689,41 +1704,5 @@ def _pad(
 
         return encoded_inputs
 
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-        merge_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
-        )
-
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
-
-        entity_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["entity_vocab_file"]
-        )
-
-        with open(entity_vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.entity_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
-
-        return vocab_file, merge_file, entity_vocab_file
-
 
 __all__ = ["LukeTokenizer"]
diff --git a/src/transformers/models/lxmert/__init__.py b/src/transformers/models/lxmert/__init__.py
index 8cde45820316..e8370a12eaf6 100644
--- a/src/transformers/models/lxmert/__init__.py
+++ b/src/transformers/models/lxmert/__init__.py
@@ -18,10 +18,9 @@
 
 
 if TYPE_CHECKING:
+    from ..bert.tokenization_bert import BertTokenizer as LxmertTokenizer
     from .configuration_lxmert import *
     from .modeling_lxmert import *
-    from .tokenization_lxmert import *
-    from .tokenization_lxmert_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/lxmert/tokenization_lxmert.py b/src/transformers/models/lxmert/tokenization_lxmert.py
deleted file mode 100644
index dd1d7e205ea5..000000000000
--- a/src/transformers/models/lxmert/tokenization_lxmert.py
+++ /dev/null
@@ -1,482 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import collections
-import os
-import unicodedata
-from typing import Optional
-
-from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
-from ...utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
-
-# Copied from transformers.models.bert.tokenization_bert.load_vocab
-def load_vocab(vocab_file):
-    """Loads a vocabulary file into a dictionary."""
-    vocab = collections.OrderedDict()
-    with open(vocab_file, "r", encoding="utf-8") as reader:
-        tokens = reader.readlines()
-    for index, token in enumerate(tokens):
-        token = token.rstrip("\n")
-        vocab[token] = index
-    return vocab
-
-
-# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
-def whitespace_tokenize(text):
-    """Runs basic whitespace cleaning and splitting on a piece of text."""
-    text = text.strip()
-    if not text:
-        return []
-    tokens = text.split()
-    return tokens
-
-
-# Copied from transformers.models.bert.tokenization_bert.BertTokenizer with bert-base-cased->unc-nlp/lxmert-base-uncased, BERT->Lxmert, BertTokenizer->LxmertTokenizer
-class LxmertTokenizer(PreTrainedTokenizer):
-    r"""
-    Construct a Lxmert tokenizer. Based on WordPiece.
-
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
-            Whether or not to do basic tokenization before WordPiece.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
-        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original Lxmert).
-        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
-            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
-            extra spaces.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-
-    def __init__(
-        self,
-        vocab_file,
-        do_lower_case=True,
-        do_basic_tokenize=True,
-        never_split=None,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        clean_up_tokenization_spaces=True,
-        **kwargs,
-    ):
-        if not os.path.isfile(vocab_file):
-            raise ValueError(
-                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
-                " model use `tokenizer = LxmertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
-            )
-        self.vocab = load_vocab(vocab_file)
-        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
-        self.do_basic_tokenize = do_basic_tokenize
-        if do_basic_tokenize:
-            self.basic_tokenizer = BasicTokenizer(
-                do_lower_case=do_lower_case,
-                never_split=never_split,
-                tokenize_chinese_chars=tokenize_chinese_chars,
-                strip_accents=strip_accents,
-            )
-
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
-
-        super().__init__(
-            do_lower_case=do_lower_case,
-            do_basic_tokenize=do_basic_tokenize,
-            never_split=never_split,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            tokenize_chinese_chars=tokenize_chinese_chars,
-            strip_accents=strip_accents,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            **kwargs,
-        )
-
-    @property
-    def do_lower_case(self):
-        return self.basic_tokenizer.do_lower_case
-
-    @property
-    def vocab_size(self):
-        return len(self.vocab)
-
-    def get_vocab(self):
-        return dict(self.vocab, **self.added_tokens_encoder)
-
-    def _tokenize(self, text, split_special_tokens=False):
-        split_tokens = []
-        if self.do_basic_tokenize:
-            for token in self.basic_tokenizer.tokenize(
-                text, never_split=self.all_special_tokens if not split_special_tokens else None
-            ):
-                # If the token is part of the never_split set
-                if token in self.basic_tokenizer.never_split:
-                    split_tokens.append(token)
-                else:
-                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
-        else:
-            split_tokens = self.wordpiece_tokenizer.tokenize(text)
-        return split_tokens
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.vocab.get(token, self.vocab.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.ids_to_tokens.get(index, self.unk_token)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        out_string = " ".join(tokens).replace(" ##", "").strip()
-        return out_string
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A Lxmert sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        index = 0
-        if os.path.isdir(save_directory):
-            vocab_file = os.path.join(
-                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-            )
-        else:
-            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
-        with open(vocab_file, "w", encoding="utf-8") as writer:
-            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
-                        " Please check that the vocabulary is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(token + "\n")
-                index += 1
-        return (vocab_file,)
-
-
-# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
-class BasicTokenizer:
-    """
-    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
-
-    Args:
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original BERT).
-        do_split_on_punc (`bool`, *optional*, defaults to `True`):
-            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
-            the full context of the words, such as contractions.
-    """
-
-    def __init__(
-        self,
-        do_lower_case=True,
-        never_split=None,
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        do_split_on_punc=True,
-    ):
-        if never_split is None:
-            never_split = []
-        self.do_lower_case = do_lower_case
-        self.never_split = set(never_split)
-        self.tokenize_chinese_chars = tokenize_chinese_chars
-        self.strip_accents = strip_accents
-        self.do_split_on_punc = do_split_on_punc
-
-    def tokenize(self, text, never_split=None):
-        """
-        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
-
-        Args:
-            never_split (`List[str]`, *optional*)
-                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
-        """
-        # union() returns a new set by concatenating the two sets.
-        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
-        text = self._clean_text(text)
-
-        # This was added on November 1st, 2018 for the multilingual and Chinese
-        # models. This is also applied to the English models now, but it doesn't
-        # matter since the English models were not trained on any Chinese data
-        # and generally don't have any Chinese data in them (there are Chinese
-        # characters in the vocabulary because Wikipedia does have some Chinese
-        # words in the English Wikipedia.).
-        if self.tokenize_chinese_chars:
-            text = self._tokenize_chinese_chars(text)
-        # prevents treating the same character with different unicode codepoints as different characters
-        unicode_normalized_text = unicodedata.normalize("NFC", text)
-        orig_tokens = whitespace_tokenize(unicode_normalized_text)
-        split_tokens = []
-        for token in orig_tokens:
-            if token not in never_split:
-                if self.do_lower_case:
-                    token = token.lower()
-                    if self.strip_accents is not False:
-                        token = self._run_strip_accents(token)
-                elif self.strip_accents:
-                    token = self._run_strip_accents(token)
-            split_tokens.extend(self._run_split_on_punc(token, never_split))
-
-        output_tokens = whitespace_tokenize(" ".join(split_tokens))
-        return output_tokens
-
-    def _run_strip_accents(self, text):
-        """Strips accents from a piece of text."""
-        text = unicodedata.normalize("NFD", text)
-        output = []
-        for char in text:
-            cat = unicodedata.category(char)
-            if cat == "Mn":
-                continue
-            output.append(char)
-        return "".join(output)
-
-    def _run_split_on_punc(self, text, never_split=None):
-        """Splits punctuation on a piece of text."""
-        if not self.do_split_on_punc or (never_split is not None and text in never_split):
-            return [text]
-        chars = list(text)
-        i = 0
-        start_new_word = True
-        output = []
-        while i < len(chars):
-            char = chars[i]
-            if _is_punctuation(char):
-                output.append([char])
-                start_new_word = True
-            else:
-                if start_new_word:
-                    output.append([])
-                start_new_word = False
-                output[-1].append(char)
-            i += 1
-
-        return ["".join(x) for x in output]
-
-    def _tokenize_chinese_chars(self, text):
-        """Adds whitespace around any CJK character."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if self._is_chinese_char(cp):
-                output.append(" ")
-                output.append(char)
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-    def _is_chinese_char(self, cp):
-        """Checks whether CP is the codepoint of a CJK character."""
-        # This defines a "chinese character" as anything in the CJK Unicode block:
-        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-        #
-        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-        # despite its name. The modern Korean Hangul alphabet is a different block,
-        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-        # space-separated words, so they are not treated specially and handled
-        # like the all of the other languages.
-        if (
-            (cp >= 0x4E00 and cp <= 0x9FFF)
-            or (cp >= 0x3400 and cp <= 0x4DBF)
-            or (cp >= 0x20000 and cp <= 0x2A6DF)
-            or (cp >= 0x2A700 and cp <= 0x2B73F)
-            or (cp >= 0x2B740 and cp <= 0x2B81F)
-            or (cp >= 0x2B820 and cp <= 0x2CEAF)
-            or (cp >= 0xF900 and cp <= 0xFAFF)
-            or (cp >= 0x2F800 and cp <= 0x2FA1F)
-        ):
-            return True
-
-        return False
-
-    def _clean_text(self, text):
-        """Performs invalid character removal and whitespace cleanup on text."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if cp == 0 or cp == 0xFFFD or _is_control(char):
-                continue
-            if _is_whitespace(char):
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-
-# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
-class WordpieceTokenizer:
-    """Runs WordPiece tokenization."""
-
-    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
-        self.vocab = vocab
-        self.unk_token = unk_token
-        self.max_input_chars_per_word = max_input_chars_per_word
-
-    def tokenize(self, text):
-        """
-        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
-        tokenization using the given vocabulary.
-
-        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
-
-        Args:
-            text: A single token or whitespace separated tokens. This should have
-                already been passed through *BasicTokenizer*.
-
-        Returns:
-            A list of wordpiece tokens.
-        """
-
-        output_tokens = []
-        for token in whitespace_tokenize(text):
-            chars = list(token)
-            if len(chars) > self.max_input_chars_per_word:
-                output_tokens.append(self.unk_token)
-                continue
-
-            is_bad = False
-            start = 0
-            sub_tokens = []
-            while start < len(chars):
-                end = len(chars)
-                cur_substr = None
-                while start < end:
-                    substr = "".join(chars[start:end])
-                    if start > 0:
-                        substr = "##" + substr
-                    if substr in self.vocab:
-                        cur_substr = substr
-                        break
-                    end -= 1
-                if cur_substr is None:
-                    is_bad = True
-                    break
-                sub_tokens.append(cur_substr)
-                start = end
-
-            if is_bad:
-                output_tokens.append(self.unk_token)
-            else:
-                output_tokens.extend(sub_tokens)
-        return output_tokens
-
-
-__all__ = ["LxmertTokenizer"]
diff --git a/src/transformers/models/lxmert/tokenization_lxmert_fast.py b/src/transformers/models/lxmert/tokenization_lxmert_fast.py
deleted file mode 100644
index fcfa3263acda..000000000000
--- a/src/transformers/models/lxmert/tokenization_lxmert_fast.py
+++ /dev/null
@@ -1,143 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-from typing import Optional
-
-from tokenizers import normalizers
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from .tokenization_lxmert import LxmertTokenizer
-
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
-
-
-# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast with bert-base-cased->unc-nlp/lxmert-base-uncased, BERT->Lxmert, Bert->Lxmert
-class LxmertTokenizerFast(PreTrainedTokenizerFast):
-    r"""
-    Construct a "fast" Lxmert tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        clean_text (`bool`, *optional*, defaults to `True`):
-            Whether or not to clean the text before tokenization by removing any control characters and replacing all
-            whitespaces by the classic one.
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
-            issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original Lxmert).
-        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
-            The prefix for subwords.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = LxmertTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        do_lower_case=True,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            do_lower_case=do_lower_case,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            tokenize_chinese_chars=tokenize_chinese_chars,
-            strip_accents=strip_accents,
-            **kwargs,
-        )
-
-        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
-        if (
-            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
-            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
-            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
-        ):
-            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
-            normalizer_state["lowercase"] = do_lower_case
-            normalizer_state["strip_accents"] = strip_accents
-            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
-            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
-
-        self.do_lower_case = do_lower_case
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A Lxmert sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-
-        if token_ids_1 is not None:
-            output += token_ids_1 + [self.sep_token_id]
-
-        return output
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-
-__all__ = ["LxmertTokenizerFast"]
diff --git a/src/transformers/models/m2m_100/tokenization_m2m_100.py b/src/transformers/models/m2m_100/tokenization_m2m_100.py
index ccced10f2bac..5bbdd4575b7d 100644
--- a/src/transformers/models/m2m_100/tokenization_m2m_100.py
+++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py
@@ -21,7 +21,7 @@
 
 import sentencepiece
 
-from ...tokenization_utils import BatchEncoding, PreTrainedTokenizer
+from ...tokenization_python import BatchEncoding, PreTrainedTokenizer
 from ...utils import logging
 from ...utils.import_utils import requires
 
diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py
index 66a3630ffd56..68f22fecffc7 100644
--- a/src/transformers/models/marian/tokenization_marian.py
+++ b/src/transformers/models/marian/tokenization_marian.py
@@ -20,7 +20,7 @@
 
 import sentencepiece
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_python import PreTrainedTokenizer
 from ...utils import logging
 from ...utils.import_utils import requires
 
@@ -128,7 +128,6 @@ def __init__(
         self.encoder = load_json(vocab)
         if str(unk_token) not in self.encoder:
             raise KeyError(" token must be in the vocab")
-        assert str(pad_token) in self.encoder
 
         if separate_vocabs:
             self.target_encoder = load_json(target_vocab_file)
@@ -152,6 +151,8 @@ def __init__(
 
         self._setup_normalizer()
 
+        self._decode_use_source_tokenizer = False
+
         super().__init__(
             # bos_token=bos_token,  unused. Start decoding with config.decoder_start_token_id
             source_lang=source_lang,
@@ -180,7 +181,11 @@ def normalize(self, x: str) -> str:
         return self.punc_normalizer(x) if x else ""
 
     def _convert_token_to_id(self, token):
-        return self.current_encoder.get(token, self.current_encoder[self.unk_token])
+        if token in self.current_encoder:
+            return self.current_encoder[token]
+        # The Marian vocab is not aligned with the SentencePiece IDs, so falling back to raw
+        # SentencePiece indices would map to unrelated tokens. Treat such pieces as unknown.
+        return self.current_encoder[self.unk_token]
 
     def remove_language_code(self, text: str):
         """Remove language codes like >>fr<< before sentencepiece"""
@@ -197,7 +202,12 @@ def _tokenize(self, text: str) -> list[str]:
 
     def _convert_id_to_token(self, index: int) -> str:
         """Converts an index (integer) in a token (str) using the decoder."""
-        return self.decoder.get(index, self.unk_token)
+        if index in self.decoder:
+            return self.decoder[index]
+        # Fall back to SPM model for IDs not in external vocab
+        spm_model = self.spm_source if self._decode_use_source_tokenizer else self.spm_target
+        piece = spm_model.IdToPiece(index)
+        return piece if piece else self.unk_token
 
     def batch_decode(self, sequences, **kwargs):
         """
@@ -248,6 +258,23 @@ def decode(self, token_ids, **kwargs):
         """
         return super().decode(token_ids, **kwargs)
 
+    def _decode(
+        self,
+        token_ids,
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        **kwargs,
+    ) -> str:
+        """Internal decode method that handles use_source_tokenizer parameter."""
+        default_use_source = not self.separate_vocabs
+        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", default_use_source)
+        return super()._decode(
+            token_ids=token_ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
     def convert_tokens_to_string(self, tokens: list[str]) -> str:
         """Uses source spm if _decode_use_source_tokenizer is True, and target spm otherwise"""
         sp_model = self.spm_source if self._decode_use_source_tokenizer else self.spm_target
@@ -351,6 +378,8 @@ def __setstate__(self, d: dict) -> None:
         # for backward compatibility
         if not hasattr(self, "sp_model_kwargs"):
             self.sp_model_kwargs = {}
+        if not hasattr(self, "_decode_use_source_tokenizer"):
+            self._decode_use_source_tokenizer = False
 
         self.spm_source, self.spm_target = (load_spm(f, self.sp_model_kwargs) for f in self.spm_files)
         self.current_spm = self.spm_source
diff --git a/src/transformers/models/markuplm/__init__.py b/src/transformers/models/markuplm/__init__.py
index 8b03aa2e6256..ca084acef41b 100644
--- a/src/transformers/models/markuplm/__init__.py
+++ b/src/transformers/models/markuplm/__init__.py
@@ -22,8 +22,6 @@
     from .feature_extraction_markuplm import *
     from .modeling_markuplm import *
     from .processing_markuplm import *
-    from .tokenization_markuplm import *
-    from .tokenization_markuplm_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/markuplm/tokenization_markuplm.py b/src/transformers/models/markuplm/tokenization_markuplm.py
index 0a6f7c3bd6a0..597008c3a97e 100644
--- a/src/transformers/models/markuplm/tokenization_markuplm.py
+++ b/src/transformers/models/markuplm/tokenization_markuplm.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+# Copyright 2022 The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,27 +12,26 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization class for MarkupLM."""
 
-import json
-import os
-from functools import lru_cache
 from typing import Optional, Union
 
-import regex as re
+from tokenizers import Tokenizer, decoders, pre_tokenizers, processors
+from tokenizers.models import BPE
 
-from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
 from ...tokenization_utils_base import (
     ENCODE_KWARGS_DOCSTRING,
+    AddedToken,
     BatchEncoding,
     EncodedInput,
+    PaddingStrategy,
     PreTokenizedInput,
+    TensorType,
     TextInput,
     TextInputPair,
     TruncationStrategy,
 )
-from ...utils import logging
+from ...tokenization_utils_tokenizers import TokenizersBackend
+from ...utils import add_end_docstrings, logging
 
 
 logger = logging.get_logger(__name__)
@@ -43,7 +42,7 @@
 MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
             add_special_tokens (`bool`, *optional*, defaults to `True`):
                 Whether or not to encode the sequences with the special tokens relative to their model.
-            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
+            padding (`bool`, `str` or [`~tokenization_utils_base.PaddingStrategy`], *optional*, defaults to `False`):
                 Activates and controls padding. Accepts the following values:
 
                 - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
@@ -77,10 +76,13 @@
                 `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
                 returned to provide some overlap between truncated and overflowing sequences. The value of this
                 argument defines the number of overlapping tokens.
+            is_split_into_words (`bool`, *optional*, defaults to `False`):
+                Whether or not the input is already pretokenized (e.g. split into words). Set this to `True` if you are
+                passing pretokenized inputs to avoid additional tokenization.
             pad_to_multiple_of (`int`, *optional*):
                 If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                 the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
-            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
+            return_tensors (`str` or [`~tokenization_utils_base.TensorType`], *optional*):
                 If set, will return tensors instead of list of python integers. Acceptable values are:
 
                 - `'pt'`: Return PyTorch `torch.Tensor` objects.
@@ -88,47 +90,14 @@
 """
 
 
-@lru_cache
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
-    characters the bpe code barfs on. The reversible bpe codes work on unicode strings. This means you need a large #
-    of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset
-    you end up needing around 5K for decent coverage. This is a significant percentage of your normal, say, 32K bpe
-    vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-def get_pairs(word):
-    """
-    Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length
-    strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
+class MarkupLMTokenizer(TokenizersBackend):
+    r"""
+    Construct a MarkupLM tokenizer. Based on byte-level Byte-Pair-Encoding (BPE).
 
+    [`MarkupLMTokenizer`] can be used to turn HTML strings into to token-level `input_ids`, `attention_mask`,
+    `token_type_ids`, `xpath_tags_seq` and `xpath_tags_seq`. This tokenizer inherits from [`TokenizersBackend`] which
+    contains most of the main methods and ensures a `tokenizers` backend is always instantiated.
 
-class MarkupLMTokenizer(PreTrainedTokenizer):
-    r"""
-    Construct a MarkupLM tokenizer. Based on byte-level Byte-Pair-Encoding (BPE). [`MarkupLMTokenizer`] can be used to
-    turn HTML strings into to token-level `input_ids`, `attention_mask`, `token_type_ids`, `xpath_tags_seq` and
-    `xpath_tags_seq`. This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
@@ -183,9 +152,9 @@ class MarkupLMTokenizer(PreTrainedTokenizer):
 
     def __init__(
         self,
-        vocab_file,
-        merges_file,
         tags_dict,
+        vocab: Optional[Union[dict, list]] = None,
+        merges: Optional[list] = None,
         errors="replace",
         bos_token="",
         eos_token="",
@@ -200,8 +169,14 @@ def __init__(
         pad_width=1001,
         pad_token_label=-100,
         only_label_first_subword=True,
+        trim_offsets=False,
         **kwargs,
     ):
+        if kwargs.get("from_slow"):
+            logger.warning(
+                "MarkupLMTokenizer no longer supports initialization from a slow tokenizer. Ignoring `from_slow=True`."
+            )
+        kwargs["from_slow"] = False
         bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
         eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
         sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
@@ -212,37 +187,65 @@ def __init__(
         # Mask token behave like a normal word, i.e. include the space before it
         mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
 
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-
-        self.tags_dict = tags_dict
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            bpe_merges = merges_handle.read().split("\n")[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        self.cache = {}
-        self.add_prefix_space = add_prefix_space
-
-        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
-        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
-
-        # additional properties
-        self.max_depth = max_depth
-        self.max_width = max_width
-        self.pad_width = pad_width
-        self.unk_tag_id = len(self.tags_dict)
-        self.pad_tag_id = self.unk_tag_id + 1
-        self.pad_xpath_tags_seq = [self.pad_tag_id] * self.max_depth
-        self.pad_xpath_subs_seq = [self.pad_width] * self.max_depth
+        processed_vocab = vocab
+        processed_merges = merges
+
+        if isinstance(processed_vocab, list):
+            processed_vocab = {
+                token: index for index, (token, _score) in enumerate(processed_vocab) if isinstance(token, str)
+            }
+        elif isinstance(processed_vocab, dict):
+            processed_vocab = {str(token): int(index) for token, index in processed_vocab.items()}
+
+        if processed_vocab is None:
+            processed_vocab = {
+                str(pad_token): 0,
+                str(unk_token): 1,
+                str(cls_token): 2,
+                str(sep_token): 3,
+                str(mask_token): 4,
+            }
+
+        normalized_merges = []
+        if processed_merges is not None:
+            for merge in processed_merges:
+                if isinstance(merge, tuple) and len(merge) == 2:
+                    normalized_merges.append((merge[0], merge[1]))
+                elif isinstance(merge, list) and len(merge) == 2:
+                    normalized_merges.append((merge[0], merge[1]))
+                elif isinstance(merge, str):
+                    parts = merge.split()
+                    if len(parts) == 2 and not merge.startswith("#"):
+                        normalized_merges.append((parts[0], parts[1]))
+        processed_merges = normalized_merges if normalized_merges else []
+
+        tokenizer = Tokenizer(
+            BPE(
+                vocab=processed_vocab,
+                merges=processed_merges,
+                dropout=None,
+                continuing_subword_prefix="",
+                end_of_word_suffix="",
+                fuse_unk=False,
+            )
+        )
+        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
+        tokenizer.decoder = decoders.ByteLevel()
+
+        sep_token_str = str(sep_token)
+        cls_token_str = str(cls_token)
+        tokenizer.post_processor = processors.RobertaProcessing(
+            sep=(sep_token_str, processed_vocab.get(sep_token_str, processed_vocab.get("", 2))),
+            cls=(cls_token_str, processed_vocab.get(cls_token_str, processed_vocab.get("", 0))),
+            add_prefix_space=add_prefix_space,
+            trim_offsets=trim_offsets,
+        )
 
         super().__init__(
-            vocab_file=vocab_file,
-            merges_file=merges_file,
+            tokenizer_object=tokenizer,
             tags_dict=tags_dict,
+            vocab=vocab,
+            merges=merges,
             errors=errors,
             bos_token=bos_token,
             eos_token=eos_token,
@@ -252,6 +255,7 @@ def __init__(
             pad_token=pad_token,
             mask_token=mask_token,
             add_prefix_space=add_prefix_space,
+            trim_offsets=trim_offsets,
             max_depth=max_depth,
             max_width=max_width,
             pad_width=pad_width,
@@ -259,7 +263,25 @@ def __init__(
             only_label_first_subword=only_label_first_subword,
             **kwargs,
         )
+        if trim_offsets:
+            # Not implemented yet, because we need to chain two post processors which is not possible yet
+            # We need to wait for https://github.com/huggingface/tokenizers/pull/1005
+            # With `trim_offsets=False` we don't need to do add `processors.ByteLevel(trim_offsets=False)`
+            # because it's not doing anything
+            raise NotImplementedError(
+                "`trim_offsets=True` is not implemented for MarkupLMTokenizer. Please set it to False."
+            )
 
+        self.tags_dict = tags_dict
+
+        # additional properties
+        self.max_depth = max_depth
+        self.max_width = max_width
+        self.pad_width = pad_width
+        self.unk_tag_id = len(self.tags_dict)
+        self.pad_tag_id = self.unk_tag_id + 1
+        self.pad_xpath_tags_seq = [self.pad_tag_id] * self.max_depth
+        self.pad_xpath_subs_seq = [self.pad_width] * self.max_depth
         self.pad_token_label = pad_token_label
         self.only_label_first_subword = only_label_first_subword
 
@@ -288,206 +310,6 @@ def get_xpath_seq(self, xpath):
 
         return xpath_tags_list, xpath_subs_list
 
-    @property
-    def vocab_size(self):
-        return len(self.encoder)
-
-    def get_vocab(self):
-        vocab = self.encoder.copy()
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                except ValueError:
-                    new_word.extend(word[i:])
-                    break
-                else:
-                    new_word.extend(word[i:j])
-                    i = j
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        self.cache[token] = word
-        return word
-
-    def _tokenize(self, text):
-        """Tokenize a string."""
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            token = "".join(
-                self.byte_encoder[b] for b in token.encode("utf-8")
-            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
-            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
-        return bpe_tokens
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        logger.warning(
-            "MarkupLM now does not support generative tasks, decoding is experimental and subject to change."
-        )
-        text = "".join(tokens)
-        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
-        return text
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-        merge_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
-        )
-
-        # save vocab_file
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
-
-        # save merge_file
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
-
-        return vocab_file, merge_file
-
-    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
-        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
-        if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
-            text = " " + text
-        return (text, kwargs)
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A RoBERTa sequence has the following format:
-        - single sequence: ` X `
-        - pair of sequences: ` A  B `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def build_xpath_tags_with_special_tokens(
-        self, xpath_tags_0: list[int], xpath_tags_1: Optional[list[int]] = None
-    ) -> list[int]:
-        pad = [self.pad_xpath_tags_seq]
-        if len(xpath_tags_1) == 0:
-            return pad + xpath_tags_0 + pad
-        return pad + xpath_tags_0 + pad + xpath_tags_1 + pad
-
-    def build_xpath_subs_with_special_tokens(
-        self, xpath_subs_0: list[int], xpath_subs_1: Optional[list[int]] = None
-    ) -> list[int]:
-        pad = [self.pad_xpath_subs_seq]
-        if len(xpath_subs_1) == 0:
-            return pad + xpath_subs_0 + pad
-        return pad + xpath_subs_0 + pad + xpath_subs_1 + pad
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Args:
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
-        make use of token type ids, therefore a list of zeros is returned.
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-        Returns:
-            `list[int]`: List of zeros.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + token_ids_1 + sep) * [0]
-
     @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def __call__(
         self,
@@ -500,6 +322,7 @@ def __call__(
         truncation: Union[bool, str, TruncationStrategy] = None,
         max_length: Optional[int] = None,
         stride: int = 0,
+        is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
         padding_side: Optional[str] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
@@ -514,42 +337,71 @@ def __call__(
     ) -> BatchEncoding:
         """
         Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
-        sequences with node-level xpaths and optional labels.
+        sequences with nodes, xpaths and optional labels.
 
         Args:
             text (`str`, `list[str]`, `list[list[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
-                (nodes of a single example or questions of a batch of examples) or a list of list of strings (batch of
-                nodes).
+                (words of a single example or questions of a batch of examples) or a list of list of strings (batch of
+                words).
             text_pair (`list[str]`, `list[list[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
                 (pretokenized string).
             xpaths (`list[list[int]]`, `list[list[list[int]]]`):
-                Node-level xpaths.
+                Node-level xpaths. Each bounding box should be normalized to be on a 0-1000 scale.
             node_labels (`list[int]`, `list[list[int]]`, *optional*):
                 Node-level integer labels (for token classification tasks).
+            is_split_into_words (`bool`, *optional*):
+                Set to `True` if the inputs are already provided as pretokenized word lists.
         """
 
-        # Input type checking for clearer error
+        placeholder_xpath = "/document/node"
+
+        if isinstance(text, tuple):
+            text = list(text)
+        if text_pair is not None and isinstance(text_pair, tuple):
+            text_pair = list(text_pair)
+
+        if xpaths is None and not is_split_into_words:
+            nodes_source = text if text_pair is None else text_pair
+            if isinstance(nodes_source, tuple):
+                nodes_source = list(nodes_source)
+            processed_nodes = nodes_source
+
+            if isinstance(nodes_source, str):
+                processed_nodes = nodes_source.split()
+            elif isinstance(nodes_source, list):
+                if nodes_source and isinstance(nodes_source[0], str):
+                    requires_split = any(" " in entry for entry in nodes_source)
+                    if requires_split:
+                        processed_nodes = [entry.split() for entry in nodes_source]
+                    else:
+                        processed_nodes = nodes_source
+                elif nodes_source and isinstance(nodes_source[0], tuple):
+                    processed_nodes = [list(sample) for sample in nodes_source]
+
+            if text_pair is None:
+                text = processed_nodes
+            else:
+                text_pair = processed_nodes
+
+            if isinstance(processed_nodes, list) and processed_nodes and isinstance(processed_nodes[0], (list, tuple)):
+                xpaths = [[placeholder_xpath] * len(sample) for sample in processed_nodes]
+            else:
+                length = len(processed_nodes) if hasattr(processed_nodes, "__len__") else 0
+                xpaths = [placeholder_xpath] * length
+
         def _is_valid_text_input(t):
             if isinstance(t, str):
-                # Strings are fine
                 return True
-            elif isinstance(t, (list, tuple)):
-                # List are fine as long as they are...
+            if isinstance(t, (list, tuple)):
                 if len(t) == 0:
-                    # ... empty
                     return True
-                elif isinstance(t[0], str):
-                    # ... list of strings
+                if isinstance(t[0], str):
                     return True
-                elif isinstance(t[0], (list, tuple)):
-                    # ... list with an empty list or with a list of strings
+                if isinstance(t[0], (list, tuple)):
                     return len(t[0]) == 0 or isinstance(t[0][0], str)
-                else:
-                    return False
-            else:
-                return False
+            return False
 
         if text_pair is not None:
             # in case text + text_pair are provided, text = questions, text_pair = nodes
@@ -560,6 +412,7 @@ def _is_valid_text_input(t):
                     "Nodes must be of type `list[str]` (single pretokenized example), "
                     "or `list[list[str]]` (batch of pretokenized examples)."
                 )
+            is_batched = isinstance(text, (list, tuple))
         else:
             # in case only text is provided => must be nodes
             if not isinstance(text, (list, tuple)):
@@ -567,10 +420,6 @@ def _is_valid_text_input(t):
                     "Nodes must be of type `list[str]` (single pretokenized example), "
                     "or `list[list[str]]` (batch of pretokenized examples)."
                 )
-
-        if text_pair is not None:
-            is_batched = isinstance(text, (list, tuple))
-        else:
             is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
 
         nodes = text if text_pair is None else text_pair
@@ -697,185 +546,13 @@ def batch_encode_plus(
             **kwargs,
         )
 
-    def _batch_encode_plus(
-        self,
-        batch_text_or_text_pairs: Union[
-            list[TextInput],
-            list[TextInputPair],
-            list[PreTokenizedInput],
-        ],
-        is_pair: Optional[bool] = None,
-        xpaths: Optional[list[list[list[int]]]] = None,
-        node_labels: Optional[list[list[int]]] = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        if return_offsets_mapping:
-            raise NotImplementedError(
-                "return_offset_mapping is not available when using Python tokenizers. "
-                "To use this feature, change your tokenizer to one deriving from "
-                "transformers.PreTrainedTokenizerFast."
-            )
-
-        batch_outputs = self._batch_prepare_for_model(
-            batch_text_or_text_pairs=batch_text_or_text_pairs,
-            is_pair=is_pair,
-            xpaths=xpaths,
-            node_labels=node_labels,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_attention_mask=return_attention_mask,
-            return_token_type_ids=return_token_type_ids,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_length=return_length,
-            return_tensors=return_tensors,
-            verbose=verbose,
-        )
-
-        return BatchEncoding(batch_outputs)
-
-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    def _batch_prepare_for_model(
-        self,
-        batch_text_or_text_pairs,
-        is_pair: Optional[bool] = None,
-        xpaths: Optional[list[list[int]]] = None,
-        node_labels: Optional[list[list[int]]] = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[str] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-    ) -> BatchEncoding:
-        """
-        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
-        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
-        manages a moving window (with user defined stride) for overflowing tokens.
-
-        Args:
-            batch_ids_pairs: list of tokenized input ids or input ids pairs
-        """
-
-        batch_outputs = {}
-        for idx, example in enumerate(zip(batch_text_or_text_pairs, xpaths)):
-            batch_text_or_text_pair, xpaths_example = example
-            outputs = self.prepare_for_model(
-                batch_text_or_text_pair[0] if is_pair else batch_text_or_text_pair,
-                batch_text_or_text_pair[1] if is_pair else None,
-                xpaths_example,
-                node_labels=node_labels[idx] if node_labels is not None else None,
-                add_special_tokens=add_special_tokens,
-                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
-                truncation=truncation_strategy.value,
-                max_length=max_length,
-                stride=stride,
-                pad_to_multiple_of=None,  # we pad in batch afterward
-                padding_side=None,  # we pad in batch afterward
-                return_attention_mask=False,  # we pad in batch afterward
-                return_token_type_ids=return_token_type_ids,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_length=return_length,
-                return_tensors=None,  # We convert the whole batch to tensors at the end
-                prepend_batch_axis=False,
-                verbose=verbose,
-            )
-
-            for key, value in outputs.items():
-                if key not in batch_outputs:
-                    batch_outputs[key] = []
-                batch_outputs[key].append(value)
-
-        batch_outputs = self.pad(
-            batch_outputs,
-            padding=padding_strategy.value,
-            max_length=max_length,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_attention_mask=return_attention_mask,
-        )
-
-        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
-
-        return batch_outputs
-
-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING)
-    def encode(
-        self,
-        text: Union[TextInput, PreTokenizedInput],
-        text_pair: Optional[PreTokenizedInput] = None,
-        xpaths: Optional[list[list[int]]] = None,
-        node_labels: Optional[list[int]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> list[int]:
-        encoded_inputs = self.encode_plus(
-            text=text,
-            text_pair=text_pair,
-            xpaths=xpaths,
-            node_labels=node_labels,
-            add_special_tokens=add_special_tokens,
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            **kwargs,
+    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
+        batched_input = [(text, pair)] if pair else [text]
+        encodings = self._tokenizer.encode_batch(
+            batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
         )
 
-        return encoded_inputs["input_ids"]
+        return encodings[0].tokens
 
     @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def encode_plus(
@@ -909,8 +586,8 @@ def encode_plus(
             text (`str`, `list[str]`, `list[list[str]]`):
                 The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
             text_pair (`list[str]` or `list[int]`, *optional*):
-                Optional second sequence to be encoded. This can be a list of strings (nodes of a single example) or a
-                list of list of strings (nodes of a batch of examples).
+                Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
+                list of list of strings (words of a batch of examples).
         """
 
         # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
@@ -946,12 +623,16 @@ def encode_plus(
             **kwargs,
         )
 
-    def _encode_plus(
+    def _batch_encode_plus(
         self,
-        text: Union[TextInput, PreTokenizedInput],
-        text_pair: Optional[PreTokenizedInput] = None,
-        xpaths: Optional[list[list[int]]] = None,
-        node_labels: Optional[list[int]] = None,
+        batch_text_or_text_pairs: Union[
+            list[TextInput],
+            list[TextInputPair],
+            list[PreTokenizedInput],
+        ],
+        is_pair: Optional[bool] = None,
+        xpaths: Optional[list[list[list[int]]]] = None,
+        node_labels: Optional[list[list[int]]] = None,
         add_special_tokens: bool = True,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
@@ -959,7 +640,7 @@ def _encode_plus(
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
         padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
         return_overflowing_tokens: bool = False,
@@ -967,54 +648,174 @@ def _encode_plus(
         return_offsets_mapping: bool = False,
         return_length: bool = False,
         verbose: bool = True,
-        **kwargs,
     ) -> BatchEncoding:
-        if return_offsets_mapping:
-            raise NotImplementedError(
-                "return_offset_mapping is not available when using Python tokenizers. "
-                "To use this feature, change your tokenizer to one deriving from "
-                "transformers.PreTrainedTokenizerFast. "
-                "More information on available tokenizers at "
-                "https://github.com/huggingface/transformers/pull/2674"
-            )
+        if not isinstance(batch_text_or_text_pairs, list):
+            raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")
 
-        return self.prepare_for_model(
-            text=text,
-            text_pair=text_pair,
-            xpaths=xpaths,
-            node_labels=node_labels,
-            add_special_tokens=add_special_tokens,
-            padding=padding_strategy.value,
-            truncation=truncation_strategy.value,
+        # Set the truncation and padding strategy and restore the initial configuration
+        self.set_truncation_and_padding(
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
             padding_side=padding_side,
-            return_tensors=return_tensors,
-            prepend_batch_axis=True,
-            return_attention_mask=return_attention_mask,
-            return_token_type_ids=return_token_type_ids,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_length=return_length,
-            verbose=verbose,
         )
 
-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    def prepare_for_model(
+        if is_pair:
+            processed_inputs = []
+            for text, text_pair in batch_text_or_text_pairs:
+                if isinstance(text, tuple):
+                    text = list(text)
+                if isinstance(text, str):
+                    text = [text]
+                if isinstance(text_pair, tuple):
+                    text_pair = list(text_pair)
+                if isinstance(text_pair, str):
+                    text_pair = [text_pair]
+                processed_inputs.append((text, text_pair))
+            batch_text_or_text_pairs = processed_inputs
+        else:
+            processed_inputs = []
+            for text in batch_text_or_text_pairs:
+                if isinstance(text, tuple):
+                    text = list(text)
+                if isinstance(text, str):
+                    text = [text]
+                processed_inputs.append(text)
+            batch_text_or_text_pairs = processed_inputs
+
+        encodings = self._tokenizer.encode_batch(
+            batch_text_or_text_pairs,
+            add_special_tokens=add_special_tokens,
+            is_pretokenized=True,  # we set this to True as MarkupLM always expects pretokenized inputs
+        )
+
+        # Convert encoding to dict
+        # `Tokens` is a tuple of (list[dict[str, list[list[int]]]] or list[dict[str, 2D-Tensor]],
+        #  list[EncodingFast]) with nested dimensions corresponding to batch, overflows, sequence length
+        tokens_and_encodings = [
+            self._convert_encoding(
+                encoding=encoding,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=True
+                if node_labels is not None
+                else return_offsets_mapping,  # we use offsets to create the labels
+                return_length=return_length,
+                verbose=verbose,
+            )
+            for encoding in encodings
+        ]
+
+        # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
+        # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
+        # (we say ~ because the number of overflow varies with the example in the batch)
+        #
+        # To match each overflowing sample with the original sample in the batch
+        # we add an overflow_to_sample_mapping array (see below)
+        sanitized_tokens = {}
+        for key in tokens_and_encodings[0][0]:
+            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
+            sanitized_tokens[key] = stack
+        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
+
+        # If returning overflowing tokens, we need to return a mapping
+        # from the batch idx to the original sample
+        if return_overflowing_tokens:
+            overflow_to_sample_mapping = []
+            for i, (toks, _) in enumerate(tokens_and_encodings):
+                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
+            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping
+
+        for input_ids in sanitized_tokens["input_ids"]:
+            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
+
+        # create the token-level xpaths tags and subscripts
+        xpath_tags_seq = []
+        xpath_subs_seq = []
+        for batch_index in range(len(sanitized_tokens["input_ids"])):
+            if return_overflowing_tokens:
+                original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
+            else:
+                original_index = batch_index
+            xpath_tags_seq_example = []
+            xpath_subs_seq_example = []
+            for id, sequence_id, word_id in zip(
+                sanitized_tokens["input_ids"][batch_index],
+                sanitized_encodings[batch_index].sequence_ids,
+                sanitized_encodings[batch_index].word_ids,
+            ):
+                if word_id is not None:
+                    if is_pair and sequence_id == 0:
+                        xpath_tags_seq_example.append(self.pad_xpath_tags_seq)
+                        xpath_subs_seq_example.append(self.pad_xpath_subs_seq)
+                    else:
+                        xpath_tags_list, xpath_subs_list = self.get_xpath_seq(xpaths[original_index][word_id])
+                        xpath_tags_seq_example.extend([xpath_tags_list])
+                        xpath_subs_seq_example.extend([xpath_subs_list])
+                else:
+                    if id in [self.cls_token_id, self.sep_token_id, self.pad_token_id]:
+                        xpath_tags_seq_example.append(self.pad_xpath_tags_seq)
+                        xpath_subs_seq_example.append(self.pad_xpath_subs_seq)
+                    else:
+                        raise ValueError("Id not recognized")
+            xpath_tags_seq.append(xpath_tags_seq_example)
+            xpath_subs_seq.append(xpath_subs_seq_example)
+
+        sanitized_tokens["xpath_tags_seq"] = xpath_tags_seq
+        sanitized_tokens["xpath_subs_seq"] = xpath_subs_seq
+
+        # optionally, create the labels
+        if node_labels is not None:
+            labels = []
+            for batch_index in range(len(sanitized_tokens["input_ids"])):
+                if return_overflowing_tokens:
+                    original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
+                else:
+                    original_index = batch_index
+                labels_example = []
+                for id, offset, word_id in zip(
+                    sanitized_tokens["input_ids"][batch_index],
+                    sanitized_tokens["offset_mapping"][batch_index],
+                    sanitized_encodings[batch_index].word_ids,
+                ):
+                    if word_id is not None:
+                        if self.only_label_first_subword:
+                            if offset[0] == 0:
+                                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
+                                labels_example.append(node_labels[original_index][word_id])
+                            else:
+                                labels_example.append(self.pad_token_label)
+                        else:
+                            labels_example.append(node_labels[original_index][word_id])
+                    else:
+                        labels_example.append(self.pad_token_label)
+                labels.append(labels_example)
+
+            sanitized_tokens["labels"] = labels
+            # finally, remove offsets if the user didn't want them
+            if not return_offsets_mapping:
+                del sanitized_tokens["offset_mapping"]
+
+        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
+
+    def _encode_plus(
         self,
         text: Union[TextInput, PreTokenizedInput],
         text_pair: Optional[PreTokenizedInput] = None,
         xpaths: Optional[list[list[int]]] = None,
         node_labels: Optional[list[int]] = None,
         add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
         max_length: Optional[int] = None,
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
         padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_tensors: Optional[bool] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
         return_overflowing_tokens: bool = False,
@@ -1022,350 +823,76 @@ def prepare_for_model(
         return_offsets_mapping: bool = False,
         return_length: bool = False,
         verbose: bool = True,
-        prepend_batch_axis: bool = False,
         **kwargs,
     ) -> BatchEncoding:
-        """
-        Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens,
-        truncates sequences if overflowing while taking into account the special tokens and manages a moving window
-        (with user defined stride) for overflowing tokens. Please Note, for *text_pair* different than `None` and
-        *truncation_strategy = longest_first* or `True`, it is not possible to return overflowing tokens. Such a
-        combination of arguments will raise an error.
+        placeholder_xpath = "/document/node"
 
-        Node-level `xpaths` are turned into token-level `xpath_tags_seq` and `xpath_subs_seq`. If provided, node-level
-        `node_labels` are turned into token-level `labels`. The node label is used for the first token of the node,
-        while remaining tokens are labeled with -100, such that they will be ignored by the loss function.
+        if isinstance(text, tuple):
+            text = list(text)
+        if text_pair is not None and isinstance(text_pair, tuple):
+            text_pair = list(text_pair)
 
-        Args:
-            text (`str`, `list[str]`, `list[list[str]]`):
-                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
-            text_pair (`list[str]` or `list[int]`, *optional*):
-                Optional second sequence to be encoded. This can be a list of strings (nodes of a single example) or a
-                list of list of strings (nodes of a batch of examples).
-        """
+        nodes_single = text if text_pair is None else text_pair
+        processed_nodes = nodes_single
 
-        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
-        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
-            padding=padding,
-            truncation=truncation,
+        if isinstance(nodes_single, str):
+            processed_nodes = nodes_single.split()
+        elif isinstance(nodes_single, list) and nodes_single and isinstance(nodes_single[0], str):
+            processed_nodes = nodes_single
+
+        if text_pair is None:
+            text = processed_nodes
+        else:
+            text_pair = processed_nodes
+
+        if xpaths is None:
+            length = len(processed_nodes) if hasattr(processed_nodes, "__len__") else 0
+            xpaths = [placeholder_xpath] * length
+
+        # make it a batched input
+        # 2 options:
+        # 1) only text, in case text must be a list of str
+        # 2) text + text_pair, in which case text = str and text_pair a list of str
+        batched_input = [(text, text_pair)] if text_pair else [text]
+        batched_xpaths = [xpaths]
+        batched_node_labels = [node_labels] if node_labels is not None else None
+        batched_output = self._batch_encode_plus(
+            batched_input,
+            is_pair=bool(text_pair is not None),
+            xpaths=batched_xpaths,
+            node_labels=batched_node_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
             max_length=max_length,
+            stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
             verbose=verbose,
             **kwargs,
         )
 
-        tokens = []
-        pair_tokens = []
-        xpath_tags_seq = []
-        xpath_subs_seq = []
-        pair_xpath_tags_seq = []
-        pair_xpath_subs_seq = []
-        labels = []
-
-        if text_pair is None:
-            if node_labels is None:
-                # CASE 1: web page classification (training + inference) + CASE 2: token classification (inference)
-                for word, xpath in zip(text, xpaths):
-                    if len(word) < 1:  # skip empty nodes
-                        continue
-                    word_tokens = self.tokenize(word)
-                    tokens.extend(word_tokens)
-                    xpath_tags_list, xpath_subs_list = self.get_xpath_seq(xpath)
-                    xpath_tags_seq.extend([xpath_tags_list] * len(word_tokens))
-                    xpath_subs_seq.extend([xpath_subs_list] * len(word_tokens))
-            else:
-                # CASE 2: token classification (training)
-                for word, xpath, label in zip(text, xpaths, node_labels):
-                    if len(word) < 1:  # skip empty nodes
-                        continue
-                    word_tokens = self.tokenize(word)
-                    tokens.extend(word_tokens)
-                    xpath_tags_list, xpath_subs_list = self.get_xpath_seq(xpath)
-                    xpath_tags_seq.extend([xpath_tags_list] * len(word_tokens))
-                    xpath_subs_seq.extend([xpath_subs_list] * len(word_tokens))
-                    if self.only_label_first_subword:
-                        # Use the real label id for the first token of the word, and padding ids for the remaining tokens
-                        labels.extend([label] + [self.pad_token_label] * (len(word_tokens) - 1))
-                    else:
-                        labels.extend([label] * len(word_tokens))
-        else:
-            # CASE 3: web page question answering (inference)
-            # text = question
-            # text_pair = nodes
-            tokens = self.tokenize(text)
-            xpath_tags_seq = [self.pad_xpath_tags_seq for _ in range(len(tokens))]
-            xpath_subs_seq = [self.pad_xpath_subs_seq for _ in range(len(tokens))]
-
-            for word, xpath in zip(text_pair, xpaths):
-                if len(word) < 1:  # skip empty nodes
-                    continue
-                word_tokens = self.tokenize(word)
-                pair_tokens.extend(word_tokens)
-                xpath_tags_list, xpath_subs_list = self.get_xpath_seq(xpath)
-                pair_xpath_tags_seq.extend([xpath_tags_list] * len(word_tokens))
-                pair_xpath_subs_seq.extend([xpath_subs_list] * len(word_tokens))
-
-        # Create ids + pair_ids
-        ids = self.convert_tokens_to_ids(tokens)
-        pair_ids = self.convert_tokens_to_ids(pair_tokens) if pair_tokens else None
-
-        if (
-            return_overflowing_tokens
-            and truncation_strategy == TruncationStrategy.LONGEST_FIRST
-            and pair_ids is not None
-        ):
-            raise ValueError(
-                "Not possible to return overflowing tokens for pair of sequences with the "
-                "`longest_first`. Please select another truncation strategy than `longest_first`, "
-                "for instance `only_second` or `only_first`."
-            )
-
-        # Compute the total size of the returned encodings
-        pair = bool(pair_ids is not None)
-        len_ids = len(ids)
-        len_pair_ids = len(pair_ids) if pair else 0
-        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)
-
-        # Truncation: Handle max sequence length
-        overflowing_tokens = []
-        overflowing_xpath_tags_seq = []
-        overflowing_xpath_subs_seq = []
-        overflowing_labels = []
-        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
-            (
-                ids,
-                xpath_tags_seq,
-                xpath_subs_seq,
-                pair_ids,
-                pair_xpath_tags_seq,
-                pair_xpath_subs_seq,
-                labels,
-                overflowing_tokens,
-                overflowing_xpath_tags_seq,
-                overflowing_xpath_subs_seq,
-                overflowing_labels,
-            ) = self.truncate_sequences(
-                ids,
-                xpath_tags_seq=xpath_tags_seq,
-                xpath_subs_seq=xpath_subs_seq,
-                pair_ids=pair_ids,
-                pair_xpath_tags_seq=pair_xpath_tags_seq,
-                pair_xpath_subs_seq=pair_xpath_subs_seq,
-                labels=labels,
-                num_tokens_to_remove=total_len - max_length,
-                truncation_strategy=truncation_strategy,
-                stride=stride,
+        # Return tensor is None, then we can remove the leading batch axis
+        # Overflowing tokens are returned as a batch of output so we keep them in this case
+        if return_tensors is None and not return_overflowing_tokens:
+            batched_output = BatchEncoding(
+                {
+                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
+                    for key, value in batched_output.items()
+                },
+                batched_output.encodings,
             )
 
-        if return_token_type_ids and not add_special_tokens:
-            raise ValueError(
-                "Asking to return token_type_ids while setting add_special_tokens to False "
-                "results in an undefined behavior. Please set add_special_tokens to True or "
-                "set return_token_type_ids to None."
-            )
-
-        # Load from model defaults
-        if return_token_type_ids is None:
-            return_token_type_ids = "token_type_ids" in self.model_input_names
-        if return_attention_mask is None:
-            return_attention_mask = "attention_mask" in self.model_input_names
-
-        encoded_inputs = {}
-
-        if return_overflowing_tokens:
-            encoded_inputs["overflowing_tokens"] = overflowing_tokens
-            encoded_inputs["overflowing_xpath_tags_seq"] = overflowing_xpath_tags_seq
-            encoded_inputs["overflowing_xpath_subs_seq"] = overflowing_xpath_subs_seq
-            encoded_inputs["overflowing_labels"] = overflowing_labels
-            encoded_inputs["num_truncated_tokens"] = total_len - max_length
-
-        # Add special tokens
-        if add_special_tokens:
-            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
-            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
-            xpath_tags_ids = self.build_xpath_tags_with_special_tokens(xpath_tags_seq, pair_xpath_tags_seq)
-            xpath_subs_ids = self.build_xpath_subs_with_special_tokens(xpath_subs_seq, pair_xpath_subs_seq)
-            if labels:
-                labels = [self.pad_token_label] + labels + [self.pad_token_label]
-        else:
-            sequence = ids + pair_ids if pair else ids
-            token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
-            xpath_tags_ids = xpath_tags_seq + pair_xpath_tags_seq if pair else xpath_tags_seq
-            xpath_subs_ids = xpath_subs_seq + pair_xpath_subs_seq if pair else xpath_subs_seq
-
-        # Build output dictionary
-        encoded_inputs["input_ids"] = sequence
-        encoded_inputs["xpath_tags_seq"] = xpath_tags_ids
-        encoded_inputs["xpath_subs_seq"] = xpath_subs_ids
-        if return_token_type_ids:
-            encoded_inputs["token_type_ids"] = token_type_ids
-        if return_special_tokens_mask:
-            if add_special_tokens:
-                encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
-            else:
-                encoded_inputs["special_tokens_mask"] = [0] * len(sequence)
+        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
 
-        if labels:
-            encoded_inputs["labels"] = labels
-
-        # Check lengths
-        self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)
-
-        # Padding
-        if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
-            encoded_inputs = self.pad(
-                encoded_inputs,
-                max_length=max_length,
-                padding=padding_strategy.value,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_attention_mask=return_attention_mask,
-            )
-
-        if return_length:
-            encoded_inputs["length"] = len(encoded_inputs["input_ids"])
-
-        batch_outputs = BatchEncoding(
-            encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
-        )
-
-        return batch_outputs
-
-    def truncate_sequences(
-        self,
-        ids: list[int],
-        xpath_tags_seq: list[list[int]],
-        xpath_subs_seq: list[list[int]],
-        pair_ids: Optional[list[int]] = None,
-        pair_xpath_tags_seq: Optional[list[list[int]]] = None,
-        pair_xpath_subs_seq: Optional[list[list[int]]] = None,
-        labels: Optional[list[int]] = None,
-        num_tokens_to_remove: int = 0,
-        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
-        stride: int = 0,
-    ) -> tuple[list[int], list[int], list[int]]:
-        """
-        Args:
-        Truncates a sequence pair in-place following the strategy.
-            ids (`list[int]`):
-                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and
-                `convert_tokens_to_ids` methods.
-            xpath_tags_seq (`list[list[int]]`):
-                XPath tag IDs of the first sequence.
-            xpath_subs_seq (`list[list[int]]`):
-                XPath sub IDs of the first sequence.
-            pair_ids (`list[int]`, *optional*):
-                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
-                and `convert_tokens_to_ids` methods.
-            pair_xpath_tags_seq (`list[list[int]]`, *optional*):
-                XPath tag IDs of the second sequence.
-            pair_xpath_subs_seq (`list[list[int]]`, *optional*):
-                XPath sub IDs of the second sequence.
-            num_tokens_to_remove (`int`, *optional*, defaults to 0):
-                Number of tokens to remove using the truncation strategy.
-            truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to
-            `False`):
-                The strategy to follow for truncation. Can be:
-                - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will truncate
-                  token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a
-                  batch of pairs) is provided.
-                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater
-                  than the model maximum admissible input size).
-            stride (`int`, *optional*, defaults to 0):
-                If set to a positive number, the overflowing tokens returned will contain some tokens from the main
-                sequence returned. The value of this argument defines the number of additional tokens.
-        Returns:
-            `tuple[list[int], list[int], list[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of
-            overflowing tokens. Note: The *longest_first* strategy returns empty list of overflowing tokens if a pair
-            of sequences (or a batch of pairs) is provided.
-        """
-        if num_tokens_to_remove <= 0:
-            return ids, xpath_tags_seq, xpath_subs_seq, pair_ids, pair_xpath_tags_seq, pair_xpath_subs_seq, [], [], []
-
-        if not isinstance(truncation_strategy, TruncationStrategy):
-            truncation_strategy = TruncationStrategy(truncation_strategy)
-
-        overflowing_tokens = []
-        overflowing_xpath_tags_seq = []
-        overflowing_xpath_subs_seq = []
-        overflowing_labels = []
-        if truncation_strategy == TruncationStrategy.ONLY_FIRST or (
-            truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None
-        ):
-            if len(ids) > num_tokens_to_remove:
-                window_len = min(len(ids), stride + num_tokens_to_remove)
-                overflowing_tokens = ids[-window_len:]
-                overflowing_xpath_tags_seq = xpath_tags_seq[-window_len:]
-                overflowing_xpath_subs_seq = xpath_subs_seq[-window_len:]
-                ids = ids[:-num_tokens_to_remove]
-                xpath_tags_seq = xpath_tags_seq[:-num_tokens_to_remove]
-                xpath_subs_seq = xpath_subs_seq[:-num_tokens_to_remove]
-                labels = labels[:-num_tokens_to_remove]
-            else:
-                error_msg = (
-                    f"We need to remove {num_tokens_to_remove} to truncate the input "
-                    f"but the first sequence has a length {len(ids)}. "
-                )
-                if truncation_strategy == TruncationStrategy.ONLY_FIRST:
-                    error_msg = (
-                        error_msg + "Please select another truncation strategy than "
-                        f"{truncation_strategy}, for instance 'longest_first' or 'only_second'."
-                    )
-                logger.error(error_msg)
-        elif truncation_strategy == TruncationStrategy.LONGEST_FIRST:
-            logger.warning(
-                "Be aware, overflowing tokens are not returned for the setting you have chosen,"
-                f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' "
-                "truncation strategy. So the returned list will always be empty even if some "
-                "tokens have been removed."
-            )
-            for _ in range(num_tokens_to_remove):
-                if pair_ids is None or len(ids) > len(pair_ids):
-                    ids = ids[:-1]
-                    xpath_tags_seq = xpath_tags_seq[:-1]
-                    xpath_subs_seq = xpath_subs_seq[:-1]
-                    labels = labels[:-1]
-                else:
-                    pair_ids = pair_ids[:-1]
-                    pair_xpath_tags_seq = pair_xpath_tags_seq[:-1]
-                    pair_xpath_subs_seq = pair_xpath_subs_seq[:-1]
-        elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
-            if len(pair_ids) > num_tokens_to_remove:
-                window_len = min(len(pair_ids), stride + num_tokens_to_remove)
-                overflowing_tokens = pair_ids[-window_len:]
-                overflowing_xpath_tags_seq = pair_xpath_tags_seq[-window_len:]
-                overflowing_xpath_subs_seq = pair_xpath_subs_seq[-window_len:]
-                pair_ids = pair_ids[:-num_tokens_to_remove]
-                pair_xpath_tags_seq = pair_xpath_tags_seq[:-num_tokens_to_remove]
-                pair_xpath_subs_seq = pair_xpath_subs_seq[:-num_tokens_to_remove]
-            else:
-                logger.error(
-                    f"We need to remove {num_tokens_to_remove} to truncate the input "
-                    f"but the second sequence has a length {len(pair_ids)}. "
-                    f"Please select another truncation strategy than {truncation_strategy}, "
-                    "for instance 'longest_first' or 'only_first'."
-                )
-
-        return (
-            ids,
-            xpath_tags_seq,
-            xpath_subs_seq,
-            pair_ids,
-            pair_xpath_tags_seq,
-            pair_xpath_subs_seq,
-            labels,
-            overflowing_tokens,
-            overflowing_xpath_tags_seq,
-            overflowing_xpath_subs_seq,
-            overflowing_labels,
-        )
+        return batched_output
 
     def _pad(
         self,
@@ -1465,5 +992,57 @@ def _pad(
 
         return encoded_inputs
 
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A RoBERTa sequence has the following format:
+        - single sequence: ` X `
+        - pair of sequences: ` A  B `
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
+        make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `list[int]`: List of zeros.
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + token_ids_1 + sep) * [0]
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+        return tuple(files)
+
+
+MarkupLMTokenizerFast = MarkupLMTokenizer
+
 
-__all__ = ["MarkupLMTokenizer"]
+__all__ = ["MarkupLMTokenizer", "MarkupLMTokenizerFast"]
diff --git a/src/transformers/models/markuplm/tokenization_markuplm_fast.py b/src/transformers/models/markuplm/tokenization_markuplm_fast.py
deleted file mode 100644
index 4033ef319ff8..000000000000
--- a/src/transformers/models/markuplm/tokenization_markuplm_fast.py
+++ /dev/null
@@ -1,929 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Fast tokenization class for MarkupLM. It overwrites 2 methods of the slow tokenizer class, namely _batch_encode_plus
-and _encode_plus, in which the Rust tokenizer is used.
-"""
-
-import json
-from functools import lru_cache
-from typing import Optional, Union
-
-from tokenizers import processors
-
-from ...file_utils import PaddingStrategy, TensorType, add_end_docstrings
-from ...tokenization_utils_base import (
-    ENCODE_KWARGS_DOCSTRING,
-    AddedToken,
-    BatchEncoding,
-    EncodedInput,
-    PreTokenizedInput,
-    TextInput,
-    TextInputPair,
-    TruncationStrategy,
-)
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .tokenization_markuplm import MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING, MarkupLMTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
-
-
-@lru_cache
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
-    characters the bpe code barfs on. The reversible bpe codes work on unicode strings. This means you need a large #
-    of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset
-    you end up needing around 5K for decent coverage. This is a significant percentage of your normal, say, 32K bpe
-    vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-def get_pairs(word):
-    """
-    Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length
-    strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-
-class MarkupLMTokenizerFast(PreTrainedTokenizerFast):
-    r"""
-    Construct a MarkupLM tokenizer. Based on byte-level Byte-Pair-Encoding (BPE).
-
-    [`MarkupLMTokenizerFast`] can be used to turn HTML strings into to token-level `input_ids`, `attention_mask`,
-    `token_type_ids`, `xpath_tags_seq` and `xpath_tags_seq`. This tokenizer inherits from [`PreTrainedTokenizer`] which
-    contains most of the main methods.
-
-    Users should refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        merges_file (`str`):
-            Path to the merges file.
-        errors (`str`, *optional*, defaults to `"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See
-            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
-        bos_token (`str`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        mask_token (`str`, *optional*, defaults to `""`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        add_prefix_space (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word. (RoBERTa tokenizer detect beginning of words by the preceding space).
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = MarkupLMTokenizer
-
-    def __init__(
-        self,
-        vocab_file,
-        merges_file,
-        tags_dict,
-        tokenizer_file=None,
-        errors="replace",
-        bos_token="",
-        eos_token="",
-        sep_token="",
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        mask_token="",
-        add_prefix_space=False,
-        max_depth=50,
-        max_width=1000,
-        pad_width=1001,
-        pad_token_label=-100,
-        only_label_first_subword=True,
-        trim_offsets=False,
-        **kwargs,
-    ):
-        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
-        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
-        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
-        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
-
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
-        super().__init__(
-            vocab_file=vocab_file,
-            merges_file=merges_file,
-            tags_dict=tags_dict,
-            tokenizer_file=tokenizer_file,
-            errors=errors,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            pad_token=pad_token,
-            mask_token=mask_token,
-            add_prefix_space=add_prefix_space,
-            trim_offsets=trim_offsets,
-            max_depth=max_depth,
-            max_width=max_width,
-            pad_width=pad_width,
-            pad_token_label=pad_token_label,
-            only_label_first_subword=only_label_first_subword,
-            **kwargs,
-        )
-        if trim_offsets:
-            # Not implemented yet, because we need to chain two post processors which is not possible yet
-            # We need to wait for https://github.com/huggingface/tokenizers/pull/1005
-            # With `trim_offsets=False` we don't need to do add `processors.ByteLevel(trim_offsets=False)`
-            # because it's not doing anything
-            raise NotImplementedError(
-                "`trim_offsets=True` is not implemented for MarkupLMTokenizerFast. Please set it to False."
-            )
-
-        self.tags_dict = tags_dict
-
-        tokenizer_component = "post_processor"
-        tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
-        if tokenizer_component_instance:
-            state = json.loads(tokenizer_component_instance.__getstate__())
-
-            # The lists 'sep' and 'cls' must be cased in tuples for the object `post_processor_class`
-            if "sep" in state:
-                state["sep"] = tuple(state["sep"])
-            if "cls" in state:
-                state["cls"] = tuple(state["cls"])
-
-            changes_to_apply = False
-
-            if state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-                state["add_prefix_space"] = add_prefix_space
-                changes_to_apply = True
-
-            if changes_to_apply:
-                component_class = getattr(processors, state.pop("type"))
-                new_value = component_class(**state)
-                setattr(self.backend_tokenizer, tokenizer_component, new_value)
-
-        # additional properties
-        self.max_depth = max_depth
-        self.max_width = max_width
-        self.pad_width = pad_width
-        self.unk_tag_id = len(self.tags_dict)
-        self.pad_tag_id = self.unk_tag_id + 1
-        self.pad_xpath_tags_seq = [self.pad_tag_id] * self.max_depth
-        self.pad_xpath_subs_seq = [self.pad_width] * self.max_depth
-        self.pad_token_label = pad_token_label
-        self.only_label_first_subword = only_label_first_subword
-
-    def get_xpath_seq(self, xpath):
-        """
-        Given the xpath expression of one particular node (like "/html/body/div/li[1]/div/span[2]"), return a list of
-        tag IDs and corresponding subscripts, taking into account max depth.
-        """
-        xpath_tags_list = []
-        xpath_subs_list = []
-
-        xpath_units = xpath.split("/")
-        for unit in xpath_units:
-            if not unit.strip():
-                continue
-            name_subs = unit.strip().split("[")
-            tag_name = name_subs[0]
-            sub = 0 if len(name_subs) == 1 else int(name_subs[1][:-1])
-            xpath_tags_list.append(self.tags_dict.get(tag_name, self.unk_tag_id))
-            xpath_subs_list.append(min(self.max_width, sub))
-
-        xpath_tags_list = xpath_tags_list[: self.max_depth]
-        xpath_subs_list = xpath_subs_list[: self.max_depth]
-        xpath_tags_list += [self.pad_tag_id] * (self.max_depth - len(xpath_tags_list))
-        xpath_subs_list += [self.pad_width] * (self.max_depth - len(xpath_subs_list))
-
-        return xpath_tags_list, xpath_subs_list
-
-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    def __call__(
-        self,
-        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
-        text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
-        xpaths: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
-        node_labels: Optional[Union[list[int], list[list[int]]]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        """
-        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
-        sequences with nodes, xpaths and optional labels.
-
-        Args:
-            text (`str`, `list[str]`, `list[list[str]]`):
-                The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
-                (words of a single example or questions of a batch of examples) or a list of list of strings (batch of
-                words).
-            text_pair (`list[str]`, `list[list[str]]`):
-                The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
-                (pretokenized string).
-            xpaths (`list[list[int]]`, `list[list[list[int]]]`):
-                Node-level xpaths. Each bounding box should be normalized to be on a 0-1000 scale.
-            node_labels (`list[int]`, `list[list[int]]`, *optional*):
-                Node-level integer labels (for token classification tasks).
-        """
-
-        # Input type checking for clearer error
-        def _is_valid_text_input(t):
-            if isinstance(t, str):
-                # Strings are fine
-                return True
-            elif isinstance(t, (list, tuple)):
-                # List are fine as long as they are...
-                if len(t) == 0:
-                    # ... empty
-                    return True
-                elif isinstance(t[0], str):
-                    # ... list of strings
-                    return True
-                elif isinstance(t[0], (list, tuple)):
-                    # ... list with an empty list or with a list of strings
-                    return len(t[0]) == 0 or isinstance(t[0][0], str)
-                else:
-                    return False
-            else:
-                return False
-
-        if text_pair is not None:
-            # in case text + text_pair are provided, text = questions, text_pair = nodes
-            if not _is_valid_text_input(text):
-                raise ValueError("text input must of type `str` (single example) or `list[str]` (batch of examples). ")
-            if not isinstance(text_pair, (list, tuple)):
-                raise ValueError(
-                    "Nodes must be of type `list[str]` (single pretokenized example), "
-                    "or `list[list[str]]` (batch of pretokenized examples)."
-                )
-        else:
-            # in case only text is provided => must be nodes
-            if not isinstance(text, (list, tuple)):
-                raise ValueError(
-                    "Nodes must be of type `list[str]` (single pretokenized example), "
-                    "or `list[list[str]]` (batch of pretokenized examples)."
-                )
-
-        if text_pair is not None:
-            is_batched = isinstance(text, (list, tuple))
-        else:
-            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
-
-        nodes = text if text_pair is None else text_pair
-        assert xpaths is not None, "You must provide corresponding xpaths"
-        if is_batched:
-            assert len(nodes) == len(xpaths), "You must provide nodes and xpaths for an equal amount of examples"
-            for nodes_example, xpaths_example in zip(nodes, xpaths):
-                assert len(nodes_example) == len(xpaths_example), "You must provide as many nodes as there are xpaths"
-        else:
-            assert len(nodes) == len(xpaths), "You must provide as many nodes as there are xpaths"
-
-        if is_batched:
-            if text_pair is not None and len(text) != len(text_pair):
-                raise ValueError(
-                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
-                    f" {len(text_pair)}."
-                )
-            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
-            is_pair = bool(text_pair is not None)
-            return self.batch_encode_plus(
-                batch_text_or_text_pairs=batch_text_or_text_pairs,
-                is_pair=is_pair,
-                xpaths=xpaths,
-                node_labels=node_labels,
-                add_special_tokens=add_special_tokens,
-                padding=padding,
-                truncation=truncation,
-                max_length=max_length,
-                stride=stride,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_tensors=return_tensors,
-                return_token_type_ids=return_token_type_ids,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_offsets_mapping=return_offsets_mapping,
-                return_length=return_length,
-                verbose=verbose,
-                **kwargs,
-            )
-        else:
-            return self.encode_plus(
-                text=text,
-                text_pair=text_pair,
-                xpaths=xpaths,
-                node_labels=node_labels,
-                add_special_tokens=add_special_tokens,
-                padding=padding,
-                truncation=truncation,
-                max_length=max_length,
-                stride=stride,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_tensors=return_tensors,
-                return_token_type_ids=return_token_type_ids,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_offsets_mapping=return_offsets_mapping,
-                return_length=return_length,
-                verbose=verbose,
-                **kwargs,
-            )
-
-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    def batch_encode_plus(
-        self,
-        batch_text_or_text_pairs: Union[
-            list[TextInput],
-            list[TextInputPair],
-            list[PreTokenizedInput],
-        ],
-        is_pair: Optional[bool] = None,
-        xpaths: Optional[list[list[list[int]]]] = None,
-        node_labels: Optional[Union[list[int], list[list[int]]]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
-        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            pad_to_multiple_of=pad_to_multiple_of,
-            verbose=verbose,
-            **kwargs,
-        )
-
-        return self._batch_encode_plus(
-            batch_text_or_text_pairs=batch_text_or_text_pairs,
-            is_pair=is_pair,
-            xpaths=xpaths,
-            node_labels=node_labels,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            **kwargs,
-        )
-
-    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
-        batched_input = [(text, pair)] if pair else [text]
-        encodings = self._tokenizer.encode_batch(
-            batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
-        )
-
-        return encodings[0].tokens
-
-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, MARKUPLM_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    def encode_plus(
-        self,
-        text: Union[TextInput, PreTokenizedInput],
-        text_pair: Optional[PreTokenizedInput] = None,
-        xpaths: Optional[list[list[int]]] = None,
-        node_labels: Optional[list[int]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        """
-        Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
-        `__call__` should be used instead.
-
-        Args:
-            text (`str`, `list[str]`, `list[list[str]]`):
-                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
-            text_pair (`list[str]` or `list[int]`, *optional*):
-                Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
-                list of list of strings (words of a batch of examples).
-        """
-
-        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
-        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            pad_to_multiple_of=pad_to_multiple_of,
-            verbose=verbose,
-            **kwargs,
-        )
-
-        return self._encode_plus(
-            text=text,
-            xpaths=xpaths,
-            text_pair=text_pair,
-            node_labels=node_labels,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            **kwargs,
-        )
-
-    def _batch_encode_plus(
-        self,
-        batch_text_or_text_pairs: Union[
-            list[TextInput],
-            list[TextInputPair],
-            list[PreTokenizedInput],
-        ],
-        is_pair: Optional[bool] = None,
-        xpaths: Optional[list[list[list[int]]]] = None,
-        node_labels: Optional[list[list[int]]] = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[str] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-    ) -> BatchEncoding:
-        if not isinstance(batch_text_or_text_pairs, list):
-            raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")
-
-        # Set the truncation and padding strategy and restore the initial configuration
-        self.set_truncation_and_padding(
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-        )
-
-        if is_pair:
-            batch_text_or_text_pairs = [([text], text_pair) for text, text_pair in batch_text_or_text_pairs]
-
-        encodings = self._tokenizer.encode_batch(
-            batch_text_or_text_pairs,
-            add_special_tokens=add_special_tokens,
-            is_pretokenized=True,  # we set this to True as MarkupLM always expects pretokenized inputs
-        )
-
-        # Convert encoding to dict
-        # `Tokens` is a tuple of (list[dict[str, list[list[int]]]] or list[dict[str, 2D-Tensor]],
-        #  list[EncodingFast]) with nested dimensions corresponding to batch, overflows, sequence length
-        tokens_and_encodings = [
-            self._convert_encoding(
-                encoding=encoding,
-                return_token_type_ids=return_token_type_ids,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_offsets_mapping=True
-                if node_labels is not None
-                else return_offsets_mapping,  # we use offsets to create the labels
-                return_length=return_length,
-                verbose=verbose,
-            )
-            for encoding in encodings
-        ]
-
-        # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
-        # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
-        # (we say ~ because the number of overflow varies with the example in the batch)
-        #
-        # To match each overflowing sample with the original sample in the batch
-        # we add an overflow_to_sample_mapping array (see below)
-        sanitized_tokens = {}
-        for key in tokens_and_encodings[0][0]:
-            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
-            sanitized_tokens[key] = stack
-        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
-
-        # If returning overflowing tokens, we need to return a mapping
-        # from the batch idx to the original sample
-        if return_overflowing_tokens:
-            overflow_to_sample_mapping = []
-            for i, (toks, _) in enumerate(tokens_and_encodings):
-                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
-            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping
-
-        for input_ids in sanitized_tokens["input_ids"]:
-            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
-
-        # create the token-level xpaths tags and subscripts
-        xpath_tags_seq = []
-        xpath_subs_seq = []
-        for batch_index in range(len(sanitized_tokens["input_ids"])):
-            if return_overflowing_tokens:
-                original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
-            else:
-                original_index = batch_index
-            xpath_tags_seq_example = []
-            xpath_subs_seq_example = []
-            for id, sequence_id, word_id in zip(
-                sanitized_tokens["input_ids"][batch_index],
-                sanitized_encodings[batch_index].sequence_ids,
-                sanitized_encodings[batch_index].word_ids,
-            ):
-                if word_id is not None:
-                    if is_pair and sequence_id == 0:
-                        xpath_tags_seq_example.append(self.pad_xpath_tags_seq)
-                        xpath_subs_seq_example.append(self.pad_xpath_subs_seq)
-                    else:
-                        xpath_tags_list, xpath_subs_list = self.get_xpath_seq(xpaths[original_index][word_id])
-                        xpath_tags_seq_example.extend([xpath_tags_list])
-                        xpath_subs_seq_example.extend([xpath_subs_list])
-                else:
-                    if id in [self.cls_token_id, self.sep_token_id, self.pad_token_id]:
-                        xpath_tags_seq_example.append(self.pad_xpath_tags_seq)
-                        xpath_subs_seq_example.append(self.pad_xpath_subs_seq)
-                    else:
-                        raise ValueError("Id not recognized")
-            xpath_tags_seq.append(xpath_tags_seq_example)
-            xpath_subs_seq.append(xpath_subs_seq_example)
-
-        sanitized_tokens["xpath_tags_seq"] = xpath_tags_seq
-        sanitized_tokens["xpath_subs_seq"] = xpath_subs_seq
-
-        # optionally, create the labels
-        if node_labels is not None:
-            labels = []
-            for batch_index in range(len(sanitized_tokens["input_ids"])):
-                if return_overflowing_tokens:
-                    original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
-                else:
-                    original_index = batch_index
-                labels_example = []
-                for id, offset, word_id in zip(
-                    sanitized_tokens["input_ids"][batch_index],
-                    sanitized_tokens["offset_mapping"][batch_index],
-                    sanitized_encodings[batch_index].word_ids,
-                ):
-                    if word_id is not None:
-                        if self.only_label_first_subword:
-                            if offset[0] == 0:
-                                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
-                                labels_example.append(node_labels[original_index][word_id])
-                            else:
-                                labels_example.append(self.pad_token_label)
-                        else:
-                            labels_example.append(node_labels[original_index][word_id])
-                    else:
-                        labels_example.append(self.pad_token_label)
-                labels.append(labels_example)
-
-            sanitized_tokens["labels"] = labels
-            # finally, remove offsets if the user didn't want them
-            if not return_offsets_mapping:
-                del sanitized_tokens["offset_mapping"]
-
-        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
-
-    def _encode_plus(
-        self,
-        text: Union[TextInput, PreTokenizedInput],
-        text_pair: Optional[PreTokenizedInput] = None,
-        xpaths: Optional[list[list[int]]] = None,
-        node_labels: Optional[list[int]] = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[bool] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        # make it a batched input
-        # 2 options:
-        # 1) only text, in case text must be a list of str
-        # 2) text + text_pair, in which case text = str and text_pair a list of str
-        batched_input = [(text, text_pair)] if text_pair else [text]
-        batched_xpaths = [xpaths]
-        batched_node_labels = [node_labels] if node_labels is not None else None
-        batched_output = self._batch_encode_plus(
-            batched_input,
-            is_pair=bool(text_pair is not None),
-            xpaths=batched_xpaths,
-            node_labels=batched_node_labels,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            **kwargs,
-        )
-
-        # Return tensor is None, then we can remove the leading batch axis
-        # Overflowing tokens are returned as a batch of output so we keep them in this case
-        if return_tensors is None and not return_overflowing_tokens:
-            batched_output = BatchEncoding(
-                {
-                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
-                    for key, value in batched_output.items()
-                },
-                batched_output.encodings,
-            )
-
-        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
-
-        return batched_output
-
-    def _pad(
-        self,
-        encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
-        max_length: Optional[int] = None,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """
-        Args:
-        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
-            encoded_inputs:
-                Dictionary of tokenized inputs (`list[int]`) or batch of tokenized inputs (`list[list[int]]`).
-            max_length: maximum length of the returned list and optionally padding length (see below).
-                Will truncate by taking into account the special tokens.
-            padding_strategy: PaddingStrategy to use for padding.
-                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
-                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
-                - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
-                    - 'left': pads on the left of the sequences
-                    - 'right': pads on the right of the sequences
-            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
-                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
-                `>= 7.5` (Volta).
-            padding_side:
-                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
-                Default value is picked from the class attribute of the same name.
-            return_attention_mask:
-                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
-        """
-        # Load from model defaults
-        if return_attention_mask is None:
-            return_attention_mask = "attention_mask" in self.model_input_names
-
-        required_input = encoded_inputs[self.model_input_names[0]]
-
-        if padding_strategy == PaddingStrategy.LONGEST:
-            max_length = len(required_input)
-
-        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
-            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
-
-        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-
-        # Initialize attention mask if not present.
-        if return_attention_mask and "attention_mask" not in encoded_inputs:
-            encoded_inputs["attention_mask"] = [1] * len(required_input)
-
-        if needs_to_be_padded:
-            difference = max_length - len(required_input)
-            padding_side = padding_side if padding_side is not None else self.padding_side
-            if padding_side == "right":
-                if return_attention_mask:
-                    encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
-                if "token_type_ids" in encoded_inputs:
-                    encoded_inputs["token_type_ids"] = (
-                        encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
-                    )
-                if "xpath_tags_seq" in encoded_inputs:
-                    encoded_inputs["xpath_tags_seq"] = (
-                        encoded_inputs["xpath_tags_seq"] + [self.pad_xpath_tags_seq] * difference
-                    )
-                if "xpath_subs_seq" in encoded_inputs:
-                    encoded_inputs["xpath_subs_seq"] = (
-                        encoded_inputs["xpath_subs_seq"] + [self.pad_xpath_subs_seq] * difference
-                    )
-                if "labels" in encoded_inputs:
-                    encoded_inputs["labels"] = encoded_inputs["labels"] + [self.pad_token_label] * difference
-                if "special_tokens_mask" in encoded_inputs:
-                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
-                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
-            elif padding_side == "left":
-                if return_attention_mask:
-                    encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
-                if "token_type_ids" in encoded_inputs:
-                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
-                        "token_type_ids"
-                    ]
-                if "xpath_tags_seq" in encoded_inputs:
-                    encoded_inputs["xpath_tags_seq"] = [self.pad_xpath_tags_seq] * difference + encoded_inputs[
-                        "xpath_tags_seq"
-                    ]
-                if "xpath_subs_seq" in encoded_inputs:
-                    encoded_inputs["xpath_subs_seq"] = [self.pad_xpath_subs_seq] * difference + encoded_inputs[
-                        "xpath_subs_seq"
-                    ]
-                if "labels" in encoded_inputs:
-                    encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["labels"]
-                if "special_tokens_mask" in encoded_inputs:
-                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
-                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
-            else:
-                raise ValueError("Invalid padding strategy:" + str(padding_side))
-
-        return encoded_inputs
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A RoBERTa sequence has the following format:
-        - single sequence: ` X `
-        - pair of sequences: ` A  B `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
-        make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-        Returns:
-            `list[int]`: List of zeros.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + token_ids_1 + sep) * [0]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-
-__all__ = ["MarkupLMTokenizerFast"]
diff --git a/src/transformers/models/mbart/__init__.py b/src/transformers/models/mbart/__init__.py
index 0b90185d2cbe..c55beabfc2cc 100644
--- a/src/transformers/models/mbart/__init__.py
+++ b/src/transformers/models/mbart/__init__.py
@@ -21,7 +21,6 @@
     from .configuration_mbart import *
     from .modeling_mbart import *
     from .tokenization_mbart import *
-    from .tokenization_mbart_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py
index 852357c9a266..d6753ee8fe41 100644
--- a/src/transformers/models/mbart/tokenization_mbart.py
+++ b/src/transformers/models/mbart/tokenization_mbart.py
@@ -13,34 +13,32 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-from shutil import copyfile
-from typing import Any, Optional
+from typing import Optional
 
-import sentencepiece as spm
+from tokenizers import Tokenizer, decoders, pre_tokenizers, processors
+from tokenizers.models import Unigram
 
-from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
+from ...tokenization_python import AddedToken
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
-from ...utils.import_utils import requires
 
 
 logger = logging.get_logger(__name__)
 
-SPIECE_UNDERLINE = "▁"
 
-VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
 
 
 FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN"]  # fmt: skip
 
 
-@requires(backends=("sentencepiece",))
-class MBartTokenizer(PreTrainedTokenizer):
+class MBartTokenizer(TokenizersBackend):
     """
-    Construct an MBART tokenizer.
+    Construct an MBART tokenizer (backed by HuggingFace's *tokenizers* library). Based on
+    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
 
-    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
-    [SentencePiece](https://github.com/google/sentencepiece).
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     The tokenization method is `  ` for source language documents, and `
      ` for target language documents.
@@ -50,7 +48,9 @@ class MBartTokenizer(PreTrainedTokenizer):
     ```python
     >>> from transformers import MBartTokenizer
 
-    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
+    >>> tokenizer = MBartTokenizer.from_pretrained(
+    ...     "facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO"
+    ... )
     >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
     >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
     >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
@@ -58,13 +58,13 @@ class MBartTokenizer(PreTrainedTokenizer):
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
 
     prefix_tokens: list[int] = []
     suffix_tokens: list[int] = []
 
     def __init__(
         self,
-        vocab_file,
         bos_token="",
         eos_token="",
         sep_token="",
@@ -72,94 +72,122 @@ def __init__(
         unk_token="",
         pad_token="",
         mask_token="",
-        tokenizer_file=None,
         src_lang=None,
         tgt_lang=None,
-        sp_model_kwargs: Optional[dict[str, Any]] = None,
         additional_special_tokens=None,
+        vocab=None,
+        merges=None,  # Ignored for Unigram
+        vocab_file=None,
         **kwargs,
     ):
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = (
-            AddedToken(mask_token, lstrip=True, normalized=False) if isinstance(mask_token, str) else mask_token
-        )
-
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
 
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(str(vocab_file))
-        self.vocab_file = vocab_file
+        _additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy()
+        if additional_special_tokens is not None:
+            _additional_special_tokens.extend(
+                [t for t in additional_special_tokens if t not in _additional_special_tokens]
+            )
 
-        # Original fairseq vocab and spm vocab must be "aligned":
-        # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
-        # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
-        # fairseq  | ''   | '' | '' | '' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
-        # spm      | '' | ''   | '' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'
+        # MBart uses fairseq vocab alignment: =0, =1, =2, =3, then SPM pieces[3:], lang codes, 
+        if vocab is not None:
+            # Handle different vocab formats (dict, list of tokens, or list of tuples)
+            # SentencePieceExtractor returns list[tuple[str, float]] which is the expected format
+            if isinstance(vocab, dict):
+                vocab = [(token, 0.0) for token in vocab.keys()]
+            elif isinstance(vocab, list) and len(vocab) > 0:
+                if not isinstance(vocab[0], tuple):
+                    vocab = [(token, 0.0) for token in vocab]
+                else:
+                    # Ensure tuples are (str, float) format
+                    vocab = [(str(item[0]), float(item[1])) for item in vocab]
+
+            # Reorder to fairseq: , , , , ... (rest of vocab from SPM[3:])
+            vocab_list = []
+            vocab_list.append((str(bos_token), 0.0))
+            vocab_list.append((str(pad_token), 0.0))
+            vocab_list.append((str(eos_token), 0.0))
+            vocab_list.append((str(unk_token), 0.0))
+
+            # Add the rest of the SentencePiece vocab (skipping first 3: , , )
+            vocab_list.extend(vocab[4:])
+
+            # Add language codes
+            for lang_code in FAIRSEQ_LANGUAGE_CODES:
+                vocab_list.append((str(lang_code), 0.0))
+
+            # Add mask token
+            vocab_list.append((str(mask_token), 0.0))
+
+            self._vocab_scores = vocab_list
+        else:
+            self._vocab_scores = [
+                (str(bos_token), 0.0),
+                (str(pad_token), 0.0),
+                (str(eos_token), 0.0),
+                (str(unk_token), 0.0),
+                ("▁", -2.0),
+            ]
+            for lang_code in FAIRSEQ_LANGUAGE_CODES:
+                self._vocab_scores.append((lang_code, 0.0))
+            self._vocab_scores.append((str(mask_token), 0.0))
+
+        self._tokenizer = Tokenizer(
+            Unigram(
+                self._vocab_scores,
+                unk_id=3,
+                byte_fallback=False,
+            )
+        )
 
-        # Mimic fairseq token-to-id alignment for the first 4 token
-        self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3}
+        self._tokenizer.normalizer = None
 
-        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
-        self.fairseq_offset = 1
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.WhitespaceSplit(),
+                pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True),
+            ]
+        )
 
-        self.sp_model_size = len(self.sp_model)
-        self.lang_code_to_id = {
-            code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES)
-        }
-        self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
-        self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
+        self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)
 
-        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
-        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
-        _additional_special_tokens = list(self.lang_code_to_id.keys())
-
-        if additional_special_tokens is not None:
-            # Only add those special tokens if they are not already there.
-            _additional_special_tokens.extend(
-                [t for t in additional_special_tokens if t not in _additional_special_tokens]
-            )
+        tokenizer_object = self._tokenizer
 
         super().__init__(
+            tokenizer_object=tokenizer_object,
             bos_token=bos_token,
             eos_token=eos_token,
-            unk_token=unk_token,
             sep_token=sep_token,
             cls_token=cls_token,
+            unk_token=unk_token,
             pad_token=pad_token,
             mask_token=mask_token,
-            tokenizer_file=None,
             src_lang=src_lang,
             tgt_lang=tgt_lang,
             additional_special_tokens=_additional_special_tokens,
-            sp_model_kwargs=self.sp_model_kwargs,
             **kwargs,
         )
 
+        self.lang_code_to_id = {
+            lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
+        }
+        self.fairseq_offset = 1
+
+        # Build fairseq token mappings for backward compatibility
+        self.fairseq_tokens_to_ids = {
+            "": 0,
+            "": 1,
+            "": 2,
+            "": 3,
+        }
+        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
+        self.fairseq_tokens_to_ids[""] = self.convert_tokens_to_ids(str(mask_token))
+        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+
         self._src_lang = src_lang if src_lang is not None else "en_XX"
-        self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
+        self.cur_lang_code = self.convert_tokens_to_ids(self._src_lang)
         self.tgt_lang = tgt_lang
         self.set_src_lang_special_tokens(self._src_lang)
 
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__ = d
-
-        # for backward compatibility
-        if not hasattr(self, "sp_model_kwargs"):
-            self.sp_model_kwargs = {}
-
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
-
-    @property
-    def vocab_size(self):
-        return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1  # Plus 1 for the mask token
-
     @property
     def src_lang(self) -> str:
         return self._src_lang
@@ -169,88 +197,6 @@ def src_lang(self, new_src_lang: str) -> None:
         self._src_lang = new_src_lang
         self.set_src_lang_special_tokens(self._src_lang)
 
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        prefix_ones = [1] * len(self.prefix_tokens)
-        suffix_ones = [1] * len(self.suffix_tokens)
-        if token_ids_1 is None:
-            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
-        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An MBART sequence has the following format, where `X` represents the sequence:
-
-        - `input_ids` (for encoder) `X [eos, src_lang_code]`
-        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
-
-        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
-        separator.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
-        # We don't expect to process pairs, but leave the pair logic for API consistency
-        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. mBART does not
-        make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-
-        """
-
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
     def _build_translation_inputs(
         self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
     ):
@@ -263,80 +209,43 @@ def _build_translation_inputs(
         inputs["forced_bos_token_id"] = tgt_lang_id
         return inputs
 
-    def get_vocab(self):
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def _tokenize(self, text: str) -> list[str]:
-        return self.sp_model.encode(text, out_type=str)
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        if token in self.fairseq_tokens_to_ids:
-            return self.fairseq_tokens_to_ids[token]
-        spm_id = self.sp_model.PieceToId(token)
-
-        # Need to return unknown token if the SP model returned 0
-        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        if index in self.fairseq_ids_to_tokens:
-            return self.fairseq_ids_to_tokens[index]
-        return self.sp_model.IdToPiece(index - self.fairseq_offset)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (strings for sub-words) in a single string."""
-        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
-        return out_string
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (out_vocab_file,)
-
-    def prepare_seq2seq_batch(
-        self,
-        src_texts: list[str],
-        src_lang: str = "en_XX",
-        tgt_texts: Optional[list[str]] = None,
-        tgt_lang: str = "ro_RO",
-        **kwargs,
-    ) -> BatchEncoding:
-        self.src_lang = src_lang
-        self.tgt_lang = tgt_lang
-        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
-
     def _switch_to_input_mode(self):
         return self.set_src_lang_special_tokens(self.src_lang)
 
     def _switch_to_target_mode(self):
+        if self.tgt_lang is None:
+            self.tgt_lang = self._src_lang
         return self.set_tgt_lang_special_tokens(self.tgt_lang)
 
     def set_src_lang_special_tokens(self, src_lang) -> None:
         """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
-        self.cur_lang_code = self.lang_code_to_id[src_lang]
+        self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
         self.prefix_tokens = []
         self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
 
+        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
+        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
+            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
+            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
+        )
+
     def set_tgt_lang_special_tokens(self, lang: str) -> None:
         """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
-        self.cur_lang_code = self.lang_code_to_id[lang]
+        self.cur_lang_code = self.convert_tokens_to_ids(lang)
         self.prefix_tokens = []
         self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
 
+        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
+        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
+            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
+            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
+        )
+
 
 __all__ = ["MBartTokenizer"]
diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py
deleted file mode 100644
index 7cf4d468c7df..000000000000
--- a/src/transformers/models/mbart/tokenization_mbart_fast.py
+++ /dev/null
@@ -1,269 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-from shutil import copyfile
-from typing import Optional
-
-from tokenizers import processors
-
-from ...tokenization_utils import AddedToken, BatchEncoding
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import is_sentencepiece_available, logging
-
-
-if is_sentencepiece_available():
-    from .tokenization_mbart import MBartTokenizer
-else:
-    MBartTokenizer = None
-
-
-logger = logging.get_logger(__name__)
-
-
-VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
-
-
-FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN"]  # fmt: skip
-
-
-class MBartTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" MBART tokenizer (backed by HuggingFace's *tokenizers* library). Based on
-    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    The tokenization method is `  ` for source language documents, and `
-     ` for target language documents.
-
-    Examples:
-
-    ```python
-    >>> from transformers import MBartTokenizerFast
-
-    >>> tokenizer = MBartTokenizerFast.from_pretrained(
-    ...     "facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO"
-    ... )
-    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
-    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
-    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
-    ```"""
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = MBartTokenizer
-
-    prefix_tokens: list[int] = []
-    suffix_tokens: list[int] = []
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        bos_token="",
-        eos_token="",
-        sep_token="",
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        mask_token="",
-        src_lang=None,
-        tgt_lang=None,
-        additional_special_tokens=None,
-        **kwargs,
-    ):
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
-        _additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy()
-
-        if additional_special_tokens is not None:
-            # Only add those special tokens if they are not already there.
-            _additional_special_tokens.extend(
-                [t for t in additional_special_tokens if t not in _additional_special_tokens]
-            )
-
-        super().__init__(
-            vocab_file=vocab_file,
-            tokenizer_file=tokenizer_file,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            mask_token=mask_token,
-            src_lang=src_lang,
-            tgt_lang=tgt_lang,
-            additional_special_tokens=_additional_special_tokens,
-            **kwargs,
-        )
-
-        self.vocab_file = vocab_file
-        self.lang_code_to_id = {
-            lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
-        }
-
-        self._src_lang = src_lang if src_lang is not None else "en_XX"
-        self.cur_lang_code = self.convert_tokens_to_ids(self._src_lang)
-        self.tgt_lang = tgt_lang
-        self.set_src_lang_special_tokens(self._src_lang)
-
-    @property
-    def src_lang(self) -> str:
-        return self._src_lang
-
-    @src_lang.setter
-    def src_lang(self, new_src_lang: str) -> None:
-        self._src_lang = new_src_lang
-        self.set_src_lang_special_tokens(self._src_lang)
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. The special tokens depend on calling set_lang.
-
-        An MBART sequence has the following format, where `X` represents the sequence:
-
-        - `input_ids` (for encoder) `X [eos, src_lang_code]`
-        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
-
-        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
-        separator.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
-        # We don't expect to process pairs, but leave the pair logic for API consistency
-        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. mBART does not
-        make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-
-        """
-
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-    def _build_translation_inputs(
-        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
-    ):
-        """Used by translation pipeline, to prepare inputs for the generate function"""
-        if src_lang is None or tgt_lang is None:
-            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
-        self.src_lang = src_lang
-        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
-        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
-        inputs["forced_bos_token_id"] = tgt_lang_id
-        return inputs
-
-    def prepare_seq2seq_batch(
-        self,
-        src_texts: list[str],
-        src_lang: str = "en_XX",
-        tgt_texts: Optional[list[str]] = None,
-        tgt_lang: str = "ro_RO",
-        **kwargs,
-    ) -> BatchEncoding:
-        self.src_lang = src_lang
-        self.tgt_lang = tgt_lang
-        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
-
-    def _switch_to_input_mode(self):
-        return self.set_src_lang_special_tokens(self.src_lang)
-
-    def _switch_to_target_mode(self):
-        return self.set_tgt_lang_special_tokens(self.tgt_lang)
-
-    def set_src_lang_special_tokens(self, src_lang) -> None:
-        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
-        self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
-        self.prefix_tokens = []
-        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
-
-        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
-        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
-
-        self._tokenizer.post_processor = processors.TemplateProcessing(
-            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
-            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
-            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
-        )
-
-    def set_tgt_lang_special_tokens(self, lang: str) -> None:
-        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
-        self.cur_lang_code = self.convert_tokens_to_ids(lang)
-        self.prefix_tokens = []
-        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
-
-        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
-        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
-
-        self._tokenizer.post_processor = processors.TemplateProcessing(
-            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
-            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
-            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
-        )
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not self.can_save_slow_tokenizer:
-            raise ValueError(
-                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
-                "tokenizer."
-            )
-
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
-
-
-__all__ = ["MBartTokenizerFast"]
diff --git a/src/transformers/models/mbart50/__init__.py b/src/transformers/models/mbart50/__init__.py
index f7cd8c28da63..e66676802277 100644
--- a/src/transformers/models/mbart50/__init__.py
+++ b/src/transformers/models/mbart50/__init__.py
@@ -19,7 +19,6 @@
 
 if TYPE_CHECKING:
     from .tokenization_mbart50 import *
-    from .tokenization_mbart50_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/mbart50/tokenization_mbart50.py b/src/transformers/models/mbart50/tokenization_mbart50.py
index 413beaa03a83..6883bfb89b67 100644
--- a/src/transformers/models/mbart50/tokenization_mbart50.py
+++ b/src/transformers/models/mbart50/tokenization_mbart50.py
@@ -13,37 +13,35 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-from shutil import copyfile
-from typing import Any, Optional
+from typing import Optional
 
-import sentencepiece as spm
+from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
+from tokenizers.models import Unigram
 
-from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
+from ...tokenization_python import AddedToken, BatchEncoding
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
-from ...utils.import_utils import requires
 
 
 logger = logging.get_logger(__name__)
 
-SPIECE_UNDERLINE = "▁"
 
-VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
 
 
 FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN", "af_ZA", "az_AZ", "bn_IN", "fa_IR", "he_IL", "hr_HR", "id_ID", "ka_GE", "km_KH", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "pl_PL", "ps_AF", "pt_XX", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "uk_UA", "ur_PK", "xh_ZA", "gl_ES", "sl_SI"]  # fmt: skip
 
 
-@requires(backends=("sentencepiece",))
-class MBart50Tokenizer(PreTrainedTokenizer):
+class MBart50Tokenizer(TokenizersBackend):
     """
-    Construct a MBart50 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
+    Construct a MBart50 tokenizer (backed by HuggingFace's *tokenizers* library). Based on
+    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
 
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
+        vocab_file (`str`, *optional*):
             Path to the vocabulary file.
         src_lang (`str`, *optional*):
             A string representing the source language.
@@ -66,21 +64,6 @@ class MBart50Tokenizer(PreTrainedTokenizer):
         mask_token (`str`, *optional*, defaults to `""`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        sp_model_kwargs (`dict`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
-            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
-            to set:
-
-            - `enable_sampling`: Enable subword regularization.
-            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
-
-              - `nbest_size = {0,1}`: No sampling is performed.
-              - `nbest_size > 1`: samples from the nbest_size results.
-              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
-                using forward-filtering-and-backward-sampling algorithm.
-
-            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
-              BPE-dropout.
 
     Examples:
 
@@ -96,13 +79,13 @@ class MBart50Tokenizer(PreTrainedTokenizer):
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
 
     prefix_tokens: list[int] = []
     suffix_tokens: list[int] = []
 
     def __init__(
         self,
-        vocab_file,
         src_lang=None,
         tgt_lang=None,
         eos_token="",
@@ -111,66 +94,157 @@ def __init__(
         unk_token="",
         pad_token="",
         mask_token="",
-        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        vocab=None,
+        merges=None,  # Ignored for Unigram
+        vocab_file=None,
         **kwargs,
-    ) -> None:
-        # Mask token behave like a normal word, i.e. include the space before it
+    ):
         mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
 
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        self.vocab_file = vocab_file
 
-        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
-        kwargs["additional_special_tokens"] += [
-            code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
-        ]
+        # Do not pass language codes via extra_special_tokens to super().__init__.
+        # We will mark them as special AFTER backend construction to avoid re-adding tokens
+        # when loading from pretrained files.
 
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(str(vocab_file))
-        self.vocab_file = vocab_file
+        # Always construct a tokenizer_object without referencing external tokenizer files
+        if vocab is not None:
+            # MBart50 uses fairseq vocab alignment matching MBart50Converter:
+            # =0, =1, =2, =3, then tokens, lang codes, 
 
-        # Original fairseq vocab and spm vocab must be "aligned":
-        # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
-        # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
-        # fairseq  | ''   | '' | '' | '' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
-        # spm      | '' | ''   | '' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'
+            vocab = [(str(item[0]), float(item[1])) for item in vocab]
 
-        # Mimic fairseq token-to-id alignment for the first 4 token
-        self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3}
+            vocab_tokens = [item[0] for item in vocab]
+            has_language_codes = any(lang_code in vocab_tokens for lang_code in FAIRSEQ_LANGUAGE_CODES)
 
-        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
-        self.fairseq_offset = 1
+            if has_language_codes:
+                self._vocab_scores = vocab
+            else:
+                # Vocab from SentencePieceExtractor is in sentencepiece format:
+                # =0, =1, =2, then tokens
+                # We need to reorder to fairseq format: =0, =1, =2, =3, then tokens
+
+                # Reorder: fairseq expects , , , , then rest of vocab starting from index 3
+                vocab_list = [
+                    (str(cls_token), 0.0),  # 0: 
+                    (str(pad_token), 0.0),  # 1: 
+                    (str(eos_token), 0.0),  # 2: 
+                    (str(unk_token), 0.0),  # 3: 
+                ]
+                # Add remaining tokens from position 3 onwards (skip , ,  from sentencepiece)
+                vocab_list.extend(vocab[3:])
+
+                # Add language codes
+                for lang_code in FAIRSEQ_LANGUAGE_CODES:
+                    vocab_list.append((str(lang_code), 0.0))
+
+                # Add mask token
+                vocab_list.append((str(mask_token), 0.0))
+
+                self._vocab_scores = vocab_list
+        else:
+            # Minimal fallback: small vocab with specials and language codes
+            self._vocab_scores = [
+                (str(cls_token), 0.0),
+                (str(pad_token), 0.0),
+                (str(eos_token), 0.0),
+                (str(unk_token), 0.0),
+                ("▁", -2.0),
+            ]
+            for lang_code in FAIRSEQ_LANGUAGE_CODES:
+                self._vocab_scores.append((lang_code, 0.0))
+            self._vocab_scores.append((str(mask_token), 0.0))
+
+        # Build backend tokenizer from self._vocab_scores (both branches above set it)
+        self._tokenizer = Tokenizer(
+            Unigram(
+                self._vocab_scores,
+                unk_id=3,
+                byte_fallback=False,
+            )
+        )
 
-        self.sp_model_size = len(self.sp_model)
-        self.lang_code_to_id = {
-            code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES)
-        }
-        self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
-        self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
+        # Set normalizer equivalent to Precompiled + Strip + Replace from tokenizer.json
+        # When loading from pretrained, this will be overridden by the tokenizer.json config
+        # When creating from extractor (vocab), this provides equivalent behavior
+        self._tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.Replace(Regex(r"[\n\r\t]"), " "),  # Precompiled converts newlines/tabs to spaces
+                normalizers.NFKC(),  # Precompiled does NFKC normalization
+                normalizers.Strip(left=False, right=True),  # Strip trailing whitespace (matches tokenizer.json)
+                normalizers.Replace(
+                    Regex(r" {2,}"), "▁"
+                ),  # Replace multiple spaces with underscore (matches tokenizer.json)
+            ]
+        )
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True)
 
-        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
-        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+        self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)
 
         super().__init__(
+            tokenizer_object=self._tokenizer,
             src_lang=src_lang,
             tgt_lang=tgt_lang,
             eos_token=eos_token,
-            unk_token=unk_token,
             sep_token=sep_token,
             cls_token=cls_token,
+            unk_token=unk_token,
             pad_token=pad_token,
             mask_token=mask_token,
-            sp_model_kwargs=self.sp_model_kwargs,
             **kwargs,
         )
 
+        self.fairseq_offset = 1
+
+        # Mark language codes as extra special tokens without re-adding them to the backend.
+        # Merge with any pre-existing extra_special_tokens (e.g., restored from config on load).
+        try:
+            lang_tokens = [AddedToken(code, special=True) for code in FAIRSEQ_LANGUAGE_CODES]
+        except Exception:
+            lang_tokens = list(FAIRSEQ_LANGUAGE_CODES)
+        existing_extra = getattr(self, "_extra_special_tokens", []) or []
+        # Preserve order: keep existing, append missing language codes
+        existing_strs = {str(t) for t in existing_extra}
+        merged_extra = list(existing_extra) + [t for t in lang_tokens if str(t) not in existing_strs]
+        self._extra_special_tokens = merged_extra
+
         self._src_lang = src_lang if src_lang is not None else "en_XX"
-        self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
         self.tgt_lang = tgt_lang
+
+        # Build language code mappings and fairseq mappings
+        # This will be called again in _post_init after tokenizer.json is loaded
+        self._build_language_code_mappings()
+
+        self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
         self.set_src_lang_special_tokens(self._src_lang)
 
-    @property
-    def vocab_size(self) -> int:
-        return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1  # Plus 1 for the mask token
+    def _build_language_code_mappings(self):
+        """Build language code to ID mappings and fairseq compatibility mappings."""
+        self.lang_code_to_id = {
+            lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
+        }
+        self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
+
+        # Build fairseq token mappings for backward compatibility
+        self.fairseq_tokens_to_ids = {
+            "": 0,
+            "": 1,
+            "": 2,
+            "": 3,
+        }
+        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
+        mask_token = getattr(self, "mask_token", "")
+        self.fairseq_tokens_to_ids[""] = self.convert_tokens_to_ids(str(mask_token))
+        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+
+    def _post_init(self):
+        """Called after tokenizer.json is loaded in from_pretrained."""
+        # Rebuild language code mappings with the loaded tokenizer
+        self._build_language_code_mappings()
+        # Update cur_lang_code_id with the correct ID
+        if hasattr(self, "_src_lang"):
+            self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
+            self.set_src_lang_special_tokens(self._src_lang)
 
     @property
     def src_lang(self) -> str:
@@ -181,150 +255,6 @@ def src_lang(self, new_src_lang: str) -> None:
         self._src_lang = new_src_lang
         self.set_src_lang_special_tokens(self._src_lang)
 
-    def __getstate__(self) -> dict:
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        return state
-
-    def __setstate__(self, d: dict) -> None:
-        self.__dict__ = d
-
-        # for backward compatibility
-        if not hasattr(self, "sp_model_kwargs"):
-            self.sp_model_kwargs = {}
-
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(self.vocab_file)
-
-    def get_vocab(self) -> dict:
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def _tokenize(self, text: str) -> list[str]:
-        return self.sp_model.encode(text, out_type=str)
-
-    def _convert_token_to_id(self, token: str) -> int:
-        """Converts a token (str) in an id using the vocab."""
-        if token in self.fairseq_tokens_to_ids:
-            return self.fairseq_tokens_to_ids[token]
-        spm_id = self.sp_model.PieceToId(token)
-
-        # Need to return unknown token if the SP model returned 0
-        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
-
-    def _convert_id_to_token(self, index: int) -> str:
-        """Converts an index (integer) in a token (str) using the vocab."""
-        if index in self.fairseq_ids_to_tokens:
-            return self.fairseq_ids_to_tokens[index]
-        return self.sp_model.IdToPiece(index - self.fairseq_offset)
-
-    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        current_sub_tokens = []
-        out_string = ""
-        prev_is_special = False
-        for token in tokens:
-            # make sure that special tokens are not decoded using sentencepiece model
-            if token in self.all_special_tokens:
-                if not prev_is_special:
-                    out_string += " "
-                out_string += self.sp_model.decode(current_sub_tokens) + token
-                prev_is_special = True
-                current_sub_tokens = []
-            else:
-                current_sub_tokens.append(token)
-                prev_is_special = False
-        out_string += self.sp_model.decode(current_sub_tokens)
-        return out_string.strip()
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (out_vocab_file,)
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        prefix_ones = [1] * len(self.prefix_tokens)
-        suffix_ones = [1] * len(self.suffix_tokens)
-        if token_ids_1 is None:
-            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
-        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An MBART-50 sequence has the following format, where `X` represents the sequence:
-
-        - `input_ids` (for encoder) `[src_lang_code] X [eos]`
-        - `labels`: (for decoder) `[tgt_lang_code] X [eos]`
-
-        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
-        separator.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
-        # We don't expect to process pairs, but leave the pair logic for API consistency
-        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
-
-    def _build_translation_inputs(
-        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
-    ):
-        """Used by translation pipeline, to prepare inputs for the generate function"""
-        if src_lang is None or tgt_lang is None:
-            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
-        self.src_lang = src_lang
-        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
-        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
-        inputs["forced_bos_token_id"] = tgt_lang_id
-        return inputs
-
     def prepare_seq2seq_batch(
         self,
         src_texts: list[str],
@@ -341,19 +271,54 @@ def _switch_to_input_mode(self):
         return self.set_src_lang_special_tokens(self.src_lang)
 
     def _switch_to_target_mode(self):
+        if self.tgt_lang is None:
+            self.tgt_lang = self._src_lang
         return self.set_tgt_lang_special_tokens(self.tgt_lang)
 
     def set_src_lang_special_tokens(self, src_lang: str) -> None:
         """Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos]."""
-        self.cur_lang_code_id = self.lang_code_to_id[src_lang]
+        self.cur_lang_code_id = self.convert_tokens_to_ids(src_lang)
         self.prefix_tokens = [self.cur_lang_code_id]
         self.suffix_tokens = [self.eos_token_id]
 
+        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
+        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
+            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
+            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
+        )
+
     def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None:
         """Reset the special tokens to the target language setting. prefix=[tgt_lang_code] and suffix=[eos]."""
-        self.cur_lang_code_id = self.lang_code_to_id[tgt_lang]
+        self.cur_lang_code_id = self.convert_tokens_to_ids(tgt_lang)
         self.prefix_tokens = [self.cur_lang_code_id]
         self.suffix_tokens = [self.eos_token_id]
 
+        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
+        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
+            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
+            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
+        )
+
+    def _build_translation_inputs(
+        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
+    ):
+        """Used by translation pipeline, to prepare inputs for the generate function"""
+        if src_lang is None or tgt_lang is None:
+            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
+        self.src_lang = src_lang
+        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
+        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
+        inputs["forced_bos_token_id"] = tgt_lang_id
+        return inputs
+
 
 __all__ = ["MBart50Tokenizer"]
+
+# Backward alias
+MBart50TokenizerFast = MBart50Tokenizer
diff --git a/src/transformers/models/mbart50/tokenization_mbart50_fast.py b/src/transformers/models/mbart50/tokenization_mbart50_fast.py
deleted file mode 100644
index 985b0929f87c..000000000000
--- a/src/transformers/models/mbart50/tokenization_mbart50_fast.py
+++ /dev/null
@@ -1,258 +0,0 @@
-# coding=utf-8
-# Copyright 2021 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-from shutil import copyfile
-from typing import Optional
-
-from tokenizers import processors
-
-from ...tokenization_utils import AddedToken, BatchEncoding
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import is_sentencepiece_available, logging
-
-
-if is_sentencepiece_available():
-    from .tokenization_mbart50 import MBart50Tokenizer
-else:
-    MBart50Tokenizer = None
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
-
-
-FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN", "af_ZA", "az_AZ", "bn_IN", "fa_IR", "he_IL", "hr_HR", "id_ID", "ka_GE", "km_KH", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "pl_PL", "ps_AF", "pt_XX", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "uk_UA", "ur_PK", "xh_ZA", "gl_ES", "sl_SI"]  # fmt: skip
-
-
-class MBart50TokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" MBART tokenizer for mBART-50 (backed by HuggingFace's *tokenizers* library). Based on
-    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        src_lang (`str`, *optional*):
-            A string representing the source language.
-        tgt_lang (`str`, *optional*):
-            A string representing the target language.
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-        sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        mask_token (`str`, *optional*, defaults to `""`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-
-    Examples:
-
-    ```python
-    >>> from transformers import MBart50TokenizerFast
-
-    >>> tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
-    >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
-    >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
-    >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
-    >>> # model(**model_inputs) should work
-    ```"""
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = MBart50Tokenizer
-
-    prefix_tokens: list[int] = []
-    suffix_tokens: list[int] = []
-
-    def __init__(
-        self,
-        vocab_file=None,
-        src_lang=None,
-        tgt_lang=None,
-        tokenizer_file=None,
-        eos_token="",
-        sep_token="",
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        mask_token="",
-        **kwargs,
-    ):
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
-        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
-        kwargs["additional_special_tokens"] += [
-            code for code in FAIRSEQ_LANGUAGE_CODES if code not in kwargs["additional_special_tokens"]
-        ]
-
-        super().__init__(
-            vocab_file,
-            src_lang=src_lang,
-            tgt_lang=tgt_lang,
-            tokenizer_file=tokenizer_file,
-            eos_token=eos_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            mask_token=mask_token,
-            **kwargs,
-        )
-
-        self.vocab_file = vocab_file
-
-        self.lang_code_to_id = {
-            lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
-        }
-
-        self._src_lang = src_lang if src_lang is not None else "en_XX"
-        self.tgt_lang = tgt_lang
-        self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
-        self.set_src_lang_special_tokens(self._src_lang)
-
-    @property
-    def src_lang(self) -> str:
-        return self._src_lang
-
-    @src_lang.setter
-    def src_lang(self, new_src_lang: str) -> None:
-        self._src_lang = new_src_lang
-        self.set_src_lang_special_tokens(self._src_lang)
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. The special tokens depend on calling set_lang.
-
-        An MBART-50 sequence has the following format, where `X` represents the sequence:
-
-        - `input_ids` (for encoder) `[src_lang_code] X [eos]`
-        - `labels`: (for decoder) `[tgt_lang_code] X [eos]`
-
-        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
-        separator.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
-        # We don't expect to process pairs, but leave the pair logic for API consistency
-        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
-
-    def prepare_seq2seq_batch(
-        self,
-        src_texts: list[str],
-        src_lang: str = "en_XX",
-        tgt_texts: Optional[list[str]] = None,
-        tgt_lang: str = "ro_RO",
-        **kwargs,
-    ) -> BatchEncoding:
-        self.src_lang = src_lang
-        self.tgt_lang = tgt_lang
-        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
-
-    def _switch_to_input_mode(self):
-        return self.set_src_lang_special_tokens(self.src_lang)
-
-    def _switch_to_target_mode(self):
-        return self.set_tgt_lang_special_tokens(self.tgt_lang)
-
-    def set_src_lang_special_tokens(self, src_lang: str) -> None:
-        """Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos]."""
-        self.cur_lang_code_id = self.convert_tokens_to_ids(src_lang)
-        self.prefix_tokens = [self.cur_lang_code_id]
-        self.suffix_tokens = [self.eos_token_id]
-
-        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
-        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
-
-        self._tokenizer.post_processor = processors.TemplateProcessing(
-            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
-            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
-            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
-        )
-
-    def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None:
-        """Reset the special tokens to the target language setting. prefix=[src_lang_code] and suffix=[eos]."""
-        self.cur_lang_code_id = self.convert_tokens_to_ids(tgt_lang)
-        self.prefix_tokens = [self.cur_lang_code_id]
-        self.suffix_tokens = [self.eos_token_id]
-
-        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
-        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
-
-        self._tokenizer.post_processor = processors.TemplateProcessing(
-            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
-            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
-            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
-        )
-
-    def _build_translation_inputs(
-        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
-    ):
-        """Used by translation pipeline, to prepare inputs for the generate function"""
-        if src_lang is None or tgt_lang is None:
-            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
-        self.src_lang = src_lang
-        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
-        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
-        inputs["forced_bos_token_id"] = tgt_lang_id
-        return inputs
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not self.can_save_slow_tokenizer:
-            raise ValueError(
-                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
-                "tokenizer."
-            )
-
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
-
-
-__all__ = ["MBart50TokenizerFast"]
diff --git a/src/transformers/models/mgp_str/tokenization_mgp_str.py b/src/transformers/models/mgp_str/tokenization_mgp_str.py
index f29b8a8348ee..af33ca785329 100644
--- a/src/transformers/models/mgp_str/tokenization_mgp_str.py
+++ b/src/transformers/models/mgp_str/tokenization_mgp_str.py
@@ -18,7 +18,7 @@
 import os
 from typing import Optional
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_python import PreTrainedTokenizer
 from ...utils import logging
 
 
@@ -60,6 +60,7 @@ def __init__(self, vocab_file, unk_token="[GO]", bos_token="[GO]", eos_token="[s
             bos_token=bos_token,
             eos_token=eos_token,
             pad_token=pad_token,
+            special_tokens_pattern="none",
             **kwargs,
         )
 
diff --git a/src/transformers/models/mluke/tokenization_mluke.py b/src/transformers/models/mluke/tokenization_mluke.py
index b66f86a8808b..14d7565409cc 100644
--- a/src/transformers/models/mluke/tokenization_mluke.py
+++ b/src/transformers/models/mluke/tokenization_mluke.py
@@ -18,13 +18,12 @@
 import json
 import os
 from collections.abc import Mapping
-from shutil import copyfile
-from typing import Any, Optional, Union
+from typing import Optional, Union
 
 import numpy as np
-import sentencepiece as spm
+from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers
+from tokenizers.models import Unigram
 
-from ...tokenization_utils import PreTrainedTokenizer
 from ...tokenization_utils_base import (
     ENCODE_KWARGS_DOCSTRING,
     AddedToken,
@@ -37,8 +36,8 @@
     TruncationStrategy,
     to_py_obj,
 )
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import add_end_docstrings, is_torch_tensor, logging
-from ...utils.import_utils import requires
 
 
 logger = logging.get_logger(__name__)
@@ -129,8 +128,7 @@
 """
 
 
-@requires(backends=("sentencepiece",))
-class MLukeTokenizer(PreTrainedTokenizer):
+class MLukeTokenizer(TokenizersBackend):
     """
     Adapted from [`XLMRobertaTokenizer`] and [`LukeTokenizer`]. Based on
     [SentencePiece](https://github.com/google/sentencepiece).
@@ -220,8 +218,6 @@ class MLukeTokenizer(PreTrainedTokenizer):
 
     def __init__(
         self,
-        vocab_file,
-        entity_vocab_file,
         bos_token="",
         eos_token="",
         sep_token="",
@@ -238,14 +234,14 @@ def __init__(
         entity_pad_token="[PAD]",
         entity_mask_token="[MASK]",
         entity_mask2_token="[MASK2]",
-        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        vocab: Optional[list] = None,
+        entity_vocab: Optional[dict] = None,
         **kwargs,
     ) -> None:
         # Mask token behave like a normal word, i.e. include the space before it
         mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
 
         # we add 2 special tokens for downstream tasks
-        # for more information about lstrip and rstrip, see https://github.com/huggingface/transformers/pull/2778
         entity_token_1 = (
             AddedToken(entity_token_1, lstrip=False, rstrip=False)
             if isinstance(entity_token_1, str)
@@ -256,14 +252,38 @@ def __init__(
             if isinstance(entity_token_2, str)
             else entity_token_2
         )
-        additional_special_tokens = kwargs.pop("additional_special_tokens", [])
-        additional_special_tokens += [entity_token_1, entity_token_2]
 
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        # Handle entity vocab file for backward compatibility
+        entity_vocab_file = kwargs.pop("entity_vocab_file", None)
 
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(str(vocab_file))
-        self.vocab_file = vocab_file
+        # Check if vocab/entity_vocab are in kwargs
+        if vocab is None and "vocab" in kwargs:
+            vocab = kwargs.pop("vocab")
+        if entity_vocab is None and "entity_vocab" in kwargs:
+            entity_vocab = kwargs.pop("entity_vocab")
+
+        # Build vocab from data (list of (token, score) tuples)
+        if vocab is not None:
+            # vocab is list of (token, score) tuples from SentencePieceExtractor
+            self._vocab = [(token, float(score)) for token, score in vocab]
+            self._vocab_size = len(self._vocab)
+        else:
+            # Create minimal vocab with  to satisfy Unigram requirements
+            self._vocab = [("", 0.0)]
+            self._vocab_size = 0  # Will be updated when real vocab is loaded
+
+        # Build Unigram tokenizer
+        self._tokenizer = Tokenizer(Unigram(self._vocab, unk_id=0))
+
+        # Add SentencePiece-style normalization and pre-tokenization
+        self._tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.Replace("``", '"'),
+                normalizers.Replace("''", '"'),
+            ]
+        )
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always")
+        self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always")
 
         # Original fairseq vocab and spm vocab must be "aligned":
         # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
@@ -271,22 +291,34 @@ def __init__(
         # fairseq  | ''   | '' | '' | '' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
         # spm      | '' | ''   | '' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'
 
-        # Mimic fairseq token-to-id alignment for the first 4 token
+        # Mimic fairseq token-to-id alignment for the first 4 tokens
         self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3}
 
         # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
         self.fairseq_offset = 1
 
-        self.fairseq_tokens_to_ids[""] = len(self.sp_model) + self.fairseq_offset
+        self.fairseq_tokens_to_ids[""] = self._vocab_size + self.fairseq_offset
         self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
 
-        with open(entity_vocab_file, encoding="utf-8") as entity_vocab_handle:
-            self.entity_vocab = json.load(entity_vocab_handle)
+        # Load entity vocab
+        if entity_vocab is not None:
+            self.entity_vocab = entity_vocab
+        elif entity_vocab_file is not None:
+            with open(entity_vocab_file, encoding="utf-8") as entity_vocab_handle:
+                self.entity_vocab = json.load(entity_vocab_handle)
+        else:
+            # Create minimal entity vocab with required special tokens
+            self.entity_vocab = {
+                entity_unk_token: 0,
+                entity_pad_token: 1,
+                entity_mask_token: 2,
+                entity_mask2_token: 3,
+            }
+
         for entity_special_token in [entity_unk_token, entity_pad_token, entity_mask_token, entity_mask2_token]:
             if entity_special_token not in self.entity_vocab:
                 raise ValueError(
-                    f"Specified entity special token ``{entity_special_token}`` is not found in entity_vocab. "
-                    f"Probably an incorrect entity vocab file is loaded: {entity_vocab_file}."
+                    f"Specified entity special token ``{entity_special_token}`` is not found in entity_vocab."
                 )
         self.entity_unk_token_id = self.entity_vocab[entity_unk_token]
         self.entity_pad_token_id = self.entity_vocab[entity_pad_token]
@@ -308,7 +340,35 @@ def __init__(
 
         self.max_mention_length = max_mention_length
 
+        # Handle extra/legacy special tokens (v4 compat). The fallback load path can pass
+        # `additional_special_tokens` and/or `extra_special_tokens`, with entries serialized as dicts.
+        extra_tokens: list[AddedToken | str] = []
+        for key in ("extra_special_tokens", "additional_special_tokens"):
+            tokens = kwargs.pop(key, None)
+            if isinstance(tokens, (list, tuple)):
+                for token in tokens:
+                    extra_tokens.append(AddedToken(**token) if isinstance(token, dict) else token)
+
+        # Ensure MLuke entity tokens are present exactly once.
+        seen = {str(token) for token in extra_tokens}
+        for token in (entity_token_1, entity_token_2):
+            token_str = str(token)
+            if token_str not in seen:
+                extra_tokens.append(token)
+                seen.add(token_str)
+
+        # Also register entity masking/padding tokens so they survive save/load cycles.
+        for token in (entity_unk_token, entity_pad_token, entity_mask_token, entity_mask2_token):
+            if token not in seen:
+                extra_tokens.append(AddedToken(token, lstrip=False, rstrip=False, normalized=False, special=True))
+                seen.add(token)
+
+        kwargs["extra_special_tokens"] = extra_tokens
+
+        tokenizer_object = self._tokenizer
+
         super().__init__(
+            tokenizer_object=tokenizer_object,
             bos_token=bos_token,
             eos_token=eos_token,
             unk_token=unk_token,
@@ -316,75 +376,93 @@ def __init__(
             cls_token=cls_token,
             pad_token=pad_token,
             mask_token=mask_token,
-            sp_model_kwargs=self.sp_model_kwargs,
             task=task,
             max_entity_length=max_entity_length,
             max_mention_length=max_mention_length,
-            entity_token_1=entity_token_1,
-            entity_token_2=entity_token_2,
+            entity_token_1=str(entity_token_1),
+            entity_token_2=str(entity_token_2),
             entity_unk_token=entity_unk_token,
             entity_pad_token=entity_pad_token,
             entity_mask_token=entity_mask_token,
             entity_mask2_token=entity_mask2_token,
-            additional_special_tokens=additional_special_tokens,
+            entity_vocab=entity_vocab if entity_vocab_file is None else None,  # Only store if passed as data
             **kwargs,
         )
 
+        # Call _post_init for tokenizers created directly (not from_pretrained)
+        self._post_init()
+
+    def _post_init(self):
+        """
+        Post-initialization to configure the post-processor for MLuke's special token format.
+        """
+        super()._post_init()
+        # Ensure the Python-side vocab metadata matches the fast tokenizer backend after loading
+        self._vocab_size = self._tokenizer.get_vocab_size(with_added_tokens=False)
+        self.fairseq_tokens_to_ids[""] = self._vocab_size + self.fairseq_offset
+        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+
+        # Configure post processor for XLM-R/MLuke format:
+        # single:  X 
+        # pair:  A  B 
+        from tokenizers import processors
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"{self.cls_token}:0 $A:0 {self.sep_token}:0",
+            pair=f"{self.cls_token}:0 $A:0 {self.sep_token}:0 {self.sep_token}:0 $B:1 {self.sep_token}:1",
+            special_tokens=[
+                (self.cls_token, self.cls_token_id),
+                (self.sep_token, self.sep_token_id),
+            ],
+        )
+
     @property
-    # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer.vocab_size
     def vocab_size(self):
-        return len(self.sp_model) + self.fairseq_offset + 1  # Add the  token
+        return self._vocab_size + self.fairseq_offset + 1  # Add the  token
 
-    # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer.get_vocab
     def get_vocab(self):
         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
         vocab.update(self.added_tokens_encoder)
         return vocab
 
-    # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer._tokenize
-    def _tokenize(self, text: str) -> list[str]:
-        # TODO check if the t5/llama PR also applies here
-        return self.sp_model.encode(text, out_type=str)
-
-    # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer._convert_token_to_id
     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""
         if token in self.fairseq_tokens_to_ids:
             return self.fairseq_tokens_to_ids[token]
-        spm_id = self.sp_model.PieceToId(token)
 
-        # Need to return unknown token if the SP model returned 0
-        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
+        # Look up token in vocab
+        token_id = self._tokenizer.token_to_id(token)
+
+        # Need to return unknown token if not found (token_to_id returns None)
+        return token_id + self.fairseq_offset if token_id is not None else self.unk_token_id
 
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
         if index in self.fairseq_ids_to_tokens:
             return self.fairseq_ids_to_tokens[index]
-        return self.sp_model.IdToPiece(index - self.fairseq_offset)
+        token = self._tokenizer.id_to_token(index - self.fairseq_offset)
+        return token if token is not None else self.unk_token
 
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (strings for sub-words) in a single string."""
         out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
         return out_string
 
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__ = d
+    def num_special_tokens_to_add(self, pair: bool = False) -> int:
+        """
+        Returns the number of added tokens when encoding a sequence with special tokens.
 
-        # for backward compatibility
-        if not hasattr(self, "sp_model_kwargs"):
-            self.sp_model_kwargs = {}
+        Args:
+            pair (`bool`, *optional*, defaults to `False`):
+                Whether the number of added tokens should be computed in the case of a sequence pair or a single
+                sequence.
 
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
+        Returns:
+            `int`: Number of special tokens added to sequences.
+        """
+        return 4 if pair else 2
 
     @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    # Copied from transformers.models.luke.tokenization_luke.LukeTokenizer.__call__
     def __call__(
         self,
         text: Union[TextInput, list[TextInput]],
@@ -412,6 +490,36 @@ def __call__(
         verbose: bool = True,
         **kwargs,
     ) -> BatchEncoding:
+        # Check for seq2seq parameters that are not supported with entity-aware encoding
+        if kwargs.get("text_target") is not None or kwargs.get("text_pair_target") is not None:
+            if entity_spans is not None or entities is not None or self.task is not None:
+                raise NotImplementedError(
+                    "text_target and text_pair_target are not supported when using entity-aware encoding. "
+                    "Please use the tokenizer without entities for seq2seq tasks."
+                )
+            # Delegate to parent for seq2seq encoding
+            return super().__call__(
+                text=text,
+                text_pair=text_pair,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                is_split_into_words=is_split_into_words,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+
         """
         Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
         sequences, depending on the task you want to prepare them for.
@@ -453,7 +561,9 @@ def __call__(
         """
         # Input type checking for clearer error
         is_valid_single_text = isinstance(text, str)
-        is_valid_batch_text = isinstance(text, (list, tuple)) and (len(text) == 0 or (isinstance(text[0], str)))
+        is_valid_batch_text = isinstance(text, (list, tuple)) and (
+            len(text) == 0 or isinstance(text[0], (str, list, tuple))
+        )
         if not (is_valid_single_text or is_valid_batch_text):
             raise ValueError("text input must be of type `str` (single example) or `list[str]` (batch).")
 
@@ -466,6 +576,16 @@ def __call__(
 
         is_batched = bool(isinstance(text, (list, tuple)))
 
+        # Get proper padding and truncation strategies
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
         if is_batched:
             batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
             if entities is None:
@@ -482,13 +602,13 @@ def __call__(
                     list(zip(entity_spans, entity_spans_pair)) if entity_spans_pair is not None else entity_spans
                 )
 
-            return self.batch_encode_plus(
+            return self._batch_encode_plus(
                 batch_text_or_text_pairs=batch_text_or_text_pairs,
                 batch_entity_spans_or_entity_spans_pairs=batch_entity_spans_or_entity_spans_pairs,
                 batch_entities_or_entities_pairs=batch_entities_or_entities_pairs,
                 add_special_tokens=add_special_tokens,
-                padding=padding,
-                truncation=truncation,
+                padding_strategy=padding_strategy,
+                truncation_strategy=truncation_strategy,
                 max_length=max_length,
                 max_entity_length=max_entity_length,
                 stride=stride,
@@ -506,7 +626,7 @@ def __call__(
                 **kwargs,
             )
         else:
-            return self.encode_plus(
+            return self._encode_plus(
                 text=text,
                 text_pair=text_pair,
                 entity_spans=entity_spans,
@@ -514,8 +634,8 @@ def __call__(
                 entities=entities,
                 entities_pair=entities_pair,
                 add_special_tokens=add_special_tokens,
-                padding=padding,
-                truncation=truncation,
+                padding_strategy=padding_strategy,
+                truncation_strategy=truncation_strategy,
                 max_length=max_length,
                 max_entity_length=max_entity_length,
                 stride=stride,
@@ -533,7 +653,6 @@ def __call__(
                 **kwargs,
             )
 
-    # Copied from transformers.models.luke.tokenization_luke.LukeTokenizer._encode_plus
     def _encode_plus(
         self,
         text: Union[TextInput],
@@ -561,6 +680,35 @@ def _encode_plus(
         verbose: bool = True,
         **kwargs,
     ) -> BatchEncoding:
+        if (
+            entity_spans is None
+            and entity_spans_pair is None
+            and entities is None
+            and entities_pair is None
+            and self.task is None
+        ):
+            return super()._encode_plus(
+                text=text,
+                text_pair=text_pair,
+                add_special_tokens=add_special_tokens,
+                padding_strategy=padding_strategy,
+                truncation_strategy=truncation_strategy,
+                max_length=max_length,
+                stride=stride,
+                is_split_into_words=is_split_into_words,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+
         if return_offsets_mapping:
             raise NotImplementedError(
                 "return_offset_mapping is not available when using Python tokenizers. "
@@ -616,7 +764,6 @@ def _encode_plus(
             verbose=verbose,
         )
 
-    # Copied from transformers.models.luke.tokenization_luke.LukeTokenizer._batch_encode_plus
     def _batch_encode_plus(
         self,
         batch_text_or_text_pairs: Union[list[TextInput], list[TextInputPair]],
@@ -645,6 +792,41 @@ def _batch_encode_plus(
         verbose: bool = True,
         **kwargs,
     ) -> BatchEncoding:
+        if (
+            batch_entity_spans_or_entity_spans_pairs is None
+            and batch_entities_or_entities_pairs is None
+            and self.task is None
+        ):
+            if batch_text_or_text_pairs and isinstance(batch_text_or_text_pairs[0], (tuple, list)):
+                texts, text_pairs = zip(*batch_text_or_text_pairs)
+                texts = list(texts)
+                text_pairs = list(text_pairs)
+            else:
+                texts = batch_text_or_text_pairs
+                text_pairs = None
+
+            return super()._encode_plus(
+                text=texts,
+                text_pair=text_pairs,
+                add_special_tokens=add_special_tokens,
+                padding_strategy=padding_strategy,
+                truncation_strategy=truncation_strategy,
+                max_length=max_length,
+                stride=stride,
+                is_split_into_words=is_split_into_words,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+
         if return_offsets_mapping:
             raise NotImplementedError(
                 "return_offset_mapping is not available when using Python tokenizers. "
@@ -727,7 +909,6 @@ def _batch_encode_plus(
 
         return BatchEncoding(batch_outputs)
 
-    # Copied from transformers.models.luke.tokenization_luke.LukeTokenizer._check_entity_input_format
     def _check_entity_input_format(self, entities: Optional[EntityInput], entity_spans: Optional[EntitySpanInput]):
         if not isinstance(entity_spans, list):
             raise TypeError("entity_spans should be given as a list")
@@ -746,7 +927,6 @@ def _check_entity_input_format(self, entities: Optional[EntityInput], entity_spa
             if len(entities) != len(entity_spans):
                 raise ValueError("If you specify entities, entities and entity_spans must be the same length")
 
-    # Copied from transformers.models.luke.tokenization_luke.LukeTokenizer._create_input_sequence
     def _create_input_sequence(
         self,
         text: Union[TextInput],
@@ -758,7 +938,9 @@ def _create_input_sequence(
         **kwargs,
     ) -> tuple[list, list, list, list, list, list]:
         def get_input_ids(text):
-            tokens = self.tokenize(text, **kwargs)
+            # Use the underlying tokenizer directly to avoid infinite recursion
+            # Then convert to fairseq-aligned IDs
+            tokens = self._tokenizer.encode(text, add_special_tokens=False).tokens
             return self.convert_tokens_to_ids(tokens)
 
         def get_input_ids_and_entity_token_spans(text, entity_spans):
@@ -900,7 +1082,6 @@ def get_input_ids_and_entity_token_spans(text, entity_spans):
         )
 
     @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    # Copied from transformers.models.luke.tokenization_luke.LukeTokenizer._batch_prepare_for_model
     def _batch_prepare_for_model(
         self,
         batch_ids_pairs: list[tuple[list[int], None]],
@@ -986,7 +1167,6 @@ def _batch_prepare_for_model(
         return batch_outputs
 
     @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    # Copied from transformers.models.luke.tokenization_luke.LukeTokenizer.prepare_for_model
     def prepare_for_model(
         self,
         ids: list[int],
@@ -1218,7 +1398,6 @@ def prepare_for_model(
 
         return batch_outputs
 
-    # Copied from transformers.models.luke.tokenization_luke.LukeTokenizer.pad
     def pad(
         self,
         encoded_inputs: Union[
@@ -1383,7 +1562,6 @@ def pad(
 
         return BatchEncoding(batch_outputs, tensor_type=return_tensors)
 
-    # Copied from transformers.models.luke.tokenization_luke.LukeTokenizer._pad
     def _pad(
         self,
         encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
@@ -1528,21 +1706,13 @@ def _pad(
 
         return encoded_inputs
 
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str, str]:
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        """
+        Save only the entity vocabulary file. The tokenizer.json is saved by the parent TokenizersBackend.
+        """
         if not os.path.isdir(save_directory):
             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
+            return ()
 
         entity_vocab_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["entity_vocab_file"]
@@ -1551,9 +1721,8 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
         with open(entity_vocab_file, "w", encoding="utf-8") as f:
             f.write(json.dumps(self.entity_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
 
-        return out_vocab_file, entity_vocab_file
+        return (entity_vocab_file,)
 
-    # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer.build_inputs_with_special_tokens
     def build_inputs_with_special_tokens(
         self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
     ) -> list[int]:
@@ -1580,7 +1749,6 @@ def build_inputs_with_special_tokens(
         sep = [self.sep_token_id]
         return cls + token_ids_0 + sep + sep + token_ids_1 + sep
 
-    # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer.get_special_tokens_mask
     def get_special_tokens_mask(
         self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
     ) -> list[int]:
@@ -1609,7 +1777,6 @@ def get_special_tokens_mask(
             return [1] + ([0] * len(token_ids_0)) + [1]
         return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
 
-    # Copied from transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer.create_token_type_ids_from_sequences
     def create_token_type_ids_from_sequences(
         self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
     ) -> list[int]:
diff --git a/src/transformers/models/mobilebert/__init__.py b/src/transformers/models/mobilebert/__init__.py
index 0066f7f2b382..9548f0ea07ac 100644
--- a/src/transformers/models/mobilebert/__init__.py
+++ b/src/transformers/models/mobilebert/__init__.py
@@ -20,8 +20,7 @@
 if TYPE_CHECKING:
     from .configuration_mobilebert import *
     from .modeling_mobilebert import *
-    from .tokenization_mobilebert import *
-    from .tokenization_mobilebert_fast import *
+    from .tokenization_mobilebert import MobileBertTokenizer, MobileBertTokenizerFast
 else:
     import sys
 
diff --git a/src/transformers/models/mobilebert/tokenization_mobilebert.py b/src/transformers/models/mobilebert/tokenization_mobilebert.py
index 88628400ca49..6764442ce774 100644
--- a/src/transformers/models/mobilebert/tokenization_mobilebert.py
+++ b/src/transformers/models/mobilebert/tokenization_mobilebert.py
@@ -1,6 +1,5 @@
 # coding=utf-8
-#
-# Copyright 2020 The HuggingFace Team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,472 +12,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization classes for MobileBERT."""
-
-import collections
-import os
-import unicodedata
-from typing import Optional
-
-from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
-from ...utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
-
-# Copied from transformers.models.bert.tokenization_bert.load_vocab
-def load_vocab(vocab_file):
-    """Loads a vocabulary file into a dictionary."""
-    vocab = collections.OrderedDict()
-    with open(vocab_file, "r", encoding="utf-8") as reader:
-        tokens = reader.readlines()
-    for index, token in enumerate(tokens):
-        token = token.rstrip("\n")
-        vocab[token] = index
-    return vocab
-
-
-# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
-def whitespace_tokenize(text):
-    """Runs basic whitespace cleaning and splitting on a piece of text."""
-    text = text.strip()
-    if not text:
-        return []
-    tokens = text.split()
-    return tokens
-
-
-# Copied from transformers.models.bert.tokenization_bert.BertTokenizer with BERT->MobileBERT,Bert->MobileBert
-class MobileBertTokenizer(PreTrainedTokenizer):
-    r"""
-    Construct a MobileBERT tokenizer. Based on WordPiece.
-
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
-            Whether or not to do basic tokenization before WordPiece.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
-        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original MobileBERT).
-        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
-            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
-            extra spaces.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-
-    def __init__(
-        self,
-        vocab_file,
-        do_lower_case=True,
-        do_basic_tokenize=True,
-        never_split=None,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        clean_up_tokenization_spaces=True,
-        **kwargs,
-    ):
-        if not os.path.isfile(vocab_file):
-            raise ValueError(
-                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
-                " model use `tokenizer = MobileBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
-            )
-        self.vocab = load_vocab(vocab_file)
-        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
-        self.do_basic_tokenize = do_basic_tokenize
-        if do_basic_tokenize:
-            self.basic_tokenizer = BasicTokenizer(
-                do_lower_case=do_lower_case,
-                never_split=never_split,
-                tokenize_chinese_chars=tokenize_chinese_chars,
-                strip_accents=strip_accents,
-            )
-
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
-
-        super().__init__(
-            do_lower_case=do_lower_case,
-            do_basic_tokenize=do_basic_tokenize,
-            never_split=never_split,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            tokenize_chinese_chars=tokenize_chinese_chars,
-            strip_accents=strip_accents,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            **kwargs,
-        )
-
-    @property
-    def do_lower_case(self):
-        return self.basic_tokenizer.do_lower_case
-
-    @property
-    def vocab_size(self):
-        return len(self.vocab)
-
-    def get_vocab(self):
-        return dict(self.vocab, **self.added_tokens_encoder)
-
-    def _tokenize(self, text, split_special_tokens=False):
-        split_tokens = []
-        if self.do_basic_tokenize:
-            for token in self.basic_tokenizer.tokenize(
-                text, never_split=self.all_special_tokens if not split_special_tokens else None
-            ):
-                # If the token is part of the never_split set
-                if token in self.basic_tokenizer.never_split:
-                    split_tokens.append(token)
-                else:
-                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
-        else:
-            split_tokens = self.wordpiece_tokenizer.tokenize(text)
-        return split_tokens
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.vocab.get(token, self.vocab.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.ids_to_tokens.get(index, self.unk_token)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        out_string = " ".join(tokens).replace(" ##", "").strip()
-        return out_string
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A MobileBERT sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        index = 0
-        if os.path.isdir(save_directory):
-            vocab_file = os.path.join(
-                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-            )
-        else:
-            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
-        with open(vocab_file, "w", encoding="utf-8") as writer:
-            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
-                        " Please check that the vocabulary is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(token + "\n")
-                index += 1
-        return (vocab_file,)
-
-
-# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
-class BasicTokenizer:
-    """
-    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
-
-    Args:
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original BERT).
-        do_split_on_punc (`bool`, *optional*, defaults to `True`):
-            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
-            the full context of the words, such as contractions.
-    """
-
-    def __init__(
-        self,
-        do_lower_case=True,
-        never_split=None,
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        do_split_on_punc=True,
-    ):
-        if never_split is None:
-            never_split = []
-        self.do_lower_case = do_lower_case
-        self.never_split = set(never_split)
-        self.tokenize_chinese_chars = tokenize_chinese_chars
-        self.strip_accents = strip_accents
-        self.do_split_on_punc = do_split_on_punc
-
-    def tokenize(self, text, never_split=None):
-        """
-        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
-
-        Args:
-            never_split (`List[str]`, *optional*)
-                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
-        """
-        # union() returns a new set by concatenating the two sets.
-        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
-        text = self._clean_text(text)
-
-        # This was added on November 1st, 2018 for the multilingual and Chinese
-        # models. This is also applied to the English models now, but it doesn't
-        # matter since the English models were not trained on any Chinese data
-        # and generally don't have any Chinese data in them (there are Chinese
-        # characters in the vocabulary because Wikipedia does have some Chinese
-        # words in the English Wikipedia.).
-        if self.tokenize_chinese_chars:
-            text = self._tokenize_chinese_chars(text)
-        # prevents treating the same character with different unicode codepoints as different characters
-        unicode_normalized_text = unicodedata.normalize("NFC", text)
-        orig_tokens = whitespace_tokenize(unicode_normalized_text)
-        split_tokens = []
-        for token in orig_tokens:
-            if token not in never_split:
-                if self.do_lower_case:
-                    token = token.lower()
-                    if self.strip_accents is not False:
-                        token = self._run_strip_accents(token)
-                elif self.strip_accents:
-                    token = self._run_strip_accents(token)
-            split_tokens.extend(self._run_split_on_punc(token, never_split))
-
-        output_tokens = whitespace_tokenize(" ".join(split_tokens))
-        return output_tokens
-
-    def _run_strip_accents(self, text):
-        """Strips accents from a piece of text."""
-        text = unicodedata.normalize("NFD", text)
-        output = []
-        for char in text:
-            cat = unicodedata.category(char)
-            if cat == "Mn":
-                continue
-            output.append(char)
-        return "".join(output)
-
-    def _run_split_on_punc(self, text, never_split=None):
-        """Splits punctuation on a piece of text."""
-        if not self.do_split_on_punc or (never_split is not None and text in never_split):
-            return [text]
-        chars = list(text)
-        i = 0
-        start_new_word = True
-        output = []
-        while i < len(chars):
-            char = chars[i]
-            if _is_punctuation(char):
-                output.append([char])
-                start_new_word = True
-            else:
-                if start_new_word:
-                    output.append([])
-                start_new_word = False
-                output[-1].append(char)
-            i += 1
-
-        return ["".join(x) for x in output]
-
-    def _tokenize_chinese_chars(self, text):
-        """Adds whitespace around any CJK character."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if self._is_chinese_char(cp):
-                output.append(" ")
-                output.append(char)
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-    def _is_chinese_char(self, cp):
-        """Checks whether CP is the codepoint of a CJK character."""
-        # This defines a "chinese character" as anything in the CJK Unicode block:
-        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-        #
-        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-        # despite its name. The modern Korean Hangul alphabet is a different block,
-        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-        # space-separated words, so they are not treated specially and handled
-        # like the all of the other languages.
-        if (
-            (cp >= 0x4E00 and cp <= 0x9FFF)
-            or (cp >= 0x3400 and cp <= 0x4DBF)
-            or (cp >= 0x20000 and cp <= 0x2A6DF)
-            or (cp >= 0x2A700 and cp <= 0x2B73F)
-            or (cp >= 0x2B740 and cp <= 0x2B81F)
-            or (cp >= 0x2B820 and cp <= 0x2CEAF)
-            or (cp >= 0xF900 and cp <= 0xFAFF)
-            or (cp >= 0x2F800 and cp <= 0x2FA1F)
-        ):
-            return True
-
-        return False
-
-    def _clean_text(self, text):
-        """Performs invalid character removal and whitespace cleanup on text."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if cp == 0 or cp == 0xFFFD or _is_control(char):
-                continue
-            if _is_whitespace(char):
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-
-# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
-class WordpieceTokenizer:
-    """Runs WordPiece tokenization."""
-
-    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
-        self.vocab = vocab
-        self.unk_token = unk_token
-        self.max_input_chars_per_word = max_input_chars_per_word
-
-    def tokenize(self, text):
-        """
-        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
-        tokenization using the given vocabulary.
-
-        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
-
-        Args:
-            text: A single token or whitespace separated tokens. This should have
-                already been passed through *BasicTokenizer*.
-
-        Returns:
-            A list of wordpiece tokens.
-        """
+"""Tokenization classes for MobileBERT model."""
 
-        output_tokens = []
-        for token in whitespace_tokenize(text):
-            chars = list(token)
-            if len(chars) > self.max_input_chars_per_word:
-                output_tokens.append(self.unk_token)
-                continue
+from ..bert.tokenization_bert import BertTokenizer
 
-            is_bad = False
-            start = 0
-            sub_tokens = []
-            while start < len(chars):
-                end = len(chars)
-                cur_substr = None
-                while start < end:
-                    substr = "".join(chars[start:end])
-                    if start > 0:
-                        substr = "##" + substr
-                    if substr in self.vocab:
-                        cur_substr = substr
-                        break
-                    end -= 1
-                if cur_substr is None:
-                    is_bad = True
-                    break
-                sub_tokens.append(cur_substr)
-                start = end
 
-            if is_bad:
-                output_tokens.append(self.unk_token)
-            else:
-                output_tokens.extend(sub_tokens)
-        return output_tokens
+# MobileBertTokenizer is an alias for BertTokenizer
+MobileBertTokenizer = BertTokenizer
 
+# MobileBertTokenizerFast is an alias for MobileBertTokenizer (since BertTokenizer is already a fast tokenizer)
+MobileBertTokenizerFast = MobileBertTokenizer
 
-__all__ = ["MobileBertTokenizer"]
+__all__ = ["MobileBertTokenizer", "MobileBertTokenizerFast"]
diff --git a/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py b/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py
deleted file mode 100644
index 0f97ddbbb703..000000000000
--- a/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# coding=utf-8
-#
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for MobileBERT."""
-
-import json
-from typing import Optional
-
-from tokenizers import normalizers
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .tokenization_mobilebert import MobileBertTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
-
-
-# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast with BERT->MobileBERT,Bert->MobileBert
-class MobileBertTokenizerFast(PreTrainedTokenizerFast):
-    r"""
-    Construct a "fast" MobileBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        clean_text (`bool`, *optional*, defaults to `True`):
-            Whether or not to clean the text before tokenization by removing any control characters and replacing all
-            whitespaces by the classic one.
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
-            issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original MobileBERT).
-        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
-            The prefix for subwords.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = MobileBertTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        do_lower_case=True,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            do_lower_case=do_lower_case,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            tokenize_chinese_chars=tokenize_chinese_chars,
-            strip_accents=strip_accents,
-            **kwargs,
-        )
-
-        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
-        if (
-            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
-            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
-            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
-        ):
-            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
-            normalizer_state["lowercase"] = do_lower_case
-            normalizer_state["strip_accents"] = strip_accents
-            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
-            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
-
-        self.do_lower_case = do_lower_case
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A MobileBERT sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-
-        if token_ids_1 is not None:
-            output += token_ids_1 + [self.sep_token_id]
-
-        return output
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-
-__all__ = ["MobileBertTokenizerFast"]
diff --git a/src/transformers/models/mpnet/__init__.py b/src/transformers/models/mpnet/__init__.py
index 402cc164b979..f76c1b3f538a 100644
--- a/src/transformers/models/mpnet/__init__.py
+++ b/src/transformers/models/mpnet/__init__.py
@@ -21,7 +21,6 @@
     from .configuration_mpnet import *
     from .modeling_mpnet import *
     from .tokenization_mpnet import *
-    from .tokenization_mpnet_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/mpnet/tokenization_mpnet.py b/src/transformers/models/mpnet/tokenization_mpnet.py
index bf035cf8e4bd..0cd914aa3c9c 100644
--- a/src/transformers/models/mpnet/tokenization_mpnet.py
+++ b/src/transformers/models/mpnet/tokenization_mpnet.py
@@ -15,58 +15,35 @@
 # limitations under the License.
 """Tokenization classes for MPNet."""
 
-import collections
-import os
-import unicodedata
 from typing import Optional
 
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
+from tokenizers.models import WordPiece
+
+from ...tokenization_python import AddedToken
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
 
 
 logger = logging.get_logger(__name__)
 
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
-
-def load_vocab(vocab_file):
-    """Loads a vocabulary file into a dictionary."""
-    vocab = collections.OrderedDict()
-    with open(vocab_file, "r", encoding="utf-8") as reader:
-        tokens = reader.readlines()
-    for index, token in enumerate(tokens):
-        token = token.rstrip("\n")
-        vocab[token] = index
-    return vocab
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
 
 
-def whitespace_tokenize(text):
-    """Runs basic whitespace cleaning and splitting on a piece of text."""
-    text = text.strip()
-    if not text:
-        return []
-    tokens = text.split()
-    return tokens
+class MPNetTokenizer(TokenizersBackend):
+    r"""
+    Construct a MPNet tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
 
-
-class MPNetTokenizer(PreTrainedTokenizer):
-    """
-
-    This tokenizer inherits from [`BertTokenizer`] which contains most of the methods. Users should refer to the
-    superclass for more information regarding methods.
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
+        vocab (`dict`, *optional*):
+            Dictionary mapping tokens to their IDs. If not provided, an empty vocab is initialized.
         do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
-            Whether or not to do basic tokenization before WordPiece.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
         bos_token (`str`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
             
 
@@ -101,16 +78,11 @@ class MPNetTokenizer(PreTrainedTokenizer):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
         tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
+            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+            issue](https://github.com/huggingface/transformers/issues/328)).
         strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
             value for `lowercase` (as in the original BERT).
-        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
-            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
-            extra spaces.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -118,10 +90,8 @@ class MPNetTokenizer(PreTrainedTokenizer):
 
     def __init__(
         self,
-        vocab_file,
+        vocab: Optional[dict] = None,
         do_lower_case=True,
-        do_basic_tokenize=True,
-        never_split=None,
         bos_token="",
         eos_token="",
         sep_token="",
@@ -131,407 +101,106 @@ def __init__(
         mask_token="",
         tokenize_chinese_chars=True,
         strip_accents=None,
-        clean_up_tokenization_spaces=True,
         **kwargs,
     ):
-        bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
-        sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
-        cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
-        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
+        # Initialize vocab
+        if vocab is not None:
+            self._vocab = (
+                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
+            )
+        else:
+            self._vocab = {}
+
+        # Initialize the tokenizer with WordPiece model
+        self._tokenizer = Tokenizer(WordPiece(self._vocab, unk_token=str(unk_token)))
+
+        # Set normalizer based on MPNetConverter logic
+        self._tokenizer.normalizer = normalizers.BertNormalizer(
+            clean_text=True,
+            handle_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            lowercase=do_lower_case,
+        )
+
+        # Set pre-tokenizer
+        self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
+
+        # Set decoder
+        self._tokenizer.decoder = decoders.WordPiece(prefix="##")
+
+        # Store do_lower_case for later use
+        self.do_lower_case = do_lower_case
+
+        # Handle special token initialization
+        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
+        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
+        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
 
         # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
 
-        if not os.path.isfile(vocab_file):
-            raise ValueError(
-                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
-                " model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
-            )
-        self.vocab = load_vocab(vocab_file)
-        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
-        self.do_basic_tokenize = do_basic_tokenize
-        if do_basic_tokenize:
-            self.basic_tokenizer = BasicTokenizer(
-                do_lower_case=do_lower_case,
-                never_split=never_split,
-                tokenize_chinese_chars=tokenize_chinese_chars,
-                strip_accents=strip_accents,
-            )
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+        # Store for later use
+        tokenizer_object = self._tokenizer
 
         super().__init__(
+            tokenizer_object=tokenizer_object,
             do_lower_case=do_lower_case,
-            do_basic_tokenize=do_basic_tokenize,
-            never_split=never_split,
             bos_token=bos_token,
             eos_token=eos_token,
-            unk_token=unk_token,
             sep_token=sep_token,
             cls_token=cls_token,
+            unk_token=unk_token,
             pad_token=pad_token,
             mask_token=mask_token,
             tokenize_chinese_chars=tokenize_chinese_chars,
             strip_accents=strip_accents,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             **kwargs,
         )
 
-    @property
-    def do_lower_case(self):
-        return self.basic_tokenizer.do_lower_case
+        # Set post_processor after super().__init__ to ensure we have token IDs
+        cls_str = str(self.cls_token)
+        sep_str = str(self.sep_token)
+        cls_token_id = self.cls_token_id if self.cls_token_id is not None else 0
+        sep_token_id = self.sep_token_id if self.sep_token_id is not None else 2
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"{cls_str}:0 $A:0 {sep_str}:0",
+            pair=f"{cls_str}:0 $A:0 {sep_str}:0 {sep_str}:0 $B:1 {sep_str}:1",  # MPNet uses two [SEP] tokens
+            special_tokens=[
+                (cls_str, cls_token_id),
+                (sep_str, sep_token_id),
+            ],
+        )
 
     @property
-    def vocab_size(self):
-        return len(self.vocab)
-
-    def get_vocab(self):
-        # "" is part of the vocab, but was wrongfully added at a wrong index in the fast saved version
-        vocab = self.added_tokens_encoder.copy()
-        vocab.update(self.vocab)
-        return vocab
-
-    def _tokenize(self, text):
-        split_tokens = []
-        if self.do_basic_tokenize:
-            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
-                # If the token is part of the never_split set
-                if token in self.basic_tokenizer.never_split:
-                    split_tokens.append(token)
-                else:
-                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
-        else:
-            split_tokens = self.wordpiece_tokenizer.tokenize(text)
-        return split_tokens
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.vocab.get(token, self.vocab.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.ids_to_tokens.get(index, self.unk_token)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        out_string = " ".join(tokens).replace(" ##", "").strip()
-        return out_string
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
+    def mask_token(self) -> str:
         """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A MPNet sequence has the following format:
-
-        - single sequence: ` X `
-        - pair of sequences: ` A  B `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
+        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
+        having been set.
 
-        Returns:
-            `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        MPNet tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
+        comprise the space before the **.
         """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
+        if self._mask_token is None:
+            if self.verbose:
+                logger.error("Using mask_token, but it is not set yet.")
+            return None
+        return str(self._mask_token)
+
+    @mask_token.setter
+    def mask_token(self, value):
         """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` methods.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of ids.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Set to True if the token list is already formatted with special tokens for the model
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
+        Overriding the default behavior of the mask token to have it eat the space before it.
 
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
+        This is needed to preserve backward compatibility with all the previously used models based on MPNet.
         """
-        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. MPNet does not
-        make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of ids.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        index = 0
-        if os.path.isdir(save_directory):
-            vocab_file = os.path.join(
-                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-            )
-        else:
-            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
-        with open(vocab_file, "w", encoding="utf-8") as writer:
-            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
-                        " Please check that the vocabulary is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(token + "\n")
-                index += 1
-        return (vocab_file,)
-
-
-# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
-class BasicTokenizer:
-    """
-    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
-
-    Args:
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original BERT).
-        do_split_on_punc (`bool`, *optional*, defaults to `True`):
-            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
-            the full context of the words, such as contractions.
-    """
-
-    def __init__(
-        self,
-        do_lower_case=True,
-        never_split=None,
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        do_split_on_punc=True,
-    ):
-        if never_split is None:
-            never_split = []
-        self.do_lower_case = do_lower_case
-        self.never_split = set(never_split)
-        self.tokenize_chinese_chars = tokenize_chinese_chars
-        self.strip_accents = strip_accents
-        self.do_split_on_punc = do_split_on_punc
-
-    def tokenize(self, text, never_split=None):
-        """
-        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
-
-        Args:
-            never_split (`List[str]`, *optional*)
-                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
-        """
-        # union() returns a new set by concatenating the two sets.
-        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
-        text = self._clean_text(text)
-
-        # This was added on November 1st, 2018 for the multilingual and Chinese
-        # models. This is also applied to the English models now, but it doesn't
-        # matter since the English models were not trained on any Chinese data
-        # and generally don't have any Chinese data in them (there are Chinese
-        # characters in the vocabulary because Wikipedia does have some Chinese
-        # words in the English Wikipedia.).
-        if self.tokenize_chinese_chars:
-            text = self._tokenize_chinese_chars(text)
-        # prevents treating the same character with different unicode codepoints as different characters
-        unicode_normalized_text = unicodedata.normalize("NFC", text)
-        orig_tokens = whitespace_tokenize(unicode_normalized_text)
-        split_tokens = []
-        for token in orig_tokens:
-            if token not in never_split:
-                if self.do_lower_case:
-                    token = token.lower()
-                    if self.strip_accents is not False:
-                        token = self._run_strip_accents(token)
-                elif self.strip_accents:
-                    token = self._run_strip_accents(token)
-            split_tokens.extend(self._run_split_on_punc(token, never_split))
-
-        output_tokens = whitespace_tokenize(" ".join(split_tokens))
-        return output_tokens
-
-    def _run_strip_accents(self, text):
-        """Strips accents from a piece of text."""
-        text = unicodedata.normalize("NFD", text)
-        output = []
-        for char in text:
-            cat = unicodedata.category(char)
-            if cat == "Mn":
-                continue
-            output.append(char)
-        return "".join(output)
-
-    def _run_split_on_punc(self, text, never_split=None):
-        """Splits punctuation on a piece of text."""
-        if not self.do_split_on_punc or (never_split is not None and text in never_split):
-            return [text]
-        chars = list(text)
-        i = 0
-        start_new_word = True
-        output = []
-        while i < len(chars):
-            char = chars[i]
-            if _is_punctuation(char):
-                output.append([char])
-                start_new_word = True
-            else:
-                if start_new_word:
-                    output.append([])
-                start_new_word = False
-                output[-1].append(char)
-            i += 1
-
-        return ["".join(x) for x in output]
-
-    def _tokenize_chinese_chars(self, text):
-        """Adds whitespace around any CJK character."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if self._is_chinese_char(cp):
-                output.append(" ")
-                output.append(char)
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-    def _is_chinese_char(self, cp):
-        """Checks whether CP is the codepoint of a CJK character."""
-        # This defines a "chinese character" as anything in the CJK Unicode block:
-        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-        #
-        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-        # despite its name. The modern Korean Hangul alphabet is a different block,
-        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-        # space-separated words, so they are not treated specially and handled
-        # like the all of the other languages.
-        if (
-            (cp >= 0x4E00 and cp <= 0x9FFF)
-            or (cp >= 0x3400 and cp <= 0x4DBF)
-            or (cp >= 0x20000 and cp <= 0x2A6DF)
-            or (cp >= 0x2A700 and cp <= 0x2B73F)
-            or (cp >= 0x2B740 and cp <= 0x2B81F)
-            or (cp >= 0x2B820 and cp <= 0x2CEAF)
-            or (cp >= 0xF900 and cp <= 0xFAFF)
-            or (cp >= 0x2F800 and cp <= 0x2FA1F)
-        ):
-            return True
-
-        return False
-
-    def _clean_text(self, text):
-        """Performs invalid character removal and whitespace cleanup on text."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if cp == 0 or cp == 0xFFFD or _is_control(char):
-                continue
-            if _is_whitespace(char):
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-
-# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
-class WordpieceTokenizer:
-    """Runs WordPiece tokenization."""
-
-    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
-        self.vocab = vocab
-        self.unk_token = unk_token
-        self.max_input_chars_per_word = max_input_chars_per_word
-
-    def tokenize(self, text):
-        """
-        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
-        tokenization using the given vocabulary.
-
-        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
-
-        Args:
-            text: A single token or whitespace separated tokens. This should have
-                already been passed through *BasicTokenizer*.
-
-        Returns:
-            A list of wordpiece tokens.
-        """
-
-        output_tokens = []
-        for token in whitespace_tokenize(text):
-            chars = list(token)
-            if len(chars) > self.max_input_chars_per_word:
-                output_tokens.append(self.unk_token)
-                continue
-
-            is_bad = False
-            start = 0
-            sub_tokens = []
-            while start < len(chars):
-                end = len(chars)
-                cur_substr = None
-                while start < end:
-                    substr = "".join(chars[start:end])
-                    if start > 0:
-                        substr = "##" + substr
-                    if substr in self.vocab:
-                        cur_substr = substr
-                        break
-                    end -= 1
-                if cur_substr is None:
-                    is_bad = True
-                    break
-                sub_tokens.append(cur_substr)
-                start = end
-
-            if is_bad:
-                output_tokens.append(self.unk_token)
-            else:
-                output_tokens.extend(sub_tokens)
-        return output_tokens
+        # Mask token behave like a normal word, i.e. include the space before it
+        # So we set lstrip to True
+        value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
+        self._mask_token = value
 
 
 __all__ = ["MPNetTokenizer"]
diff --git a/src/transformers/models/mpnet/tokenization_mpnet_fast.py b/src/transformers/models/mpnet/tokenization_mpnet_fast.py
deleted file mode 100644
index 1a470565a845..000000000000
--- a/src/transformers/models/mpnet/tokenization_mpnet_fast.py
+++ /dev/null
@@ -1,209 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team, Microsoft Corporation.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Fast Tokenization classes for MPNet."""
-
-import json
-from typing import Optional
-
-from tokenizers import normalizers
-
-from ...tokenization_utils import AddedToken
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .tokenization_mpnet import MPNetTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
-
-
-class MPNetTokenizerFast(PreTrainedTokenizerFast):
-    r"""
-    Construct a "fast" MPNet tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        bos_token (`str`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        mask_token (`str`, *optional*, defaults to `""`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
-            issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original BERT).
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = MPNetTokenizer
-    model_input_names = ["input_ids", "attention_mask"]
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        do_lower_case=True,
-        bos_token="",
-        eos_token="",
-        sep_token="",
-        cls_token="",
-        unk_token="[UNK]",
-        pad_token="",
-        mask_token="",
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        **kwargs,
-    ):
-        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
-        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
-        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
-        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
-
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            do_lower_case=do_lower_case,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            mask_token=mask_token,
-            tokenize_chinese_chars=tokenize_chinese_chars,
-            strip_accents=strip_accents,
-            **kwargs,
-        )
-
-        pre_tok_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
-        if (
-            pre_tok_state.get("lowercase", do_lower_case) != do_lower_case
-            or pre_tok_state.get("strip_accents", strip_accents) != strip_accents
-        ):
-            pre_tok_class = getattr(normalizers, pre_tok_state.pop("type"))
-            pre_tok_state["lowercase"] = do_lower_case
-            pre_tok_state["strip_accents"] = strip_accents
-            self.backend_tokenizer.normalizer = pre_tok_class(**pre_tok_state)
-
-        self.do_lower_case = do_lower_case
-
-    @property
-    def mask_token(self) -> str:
-        """
-        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
-        having been set.
-
-        MPNet tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
-        comprise the space before the **.
-        """
-        if self._mask_token is None:
-            if self.verbose:
-                logger.error("Using mask_token, but it is not set yet.")
-            return None
-        return str(self._mask_token)
-
-    @mask_token.setter
-    def mask_token(self, value):
-        """
-        Overriding the default behavior of the mask token to have it eat the space before it.
-
-        This is needed to preserve backward compatibility with all the previously used models based on MPNet.
-        """
-        # Mask token behave like a normal word, i.e. include the space before it
-        # So we set lstrip to True
-        value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
-        self._mask_token = value
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
-        if token_ids_1 is None:
-            return output
-
-        return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. MPNet does not
-        make use of token type ids, therefore a list of zeros is returned
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of ids.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs
-
-        Returns:
-            `list[int]`: List of zeros.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-
-__all__ = ["MPNetTokenizerFast"]
diff --git a/src/transformers/models/mt5/tokenization_mt5.py b/src/transformers/models/mt5/tokenization_mt5.py
deleted file mode 100644
index a3058816ff20..000000000000
--- a/src/transformers/models/mt5/tokenization_mt5.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# coding=utf-8
-# Copyright 2020, The T5 Authors and HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""mT5 tokenization file"""
-
-from ..t5 import T5Tokenizer
-
-
-class MT5Tokenizer(T5Tokenizer):
-    pass
-
-
-__all__ = ["MT5Tokenizer"]
diff --git a/src/transformers/models/mt5/tokenization_mt5_fast.py b/src/transformers/models/mt5/tokenization_mt5_fast.py
deleted file mode 100644
index 8737088cc442..000000000000
--- a/src/transformers/models/mt5/tokenization_mt5_fast.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# coding=utf-8
-# Copyright 2020, The T5 Authors and HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""mT5 tokenization file"""
-
-from ..t5 import T5TokenizerFast
-
-
-class MT5TokenizerFast(T5TokenizerFast):
-    pass
-
-
-__all__ = ["MT5TokenizerFast"]
diff --git a/src/transformers/models/mvp/__init__.py b/src/transformers/models/mvp/__init__.py
index beab37f65c1a..4f0d850d308f 100644
--- a/src/transformers/models/mvp/__init__.py
+++ b/src/transformers/models/mvp/__init__.py
@@ -18,10 +18,9 @@
 
 
 if TYPE_CHECKING:
+    from ..roberta.tokenization_roberta import RobertaTokenizer as MvpTokenizer
     from .configuration_mvp import *
     from .modeling_mvp import *
-    from .tokenization_mvp import *
-    from .tokenization_mvp_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/mvp/tokenization_mvp.py b/src/transformers/models/mvp/tokenization_mvp.py
deleted file mode 100644
index f6039df2dc02..000000000000
--- a/src/transformers/models/mvp/tokenization_mvp.py
+++ /dev/null
@@ -1,394 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import os
-from functools import lru_cache
-from typing import Optional
-
-import regex as re
-
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
-from ...utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
-
-# See all MVP models at https://huggingface.co/models?filter=mvp
-
-
-@lru_cache
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
-    characters the bpe code barfs on.
-
-    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
-    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
-    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
-    tables between utf-8 bytes and unicode strings.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-def get_pairs(word):
-    """
-    Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-
-class MvpTokenizer(PreTrainedTokenizer):
-    """
-    Constructs a MVP tokenizer, which is smilar to the RoBERTa tokenizer, using byte-level Byte-Pair-Encoding.
-
-    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
-    be encoded differently whether it is at the beginning of the sentence (without space) or not:
-
-    ```python
-    >>> from transformers import MvpTokenizer
-
-    >>> tokenizer = MvpTokenizer.from_pretrained("RUCAIBox/mvp")
-    >>> tokenizer("Hello world")["input_ids"]
-    [0, 31414, 232, 2]
-
-    >>> tokenizer(" Hello world")["input_ids"]
-    [0, 20920, 232, 2]
-    ```
-
-    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
-    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
-
-    
-
-    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
-
-    
-
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        merges_file (`str`):
-            Path to the merges file.
-        errors (`str`, *optional*, defaults to `"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See
-            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
-        bos_token (`str`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        mask_token (`str`, *optional*, defaults to `""`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        add_prefix_space (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word. (MVP tokenizer detect beginning of words by the preceding space).
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-
-    def __init__(
-        self,
-        vocab_file,
-        merges_file,
-        errors="replace",
-        bos_token="",
-        eos_token="",
-        sep_token="",
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        mask_token="",
-        add_prefix_space=False,
-        **kwargs,
-    ):
-        bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
-        sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
-        cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
-        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
-
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            bpe_merges = merges_handle.read().split("\n")[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        self.cache = {}
-        self.add_prefix_space = add_prefix_space
-
-        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
-        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
-
-        super().__init__(
-            errors=errors,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            pad_token=pad_token,
-            mask_token=mask_token,
-            add_prefix_space=add_prefix_space,
-            **kwargs,
-        )
-
-    @property
-    def vocab_size(self):
-        return len(self.encoder)
-
-    def get_vocab(self):
-        vocab = self.encoder.copy()
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                except ValueError:
-                    new_word.extend(word[i:])
-                    break
-                else:
-                    new_word.extend(word[i:j])
-                    i = j
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        self.cache[token] = word
-        return word
-
-    def _tokenize(self, text):
-        """Tokenize a string."""
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            token = "".join(
-                self.byte_encoder[b] for b in token.encode("utf-8")
-            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
-            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
-        return bpe_tokens
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        text = "".join(tokens)
-        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
-        return text
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-        merge_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
-        )
-
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
-
-        return vocab_file, merge_file
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A MVP sequence has the following format:
-
-        - single sequence: ` X `
-        - pair of sequences: ` A  B `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. MVP does not
-        make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
-        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
-        if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
-            text = " " + text
-        return (text, kwargs)
-
-
-__all__ = ["MvpTokenizer"]
diff --git a/src/transformers/models/mvp/tokenization_mvp_fast.py b/src/transformers/models/mvp/tokenization_mvp_fast.py
deleted file mode 100644
index ca0bc6b165f7..000000000000
--- a/src/transformers/models/mvp/tokenization_mvp_fast.py
+++ /dev/null
@@ -1,274 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-from typing import Optional
-
-from tokenizers import processors
-
-from ...tokenization_utils_base import AddedToken, BatchEncoding
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .tokenization_mvp import MvpTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
-
-# See all MVP models at https://huggingface.co/models?filter=mvp
-
-
-class MvpTokenizerFast(PreTrainedTokenizerFast):
-    r"""
-    Construct a "fast" MVP tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2 tokenizer,
-    using byte-level Byte-Pair-Encoding.
-
-    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
-    be encoded differently whether it is at the beginning of the sentence (without space) or not:
-
-    ```python
-    >>> from transformers import MvpTokenizerFast
-
-    >>> tokenizer = MvpTokenizerFast.from_pretrained("RUCAIBox/mvp")
-    >>> tokenizer("Hello world")["input_ids"]
-    [0, 31414, 232, 2]
-
-    >>> tokenizer(" Hello world")["input_ids"]
-    [0, 20920, 232, 2]
-    ```
-
-    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
-    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
-
-    
-
-    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
-
-    
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        merges_file (`str`):
-            Path to the merges file.
-        errors (`str`, *optional*, defaults to `"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See
-            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
-        bos_token (`str`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        mask_token (`str`, *optional*, defaults to `""`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        add_prefix_space (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word. (MVP tokenizer detect beginning of words by the preceding space).
-        trim_offsets (`bool`, *optional*, defaults to `True`):
-            Whether the post processing step should trim offsets to avoid including whitespaces.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = MvpTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        merges_file=None,
-        tokenizer_file=None,
-        errors="replace",
-        bos_token="",
-        eos_token="",
-        sep_token="",
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        mask_token="",
-        add_prefix_space=False,
-        trim_offsets=True,
-        **kwargs,
-    ):
-        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
-        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
-        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
-        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
-        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
-
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-        super().__init__(
-            vocab_file,
-            merges_file,
-            tokenizer_file=tokenizer_file,
-            errors=errors,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            mask_token=mask_token,
-            add_prefix_space=add_prefix_space,
-            trim_offsets=trim_offsets,
-            **kwargs,
-        )
-
-        # the pre_tokenizer is already updated in the GPT2TokenizerFast `__init__`
-        tokenizer_component = "post_processor"
-        tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None)
-        if tokenizer_component_instance:
-            state = json.loads(tokenizer_component_instance.__getstate__())
-
-            # The lists 'sep' and 'cls' must be cased in tuples for the object `post_processor_class`
-            if "sep" in state:
-                state["sep"] = tuple(state["sep"])
-            if "cls" in state:
-                state["cls"] = tuple(state["cls"])
-
-            changes_to_apply = False
-
-            if state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
-                state["add_prefix_space"] = add_prefix_space
-                changes_to_apply = True
-
-            if state.get("trim_offsets", trim_offsets) != trim_offsets:
-                state["trim_offsets"] = trim_offsets
-                changes_to_apply = True
-
-            if changes_to_apply:
-                component_class = getattr(processors, state.pop("type"))
-                new_value = component_class(**state)
-                setattr(self.backend_tokenizer, tokenizer_component, new_value)
-
-    @property
-    def mask_token(self) -> str:
-        """
-        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
-        having been set.
-
-        MVP tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
-        comprise the space before the **.
-        """
-        if self._mask_token is None:
-            if self.verbose:
-                logger.error("Using mask_token, but it is not set yet.")
-            return None
-        return str(self._mask_token)
-
-    @mask_token.setter
-    def mask_token(self, value):
-        """
-        Overriding the default behavior of the mask token to have it eat the space before it.
-
-        This is needed to preserve backward compatibility with all the previously used models based on Mvp.
-        """
-        # Mask token behave like a normal word, i.e. include the space before it
-        # So we set lstrip to True
-        value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
-        self._mask_token = value
-
-    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-
-        if is_split_into_words and not self.add_prefix_space:
-            raise ValueError(
-                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
-                "to use it with pretokenized inputs."
-            )
-
-        return super()._batch_encode_plus(*args, **kwargs)
-
-    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-
-        if is_split_into_words and not self.add_prefix_space:
-            raise ValueError(
-                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
-                "to use it with pretokenized inputs."
-            )
-
-        return super()._encode_plus(*args, **kwargs)
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
-        if token_ids_1 is None:
-            return output
-
-        return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. MVP does not
-        make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-
-__all__ = ["MvpTokenizerFast"]
diff --git a/src/transformers/models/myt5/tokenization_myt5.py b/src/transformers/models/myt5/tokenization_myt5.py
index 251e3d602b99..2d8d0ef27612 100644
--- a/src/transformers/models/myt5/tokenization_myt5.py
+++ b/src/transformers/models/myt5/tokenization_myt5.py
@@ -20,7 +20,7 @@
 from collections import defaultdict
 from typing import Optional, Union
 
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_python import AddedToken, PreTrainedTokenizer
 from ...utils import logging
 
 
diff --git a/src/transformers/models/nllb/__init__.py b/src/transformers/models/nllb/__init__.py
index 5cdb326098a3..9fe4c4b7f938 100644
--- a/src/transformers/models/nllb/__init__.py
+++ b/src/transformers/models/nllb/__init__.py
@@ -19,7 +19,6 @@
 
 if TYPE_CHECKING:
     from .tokenization_nllb import *
-    from .tokenization_nllb_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/nllb/tokenization_nllb.py b/src/transformers/models/nllb/tokenization_nllb.py
index 4962a642bb31..21e62cabfd6f 100644
--- a/src/transformers/models/nllb/tokenization_nllb.py
+++ b/src/transformers/models/nllb/tokenization_nllb.py
@@ -13,34 +13,32 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-from shutil import copyfile
-from typing import Any, Optional
+from typing import Optional
 
-import sentencepiece as spm
+from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
+from tokenizers.models import BPE
 
-from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
+from ...tokenization_python import AddedToken, BatchEncoding
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
-from ...utils.import_utils import requires
 
 
 logger = logging.get_logger(__name__)
 
-SPIECE_UNDERLINE = "▁"
 
-VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
 
 
 FAIRSEQ_LANGUAGE_CODES = ['ace_Arab', 'ace_Latn', 'acm_Arab', 'acq_Arab', 'aeb_Arab', 'afr_Latn', 'ajp_Arab', 'aka_Latn', 'amh_Ethi', 'apc_Arab', 'arb_Arab', 'ars_Arab', 'ary_Arab', 'arz_Arab', 'asm_Beng', 'ast_Latn', 'awa_Deva', 'ayr_Latn', 'azb_Arab', 'azj_Latn', 'bak_Cyrl', 'bam_Latn', 'ban_Latn', 'bel_Cyrl', 'bem_Latn', 'ben_Beng', 'bho_Deva', 'bjn_Arab', 'bjn_Latn', 'bod_Tibt', 'bos_Latn', 'bug_Latn', 'bul_Cyrl', 'cat_Latn', 'ceb_Latn', 'ces_Latn', 'cjk_Latn', 'ckb_Arab', 'crh_Latn', 'cym_Latn', 'dan_Latn', 'deu_Latn', 'dik_Latn', 'dyu_Latn', 'dzo_Tibt', 'ell_Grek', 'eng_Latn', 'epo_Latn', 'est_Latn', 'eus_Latn', 'ewe_Latn', 'fao_Latn', 'pes_Arab', 'fij_Latn', 'fin_Latn', 'fon_Latn', 'fra_Latn', 'fur_Latn', 'fuv_Latn', 'gla_Latn', 'gle_Latn', 'glg_Latn', 'grn_Latn', 'guj_Gujr', 'hat_Latn', 'hau_Latn', 'heb_Hebr', 'hin_Deva', 'hne_Deva', 'hrv_Latn', 'hun_Latn', 'hye_Armn', 'ibo_Latn', 'ilo_Latn', 'ind_Latn', 'isl_Latn', 'ita_Latn', 'jav_Latn', 'jpn_Jpan', 'kab_Latn', 'kac_Latn', 'kam_Latn', 'kan_Knda', 'kas_Arab', 'kas_Deva', 'kat_Geor', 'knc_Arab', 'knc_Latn', 'kaz_Cyrl', 'kbp_Latn', 'kea_Latn', 'khm_Khmr', 'kik_Latn', 'kin_Latn', 'kir_Cyrl', 'kmb_Latn', 'kon_Latn', 'kor_Hang', 'kmr_Latn', 'lao_Laoo', 'lvs_Latn', 'lij_Latn', 'lim_Latn', 'lin_Latn', 'lit_Latn', 'lmo_Latn', 'ltg_Latn', 'ltz_Latn', 'lua_Latn', 'lug_Latn', 'luo_Latn', 'lus_Latn', 'mag_Deva', 'mai_Deva', 'mal_Mlym', 'mar_Deva', 'min_Latn', 'mkd_Cyrl', 'plt_Latn', 'mlt_Latn', 'mni_Beng', 'khk_Cyrl', 'mos_Latn', 'mri_Latn', 'zsm_Latn', 'mya_Mymr', 'nld_Latn', 'nno_Latn', 'nob_Latn', 'npi_Deva', 'nso_Latn', 'nus_Latn', 'nya_Latn', 'oci_Latn', 'gaz_Latn', 'ory_Orya', 'pag_Latn', 'pan_Guru', 'pap_Latn', 'pol_Latn', 'por_Latn', 'prs_Arab', 'pbt_Arab', 'quy_Latn', 'ron_Latn', 'run_Latn', 'rus_Cyrl', 'sag_Latn', 'san_Deva', 'sat_Beng', 'scn_Latn', 'shn_Mymr', 'sin_Sinh', 'slk_Latn', 'slv_Latn', 'smo_Latn', 'sna_Latn', 'snd_Arab', 'som_Latn', 'sot_Latn', 'spa_Latn', 'als_Latn', 'srd_Latn', 'srp_Cyrl', 'ssw_Latn', 'sun_Latn', 'swe_Latn', 'swh_Latn', 'szl_Latn', 'tam_Taml', 'tat_Cyrl', 'tel_Telu', 'tgk_Cyrl', 'tgl_Latn', 'tha_Thai', 'tir_Ethi', 'taq_Latn', 'taq_Tfng', 'tpi_Latn', 'tsn_Latn', 'tso_Latn', 'tuk_Latn', 'tum_Latn', 'tur_Latn', 'twi_Latn', 'tzm_Tfng', 'uig_Arab', 'ukr_Cyrl', 'umb_Latn', 'urd_Arab', 'uzn_Latn', 'vec_Latn', 'vie_Latn', 'war_Latn', 'wol_Latn', 'xho_Latn', 'ydd_Hebr', 'yor_Latn', 'yue_Hant', 'zho_Hans', 'zho_Hant', 'zul_Latn']  # fmt: skip
 
 
-@requires(backends=("sentencepiece",))
-class NllbTokenizer(PreTrainedTokenizer):
+class NllbTokenizer(TokenizersBackend):
     """
-    Construct an NLLB tokenizer.
+    Construct an NLLB tokenizer (backed by HuggingFace's *tokenizers* library). Based on
+    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
 
-    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
-    [SentencePiece](https://github.com/google/sentencepiece).
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     The tokenization method is `  ` for source language documents, and `
      ` for target language documents.
@@ -59,62 +57,39 @@ class NllbTokenizer(PreTrainedTokenizer):
     ```
 
     Args:
-        vocab_file (`str`):
+        vocab_file (`str`, *optional*):
             Path to the vocabulary file.
         bos_token (`str`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
+            The beginning of sequence token that was used during pretraining.
         eos_token (`str`, *optional*, defaults to `""`):
             The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
         sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
+            The separator token.
         cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+            The classifier token.
         unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
+            The unknown token.
         pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
+            The token used for padding.
         mask_token (`str`, *optional*, defaults to `""`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        tokenizer_file (`str`, *optional*):
-            The path to a tokenizer file to use instead of the vocab file.
+            The token used for masking values.
         src_lang (`str`, *optional*):
             The language to use as source language for translation.
         tgt_lang (`str`, *optional*):
             The language to use as target language for translation.
-        sp_model_kwargs (`dict[str, str]`):
-            Additional keyword arguments to pass to the model initialization.
+        legacy_behaviour (`bool`, *optional*, defaults to `False`):
+            Whether to use legacy behaviour (suffix pattern) or new behaviour (prefix pattern).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
 
     prefix_tokens: list[int] = []
     suffix_tokens: list[int] = []
 
     def __init__(
         self,
-        vocab_file,
         bos_token="",
         eos_token="",
         sep_token="",
@@ -122,87 +97,103 @@ def __init__(
         unk_token="",
         pad_token="",
         mask_token="",
-        tokenizer_file=None,
         src_lang=None,
         tgt_lang=None,
-        sp_model_kwargs: Optional[dict[str, Any]] = None,
         additional_special_tokens=None,
         legacy_behaviour=False,
+        vocab=None,
+        merges=None,
+        vocab_file=None,
         **kwargs,
     ):
         if additional_special_tokens is None:
-            additional_special_tokens = FAIRSEQ_LANGUAGE_CODES
-        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
-        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token
-        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
-        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
-        # Mask token behave like a normal word, i.e. include the space before it
+            additional_special_tokens = kwargs.get("extra_special_tokens", FAIRSEQ_LANGUAGE_CODES)
+
+        self.vocab_file = vocab_file
+
         mask_token = (
             AddedToken(mask_token, normalized=True, lstrip=True, special=True)
             if isinstance(mask_token, str)
             else mask_token
         )
-
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
         self.legacy_behaviour = legacy_behaviour
 
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(str(vocab_file))
-        self.vocab_file = vocab_file
-        # Original fairseq vocab and spm vocab must be "aligned":
-        # Vocab    |    0    |    1    |   2    |    3    |  4   |  5   |  6   |   7  |   8  |  9
-        # -------- | ------- | ------- | ------ | ------- | ---- | ---- | ---- | ---- | ---- | ----
-        # fairseq  | ''   | '' | '' | '' | 'an' | '▁n' | '▁m' | '▁t' | '▁k' | '▁a'
-        # spm      | '' | ''   | '' | 'an'    | '▁n' | '▁m' | '▁t' | '▁k' | '▁a' | '▁s'
-
-        # unk token needs to be in the vocab with correct index
-        self._added_tokens_decoder = {0: bos_token, 1: pad_token, 2: eos_token, 3: unk_token}
-        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
-        self.fairseq_offset = 1
-        self.sp_model_size = len(self.sp_model)
+        if vocab is not None:
+            if isinstance(vocab, list):
+                self._vocab = {token: idx for idx, (token, _score) in enumerate(vocab)}
+            else:
+                self._vocab = vocab
+        else:
+            self._vocab = {
+                str(bos_token): 0,
+                str(pad_token): 1,
+                str(eos_token): 2,
+                str(unk_token): 3,
+            }
+
+        if merges is None:
+            self._merges = []
+        else:
+            self._merges = merges
+
+        self._tokenizer = Tokenizer(
+            BPE(
+                vocab=self._vocab,
+                merges=self._merges,
+                dropout=None,
+                unk_token=str(unk_token),
+                fuse_unk=True,
+                byte_fallback=False,
+            )
+        )
+
+        self._tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.Replace(Regex(r"[\n\r\t]"), " "),
+                normalizers.NFKC(),
+                normalizers.Replace(Regex(r" {2,}"), " "),
+            ]
+        )
+
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True)
+        self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)
+
+        tokenizer_object = self._tokenizer
+
+        # Remove extra_special_tokens from kwargs if present to avoid conflict
+        kwargs.pop("extra_special_tokens", None)
 
         super().__init__(
+            tokenizer_object=tokenizer_object,
             bos_token=bos_token,
             eos_token=eos_token,
-            unk_token=unk_token,
             sep_token=sep_token,
             cls_token=cls_token,
+            unk_token=unk_token,
             pad_token=pad_token,
-            mask_token=mask_token,
-            tokenizer_file=tokenizer_file,
             src_lang=src_lang,
             tgt_lang=tgt_lang,
-            additional_special_tokens=additional_special_tokens,
-            sp_model_kwargs=self.sp_model_kwargs,
+            mask_token=mask_token,
+            extra_special_tokens=additional_special_tokens,
             legacy_behaviour=legacy_behaviour,
             **kwargs,
         )
 
+        # Build fairseq mappings for backward compatibility
+        self.fairseq_offset = 1
+        self.fairseq_tokens_to_ids = {
+            "": 0,
+            "": 1,
+            "": 2,
+            "": 3,
+        }
+        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+
         self._src_lang = src_lang if src_lang is not None else "eng_Latn"
-        self.cur_lang_code_id = self.convert_tokens_to_ids(self._src_lang)
+        self.cur_lang_code = self.convert_tokens_to_ids(self._src_lang)
         self.tgt_lang = tgt_lang
         self.set_src_lang_special_tokens(self._src_lang)
 
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__ = d
-
-        # for backward compatibility
-        if not hasattr(self, "sp_model_kwargs"):
-            self.sp_model_kwargs = {}
-
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
-
-    @property
-    def vocab_size(self):
-        return len(self.sp_model) + self.fairseq_offset
-
     @property
     def src_lang(self) -> str:
         return self._src_lang
@@ -212,88 +203,6 @@ def src_lang(self, new_src_lang: str) -> None:
         self._src_lang = new_src_lang
         self.set_src_lang_special_tokens(self._src_lang)
 
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        prefix_ones = [1] * len(self.prefix_tokens)
-        suffix_ones = [1] * len(self.suffix_tokens)
-        if token_ids_1 is None:
-            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
-        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An NLLB sequence has the following format, where `X` represents the sequence:
-
-        - `input_ids` (for encoder) `X [eos, src_lang_code]`
-        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
-
-        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
-        separator.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
-        # We don't expect to process pairs, but leave the pair logic for API consistency
-        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. nllb does not
-        make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-
-        """
-
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
     def _build_translation_inputs(
         self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
     ):
@@ -306,62 +215,66 @@ def _build_translation_inputs(
         inputs["forced_bos_token_id"] = tgt_lang_id
         return inputs
 
-    def get_vocab(self):
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def _tokenize(self, text: str) -> list[str]:
-        return self.sp_model.encode(text, out_type=str)
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        spm_id = self.sp_model.PieceToId(token)
-        # Need to return unknown token if the SP model returned 0
-        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.sp_model.IdToPiece(index - self.fairseq_offset)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (strings for sub-words) in a single string."""
-        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
-        return out_string
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (out_vocab_file,)
-
     def prepare_seq2seq_batch(
         self,
         src_texts: list[str],
         src_lang: str = "eng_Latn",
         tgt_texts: Optional[list[str]] = None,
         tgt_lang: str = "fra_Latn",
+        max_length: Optional[int] = None,
+        max_target_length: Optional[int] = None,
+        padding: str = "longest",
+        return_tensors: Optional[str] = None,
+        truncation: bool = True,
         **kwargs,
     ) -> BatchEncoding:
         self.src_lang = src_lang
         self.tgt_lang = tgt_lang
-        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
+
+        if max_length is None:
+            max_length = self.model_max_length
+
+        model_inputs = self(
+            src_texts,
+            add_special_tokens=True,
+            return_tensors=return_tensors,
+            max_length=max_length,
+            padding=padding,
+            truncation=truncation,
+            **kwargs,
+        )
+
+        if tgt_texts is None:
+            return model_inputs
+
+        # Process tgt_texts
+        if max_target_length is None:
+            max_target_length = max_length
+
+        # Switch to target mode to set the right special tokens
+        self._switch_to_target_mode()
+        labels = self(
+            tgt_texts,
+            add_special_tokens=True,
+            return_tensors=return_tensors,
+            padding=padding,
+            max_length=max_target_length,
+            truncation=truncation,
+            **kwargs,
+        )
+        model_inputs["labels"] = labels["input_ids"]
+
+        # Switch back to input mode
+        self._switch_to_input_mode()
+
+        return model_inputs
 
     def _switch_to_input_mode(self):
         return self.set_src_lang_special_tokens(self.src_lang)
 
     def _switch_to_target_mode(self):
+        if self.tgt_lang is None:
+            self.tgt_lang = self._src_lang
         return self.set_tgt_lang_special_tokens(self.tgt_lang)
 
     def set_src_lang_special_tokens(self, src_lang) -> None:
@@ -370,6 +283,7 @@ def set_src_lang_special_tokens(self, src_lang) -> None:
         - In default mode: Prefix=[src_lang_code], suffix = [eos]
         """
         self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
+
         if self.legacy_behaviour:
             self.prefix_tokens = []
             self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
@@ -377,6 +291,15 @@ def set_src_lang_special_tokens(self, src_lang) -> None:
             self.prefix_tokens = [self.cur_lang_code]
             self.suffix_tokens = [self.eos_token_id]
 
+        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
+        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
+            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
+            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
+        )
+
     def set_tgt_lang_special_tokens(self, lang: str) -> None:
         """Reset the special tokens to the target lang setting.
         - In legacy mode: No prefix and suffix=[eos, tgt_lang_code].
@@ -390,5 +313,14 @@ def set_tgt_lang_special_tokens(self, lang: str) -> None:
             self.prefix_tokens = [self.cur_lang_code]
             self.suffix_tokens = [self.eos_token_id]
 
+        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
+        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
+            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
+            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
+        )
+
 
 __all__ = ["NllbTokenizer"]
diff --git a/src/transformers/models/nllb/tokenization_nllb_fast.py b/src/transformers/models/nllb/tokenization_nllb_fast.py
deleted file mode 100644
index 5300b3942b5d..000000000000
--- a/src/transformers/models/nllb/tokenization_nllb_fast.py
+++ /dev/null
@@ -1,327 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-from shutil import copyfile
-from typing import Optional
-
-from tokenizers import processors
-
-from ...tokenization_utils import AddedToken, BatchEncoding
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import is_sentencepiece_available, logging
-
-
-if is_sentencepiece_available():
-    from .tokenization_nllb import NllbTokenizer
-else:
-    NllbTokenizer = None
-
-
-logger = logging.get_logger(__name__)
-
-
-VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
-
-
-FAIRSEQ_LANGUAGE_CODES = ['ace_Arab', 'ace_Latn', 'acm_Arab', 'acq_Arab', 'aeb_Arab', 'afr_Latn', 'ajp_Arab', 'aka_Latn', 'amh_Ethi', 'apc_Arab', 'arb_Arab', 'ars_Arab', 'ary_Arab', 'arz_Arab', 'asm_Beng', 'ast_Latn', 'awa_Deva', 'ayr_Latn', 'azb_Arab', 'azj_Latn', 'bak_Cyrl', 'bam_Latn', 'ban_Latn', 'bel_Cyrl', 'bem_Latn', 'ben_Beng', 'bho_Deva', 'bjn_Arab', 'bjn_Latn', 'bod_Tibt', 'bos_Latn', 'bug_Latn', 'bul_Cyrl', 'cat_Latn', 'ceb_Latn', 'ces_Latn', 'cjk_Latn', 'ckb_Arab', 'crh_Latn', 'cym_Latn', 'dan_Latn', 'deu_Latn', 'dik_Latn', 'dyu_Latn', 'dzo_Tibt', 'ell_Grek', 'eng_Latn', 'epo_Latn', 'est_Latn', 'eus_Latn', 'ewe_Latn', 'fao_Latn', 'pes_Arab', 'fij_Latn', 'fin_Latn', 'fon_Latn', 'fra_Latn', 'fur_Latn', 'fuv_Latn', 'gla_Latn', 'gle_Latn', 'glg_Latn', 'grn_Latn', 'guj_Gujr', 'hat_Latn', 'hau_Latn', 'heb_Hebr', 'hin_Deva', 'hne_Deva', 'hrv_Latn', 'hun_Latn', 'hye_Armn', 'ibo_Latn', 'ilo_Latn', 'ind_Latn', 'isl_Latn', 'ita_Latn', 'jav_Latn', 'jpn_Jpan', 'kab_Latn', 'kac_Latn', 'kam_Latn', 'kan_Knda', 'kas_Arab', 'kas_Deva', 'kat_Geor', 'knc_Arab', 'knc_Latn', 'kaz_Cyrl', 'kbp_Latn', 'kea_Latn', 'khm_Khmr', 'kik_Latn', 'kin_Latn', 'kir_Cyrl', 'kmb_Latn', 'kon_Latn', 'kor_Hang', 'kmr_Latn', 'lao_Laoo', 'lvs_Latn', 'lij_Latn', 'lim_Latn', 'lin_Latn', 'lit_Latn', 'lmo_Latn', 'ltg_Latn', 'ltz_Latn', 'lua_Latn', 'lug_Latn', 'luo_Latn', 'lus_Latn', 'mag_Deva', 'mai_Deva', 'mal_Mlym', 'mar_Deva', 'min_Latn', 'mkd_Cyrl', 'plt_Latn', 'mlt_Latn', 'mni_Beng', 'khk_Cyrl', 'mos_Latn', 'mri_Latn', 'zsm_Latn', 'mya_Mymr', 'nld_Latn', 'nno_Latn', 'nob_Latn', 'npi_Deva', 'nso_Latn', 'nus_Latn', 'nya_Latn', 'oci_Latn', 'gaz_Latn', 'ory_Orya', 'pag_Latn', 'pan_Guru', 'pap_Latn', 'pol_Latn', 'por_Latn', 'prs_Arab', 'pbt_Arab', 'quy_Latn', 'ron_Latn', 'run_Latn', 'rus_Cyrl', 'sag_Latn', 'san_Deva', 'sat_Beng', 'scn_Latn', 'shn_Mymr', 'sin_Sinh', 'slk_Latn', 'slv_Latn', 'smo_Latn', 'sna_Latn', 'snd_Arab', 'som_Latn', 'sot_Latn', 'spa_Latn', 'als_Latn', 'srd_Latn', 'srp_Cyrl', 'ssw_Latn', 'sun_Latn', 'swe_Latn', 'swh_Latn', 'szl_Latn', 'tam_Taml', 'tat_Cyrl', 'tel_Telu', 'tgk_Cyrl', 'tgl_Latn', 'tha_Thai', 'tir_Ethi', 'taq_Latn', 'taq_Tfng', 'tpi_Latn', 'tsn_Latn', 'tso_Latn', 'tuk_Latn', 'tum_Latn', 'tur_Latn', 'twi_Latn', 'tzm_Tfng', 'uig_Arab', 'ukr_Cyrl', 'umb_Latn', 'urd_Arab', 'uzn_Latn', 'vec_Latn', 'vie_Latn', 'war_Latn', 'wol_Latn', 'xho_Latn', 'ydd_Hebr', 'yor_Latn', 'yue_Hant', 'zho_Hans', 'zho_Hant', 'zul_Latn']  # fmt: skip
-
-
-class NllbTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" NLLB tokenizer (backed by HuggingFace's *tokenizers* library). Based on
-    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    The tokenization method is `  ` for source language documents, and `
-     ` for target language documents.
-
-    Examples:
-
-    ```python
-    >>> from transformers import NllbTokenizerFast
-
-    >>> tokenizer = NllbTokenizerFast.from_pretrained(
-    ...     "facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="fra_Latn"
-    ... )
-    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
-    >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie."
-    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt")
-    ```
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        bos_token (`str`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        mask_token (`str`, *optional*, defaults to `""`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        tokenizer_file (`str`, *optional*):
-            The path to a tokenizer file to use instead of the vocab file.
-        src_lang (`str`, *optional*):
-            The language to use as source language for translation.
-        tgt_lang (`str`, *optional*):
-            The language to use as target language for translation.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = NllbTokenizer
-
-    prefix_tokens: list[int] = []
-    suffix_tokens: list[int] = []
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        bos_token="",
-        eos_token="",
-        sep_token="",
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        mask_token="",
-        src_lang=None,
-        tgt_lang=None,
-        additional_special_tokens=None,
-        legacy_behaviour=False,
-        **kwargs,
-    ):
-        if additional_special_tokens is None:
-            additional_special_tokens = FAIRSEQ_LANGUAGE_CODES
-
-        self.vocab_file = vocab_file
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = (
-            AddedToken(mask_token, normalized=True, lstrip=True, special=True)
-            if isinstance(mask_token, str)
-            else mask_token
-        )
-        self.legacy_behaviour = legacy_behaviour
-        super().__init__(
-            vocab_file=vocab_file,
-            tokenizer_file=tokenizer_file,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            src_lang=src_lang,
-            tgt_lang=tgt_lang,
-            mask_token=mask_token,
-            additional_special_tokens=additional_special_tokens,
-            legacy_behaviour=legacy_behaviour,
-            **kwargs,
-        )
-
-        self._src_lang = src_lang if src_lang is not None else "eng_Latn"
-        self.cur_lang_code = self.convert_tokens_to_ids(self._src_lang)
-        self.tgt_lang = tgt_lang
-        self.set_src_lang_special_tokens(self._src_lang)
-
-    @property
-    def src_lang(self) -> str:
-        return self._src_lang
-
-    @src_lang.setter
-    def src_lang(self, new_src_lang: str) -> None:
-        self._src_lang = new_src_lang
-        self.set_src_lang_special_tokens(self._src_lang)
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. The special tokens depend on calling set_lang.
-
-        An NLLB sequence has the following format, where `X` represents the sequence:
-
-        - `input_ids` (for encoder) `X [eos, src_lang_code]`
-        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
-
-        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
-        separator.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
-        # We don't expect to process pairs, but leave the pair logic for API consistency
-        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. nllb does not
-        make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-
-        """
-
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-    def _build_translation_inputs(
-        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
-    ):
-        """Used by translation pipeline, to prepare inputs for the generate function"""
-        if src_lang is None or tgt_lang is None:
-            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
-        self.src_lang = src_lang
-        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
-        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
-        inputs["forced_bos_token_id"] = tgt_lang_id
-        return inputs
-
-    def prepare_seq2seq_batch(
-        self,
-        src_texts: list[str],
-        src_lang: str = "eng_Latn",
-        tgt_texts: Optional[list[str]] = None,
-        tgt_lang: str = "fra_Latn",
-        **kwargs,
-    ) -> BatchEncoding:
-        self.src_lang = src_lang
-        self.tgt_lang = tgt_lang
-        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
-
-    def _switch_to_input_mode(self):
-        return self.set_src_lang_special_tokens(self.src_lang)
-
-    def _switch_to_target_mode(self):
-        return self.set_tgt_lang_special_tokens(self.tgt_lang)
-
-    def set_src_lang_special_tokens(self, src_lang) -> None:
-        """Reset the special tokens to the source lang setting.
-        - In legacy mode: No prefix and suffix=[eos, src_lang_code].
-        - In default mode: Prefix=[src_lang_code], suffix = [eos]
-        """
-        self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
-
-        if self.legacy_behaviour:
-            self.prefix_tokens = []
-            self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
-        else:
-            self.prefix_tokens = [self.cur_lang_code]
-            self.suffix_tokens = [self.eos_token_id]
-
-        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
-        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
-
-        self._tokenizer.post_processor = processors.TemplateProcessing(
-            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
-            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
-            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
-        )
-
-    def set_tgt_lang_special_tokens(self, lang: str) -> None:
-        """Reset the special tokens to the target lang setting.
-        - In legacy mode: No prefix and suffix=[eos, tgt_lang_code].
-        - In default mode: Prefix=[tgt_lang_code], suffix = [eos]
-        """
-        self.cur_lang_code = self.convert_tokens_to_ids(lang)
-        if self.legacy_behaviour:
-            self.prefix_tokens = []
-            self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
-        else:
-            self.prefix_tokens = [self.cur_lang_code]
-            self.suffix_tokens = [self.eos_token_id]
-
-        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
-        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
-
-        self._tokenizer.post_processor = processors.TemplateProcessing(
-            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
-            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
-            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
-        )
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not self.can_save_slow_tokenizer:
-            raise ValueError(
-                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
-                "tokenizer."
-            )
-
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
-
-
-__all__ = ["NllbTokenizerFast"]
diff --git a/src/transformers/models/nougat/__init__.py b/src/transformers/models/nougat/__init__.py
index 6cd3208bfa20..0d44fcdb70cc 100644
--- a/src/transformers/models/nougat/__init__.py
+++ b/src/transformers/models/nougat/__init__.py
@@ -21,7 +21,7 @@
     from .image_processing_nougat import *
     from .image_processing_nougat_fast import *
     from .processing_nougat import *
-    from .tokenization_nougat_fast import *
+    from .tokenization_nougat import *
 else:
     import sys
 
diff --git a/src/transformers/models/nougat/tokenization_nougat_fast.py b/src/transformers/models/nougat/tokenization_nougat.py
similarity index 80%
rename from src/transformers/models/nougat/tokenization_nougat_fast.py
rename to src/transformers/models/nougat/tokenization_nougat.py
index 6ec22509b5d9..9bc66c1d01e2 100644
--- a/src/transformers/models/nougat/tokenization_nougat_fast.py
+++ b/src/transformers/models/nougat/tokenization_nougat.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Fast tokenizer class for Nougat.
+Tokenizer class for Nougat.
 """
 
 import re
@@ -22,9 +22,10 @@
 from typing import Optional, Union
 
 import numpy as np
+from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
+from tokenizers.models import BPE
 
-from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
-
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import is_levenshtein_available, is_nltk_available, logging, requires_backends
 
 
@@ -37,8 +38,7 @@
 
 logger = logging.get_logger(__name__)
 
-
-VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
 
 
 def markdown_compatible(text: str) -> str:
@@ -346,18 +346,19 @@ def remove_slice_from_lines(lines, clean_text, slice) -> str:
     return to_delete.strip()
 
 
-class NougatTokenizerFast(PreTrainedTokenizerFast):
+class NougatTokenizer(TokenizersBackend):
     """
-    Fast tokenizer for Nougat (backed by HuggingFace tokenizers library).
+    Tokenizer for Nougat (backed by HuggingFace tokenizers library).
 
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
     refer to this superclass for more information regarding those methods. This class mainly adds Nougat-specific
     methods for postprocessing the generated text.
 
     Args:
         vocab_file (`str`, *optional*):
-            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
-            contains the vocabulary necessary to instantiate a tokenizer.
+            Path to the vocabulary file.
+        merges_file (`str`, *optional*):
+            Path to the merges file.
         tokenizer_file (`str`, *optional*):
             [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
             contains everything needed to load the tokenizer.
@@ -378,6 +379,12 @@ class NougatTokenizerFast(PreTrainedTokenizerFast):
 
         pad_token (`str`, *optional*, defaults to `""`):
             The token used for padding, for example when batching sequences of different lengths.
+
+        vocab (`dict`, *optional*):
+            Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
+
+        merges (`list`, *optional*):
+            Custom merges list. If not provided, merges are loaded from merges_file.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -386,26 +393,127 @@ class NougatTokenizerFast(PreTrainedTokenizerFast):
 
     def __init__(
         self,
-        vocab_file=None,
-        tokenizer_file=None,
-        clean_up_tokenization_spaces=False,
-        unk_token="",
-        bos_token="",
-        eos_token="",
-        pad_token="",
+        errors: str = "replace",
+        unk_token: str = "",
+        bos_token: str = "",
+        eos_token: str = "",
+        pad_token: str = "",
+        vocab: Optional[dict] = None,
+        merges: Optional[list] = None,
         **kwargs,
     ):
+        if vocab is not None:
+            self._vocab = (
+                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
+            )
+        else:
+            self._vocab = {
+                str(bos_token): 0,
+                str(pad_token): 1,
+                str(eos_token): 2,
+                str(unk_token): 3,
+                "[START_REF]": 4,
+            }
+
+        if merges is not None:
+            self._merges = merges
+        else:
+            self._merges = []
+
+        self._tokenizer = Tokenizer(
+            BPE(
+                vocab=self._vocab,
+                merges=self._merges,
+                dropout=None,
+                continuing_subword_prefix="",
+                end_of_word_suffix="",
+                fuse_unk=False,
+            )
+        )
+
+        self._tokenizer.normalizer = normalizers.NFKC()
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.Split(pattern="SPL1T-TH1S-Pl3A5E", behavior="removed", invert=False),
+                pre_tokenizers.Digits(individual_digits=True),
+                pre_tokenizers.Split(
+                    pattern=r"[\(\)\[\]\{\}]|([!\"#\$%\&'\*\+,\-\./:;<=>\?\\\^_`\|\~])\1*",
+                    behavior="isolated",
+                    invert=False,
+                ),
+                pre_tokenizers.Split(pattern="\n", behavior="isolated", invert=False),
+                pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True),
+            ]
+        )
+        self._tokenizer.decoder = decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)
+
+        # Set up post processor with bos and eos tokens
+        bos_token_id = self._vocab.get(str(bos_token), 0)
+        eos_token_id = self._vocab.get(str(eos_token), 2)
+        pad_token_id = self._vocab.get(str(pad_token), 1)
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"{bos_token}:0 $A:0 {eos_token}:0",
+            pair="$A:0 $B:1",
+            special_tokens=[
+                (str(eos_token), eos_token_id),
+                (str(bos_token), bos_token_id),
+            ],
+        )
+
+        # Enable truncation and padding
+        self._tokenizer.enable_truncation(max_length=4096)
+        self._tokenizer.enable_padding(length=4096, pad_id=pad_token_id, pad_token=str(pad_token))
+
+        tokenizer_object = self._tokenizer
+
         super().__init__(
-            vocab_file=vocab_file,
-            tokenizer_file=tokenizer_file,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            tokenizer_object=tokenizer_object,
+            errors=errors,
             unk_token=unk_token,
             bos_token=bos_token,
             eos_token=eos_token,
             pad_token=pad_token,
             **kwargs,
         )
-        self.vocab_file = vocab_file
+
+    def _post_init(self):
+        """Post-initialization to ensure tokenizer settings are applied correctly."""
+        # Re-apply settings to ensure they're correct after loading from pretrained
+        self._tokenizer.normalizer = normalizers.NFKC()
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.Split(pattern="SPL1T-TH1S-Pl3A5E", behavior="removed", invert=False),
+                pre_tokenizers.Digits(individual_digits=True),
+                pre_tokenizers.Split(
+                    pattern=r"[\(\)\[\]\{\}]|([!\"#\$%\&'\*\+,\-\./:;<=>\?\\\^_`\|\~])\1*",
+                    behavior="isolated",
+                    invert=False,
+                ),
+                pre_tokenizers.Split(pattern="\n", behavior="isolated", invert=False),
+                pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True),
+            ]
+        )
+        self._tokenizer.decoder = decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)
+
+        # Set up post processor with bos and eos tokens
+        bos_token_id = self.bos_token_id if self.bos_token_id is not None else 0
+        eos_token_id = self.eos_token_id if self.eos_token_id is not None else 2
+        pad_token_id = self.pad_token_id if self.pad_token_id is not None else 1
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"{self.bos_token}:0 $A:0 {self.eos_token}:0",
+            pair="$A:0 $B:1",
+            special_tokens=[
+                (str(self.eos_token), eos_token_id),
+                (str(self.bos_token), bos_token_id),
+            ],
+        )
+
+        # Enable truncation and padding
+        self._tokenizer.enable_truncation(max_length=4096)
+        self._tokenizer.enable_padding(length=4096, pad_id=pad_token_id, pad_token=str(self.pad_token))
+
+        # Call parent to handle AddedToken properties
+        super()._post_init()
 
     def remove_hallucinated_references(self, text: str) -> str:
         """
@@ -604,4 +712,4 @@ def post_process_generation(
             return self.post_process_single(generation, fix_markdown=fix_markdown)
 
 
-__all__ = ["NougatTokenizerFast"]
+__all__ = ["NougatTokenizer"]
diff --git a/src/transformers/models/openai/__init__.py b/src/transformers/models/openai/__init__.py
index 98a22135ea40..16248677beae 100644
--- a/src/transformers/models/openai/__init__.py
+++ b/src/transformers/models/openai/__init__.py
@@ -21,7 +21,6 @@
     from .configuration_openai import *
     from .modeling_openai import *
     from .tokenization_openai import *
-    from .tokenization_openai_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/openai/tokenization_openai.py b/src/transformers/models/openai/tokenization_openai.py
index 8a9184cc395c..3ad31681bd78 100644
--- a/src/transformers/models/openai/tokenization_openai.py
+++ b/src/transformers/models/openai/tokenization_openai.py
@@ -14,383 +14,130 @@
 # limitations under the License.
 """Tokenization classes for OpenAI GPT."""
 
-import json
-import os
-import re
-import unicodedata
-from typing import Optional
+from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers
+from tokenizers.models import BPE
 
-from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from ...convert_slow_tokenizer import generate_merges
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
 
 
 logger = logging.get_logger(__name__)
 
-VOCAB_FILES_NAMES = {
-    "vocab_file": "vocab.json",
-    "merges_file": "merges.txt",
-}
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
 
 
-# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
-def whitespace_tokenize(text):
-    """Runs basic whitespace cleaning and splitting on a piece of text."""
-    text = text.strip()
-    if not text:
-        return []
-    tokens = text.split()
-    return tokens
-
-
-# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
-class BasicTokenizer:
-    """
-    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
-
-    Args:
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original BERT).
-        do_split_on_punc (`bool`, *optional*, defaults to `True`):
-            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
-            the full context of the words, such as contractions.
-    """
-
-    def __init__(
-        self,
-        do_lower_case=True,
-        never_split=None,
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        do_split_on_punc=True,
-    ):
-        if never_split is None:
-            never_split = []
-        self.do_lower_case = do_lower_case
-        self.never_split = set(never_split)
-        self.tokenize_chinese_chars = tokenize_chinese_chars
-        self.strip_accents = strip_accents
-        self.do_split_on_punc = do_split_on_punc
-
-    def tokenize(self, text, never_split=None):
-        """
-        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
-
-        Args:
-            never_split (`List[str]`, *optional*)
-                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
-        """
-        # union() returns a new set by concatenating the two sets.
-        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
-        text = self._clean_text(text)
-
-        # This was added on November 1st, 2018 for the multilingual and Chinese
-        # models. This is also applied to the English models now, but it doesn't
-        # matter since the English models were not trained on any Chinese data
-        # and generally don't have any Chinese data in them (there are Chinese
-        # characters in the vocabulary because Wikipedia does have some Chinese
-        # words in the English Wikipedia.).
-        if self.tokenize_chinese_chars:
-            text = self._tokenize_chinese_chars(text)
-        # prevents treating the same character with different unicode codepoints as different characters
-        unicode_normalized_text = unicodedata.normalize("NFC", text)
-        orig_tokens = whitespace_tokenize(unicode_normalized_text)
-        split_tokens = []
-        for token in orig_tokens:
-            if token not in never_split:
-                if self.do_lower_case:
-                    token = token.lower()
-                    if self.strip_accents is not False:
-                        token = self._run_strip_accents(token)
-                elif self.strip_accents:
-                    token = self._run_strip_accents(token)
-            split_tokens.extend(self._run_split_on_punc(token, never_split))
-
-        output_tokens = whitespace_tokenize(" ".join(split_tokens))
-        return output_tokens
-
-    def _run_strip_accents(self, text):
-        """Strips accents from a piece of text."""
-        text = unicodedata.normalize("NFD", text)
-        output = []
-        for char in text:
-            cat = unicodedata.category(char)
-            if cat == "Mn":
-                continue
-            output.append(char)
-        return "".join(output)
-
-    def _run_split_on_punc(self, text, never_split=None):
-        """Splits punctuation on a piece of text."""
-        if not self.do_split_on_punc or (never_split is not None and text in never_split):
-            return [text]
-        chars = list(text)
-        i = 0
-        start_new_word = True
-        output = []
-        while i < len(chars):
-            char = chars[i]
-            if _is_punctuation(char):
-                output.append([char])
-                start_new_word = True
-            else:
-                if start_new_word:
-                    output.append([])
-                start_new_word = False
-                output[-1].append(char)
-            i += 1
-
-        return ["".join(x) for x in output]
-
-    def _tokenize_chinese_chars(self, text):
-        """Adds whitespace around any CJK character."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if self._is_chinese_char(cp):
-                output.append(" ")
-                output.append(char)
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-    def _is_chinese_char(self, cp):
-        """Checks whether CP is the codepoint of a CJK character."""
-        # This defines a "chinese character" as anything in the CJK Unicode block:
-        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-        #
-        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-        # despite its name. The modern Korean Hangul alphabet is a different block,
-        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-        # space-separated words, so they are not treated specially and handled
-        # like the all of the other languages.
-        if (
-            (cp >= 0x4E00 and cp <= 0x9FFF)
-            or (cp >= 0x3400 and cp <= 0x4DBF)
-            or (cp >= 0x20000 and cp <= 0x2A6DF)
-            or (cp >= 0x2A700 and cp <= 0x2B73F)
-            or (cp >= 0x2B740 and cp <= 0x2B81F)
-            or (cp >= 0x2B820 and cp <= 0x2CEAF)
-            or (cp >= 0xF900 and cp <= 0xFAFF)
-            or (cp >= 0x2F800 and cp <= 0x2FA1F)
-        ):
-            return True
-
-        return False
-
-    def _clean_text(self, text):
-        """Performs invalid character removal and whitespace cleanup on text."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if cp == 0 or cp == 0xFFFD or _is_control(char):
-                continue
-            if _is_whitespace(char):
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-
-def get_pairs(word):
-    """
-    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
-    strings)
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-
-def text_standardize(text):
+class OpenAIGPTTokenizer(TokenizersBackend):
     """
-    fixes some issues the spacy tokenizer had on books corpus also does some whitespace standardization
-    """
-    text = text.replace("—", "-")
-    text = text.replace("–", "-")
-    text = text.replace("―", "-")
-    text = text.replace("…", "...")
-    text = text.replace("´", "'")
-    text = re.sub(r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""", r" \1 ", text)
-    text = re.sub(r"\s*\n\s*", " \n ", text)
-    text = re.sub(r"[^\S\n]+", " ", text)
-    return text.strip()
-
-
-class OpenAIGPTTokenizer(PreTrainedTokenizer):
-    """
-    Construct a GPT Tokenizer. Based on Byte-Pair-Encoding with the following peculiarities:
+    Construct a GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
+    the following peculiarities:
 
-    - lowercases all inputs,
-    - uses `SpaCy` tokenizer and `ftfy` for pre-BPE tokenization if they are installed, fallback to BERT's
-      `BasicTokenizer` if not.
+    - lower case all inputs
+    - uses BERT's BasicTokenizer for pre-BPE tokenization
 
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
+        vocab_file (`str`, *optional*):
             Path to the vocabulary file.
-        merges_file (`str`):
+        merges_file (`str`, *optional*):
             Path to the merges file.
+        tokenizer_file (`str`, *optional*):
+            Path to a tokenizers JSON file containing the serialization of a tokenizer.
         unk_token (`str`, *optional*, defaults to `""`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
+        vocab (`dict`, *optional*):
+            Custom vocabulary dictionary. If not provided, a blank vocabulary is initialized.
+        merges (`list`, *optional*):
+            Custom merges list. If not provided, an empty list is used.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
 
-    def __init__(self, vocab_file, merges_file, unk_token="", **kwargs):
-        try:
-            import ftfy
-            from spacy.lang.en import English
-
-            _nlp = English()
-            self.nlp = _nlp.tokenizer
-            self.fix_text = ftfy.fix_text
-        except ImportError:
-            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
-            self.nlp = BasicTokenizer(do_lower_case=True)
-            self.fix_text = None
-
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            merges = merges_handle.read().split("\n")[1:-1]
-        merges = [tuple(merge.split()) for merge in merges]
-        self.bpe_ranks = dict(zip(merges, range(len(merges))))
-        self.cache = {}
-
-        super().__init__(unk_token=unk_token, **kwargs)
-
-    @property
-    def do_lower_case(self):
-        return True
-
-    @property
-    def vocab_size(self):
-        return len(self.encoder)
-
-    def get_vocab(self):
-        return dict(self.encoder, **self.added_tokens_encoder)
-
-    def bpe(self, token):
-        word = tuple(token[:-1]) + (token[-1] + "",)
-        if token in self.cache:
-            return self.cache[token]
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token + ""
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                except ValueError:
-                    new_word.extend(word[i:])
-                    break
-                else:
-                    new_word.extend(word[i:j])
-                    i = j
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        if word == "\n  ":
-            word = "\n"
-        self.cache[token] = word
-        return word
+    def __init__(
+        self,
+        unk_token="",
+        vocab=None,
+        merges=None,
+        vocab_file=None,
+        merges_file=None,
+        **kwargs,
+    ):
+        # Initialize vocabulary
+        if vocab is not None:
+            self._vocab = (
+                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
+            )
+        else:
+            # Initialize minimal vocabulary with unk token
+            self._vocab = {str(unk_token): 0}
 
-    def _tokenize(self, text):
-        """Tokenize a string."""
-        split_tokens = []
-        if self.fix_text is None:
-            # Using BERT's BasicTokenizer
-            text = self.nlp.tokenize(text)
-            for token in text:
-                split_tokens.extend(list(self.bpe(token).split(" ")))
+        # Initialize merges
+        if merges is not None:
+            self._merges = merges if merges is not None else generate_merges(self._vocab)
         else:
-            # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
-            text = self.nlp(text_standardize(self.fix_text(text)))
-            for token in text:
-                split_tokens.extend(list(self.bpe(token.text.lower()).split(" ")))
-        return split_tokens
+            self._merges = []
+
+        # Create BPE tokenizer
+        self._tokenizer = Tokenizer(
+            BPE(
+                vocab=self._vocab,
+                merges=self._merges,
+                dropout=None,
+                continuing_subword_prefix="",
+                end_of_word_suffix="",
+                fuse_unk=False,
+                unk_token=str(unk_token),
+            )
+        )
 
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
+        # Set normalizer and pre-tokenizer to mimic OpenAI GPT behavior
+        # OpenAI GPT uses BERT BasicTokenizer with lower_case=True
+        self._tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.NFD(),
+                normalizers.Lowercase(),
+                normalizers.StripAccents(),
+            ]
+        )
 
-    def _convert_id_to_token(self, index):
-        """Converts an id in a token (BPE) using the vocab."""
-        return self.decoder.get(index, self.unk_token)
+        self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
+        self._tokenizer.decoder = decoders.BPEDecoder(suffix="")
 
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        out_string = "".join(tokens).replace("", " ").strip()
-        return out_string
+        tokenizer_object = self._tokenizer
 
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        super().__init__(
+            tokenizer_object=tokenizer_object,
+            unk_token=unk_token,
+            **kwargs,
         )
-        merge_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+
+        self.vocab_file = vocab_file
+        self.merges_file = merges_file
+
+    def _post_init(self):
+        """Post-initialization to ensure tokenizer settings are applied correctly."""
+        # Re-apply settings to ensure they're correct after loading from pretrained
+        self._tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.NFD(),
+                normalizers.Lowercase(),
+                normalizers.StripAccents(),
+            ]
         )
 
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+        self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
+        self._tokenizer.decoder = decoders.BPEDecoder(suffix="")
 
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
+        # Call parent to handle AddedToken properties
+        super()._post_init()
 
-        return vocab_file, merge_file
+    @property
+    def do_lower_case(self):
+        return True
 
 
 __all__ = ["OpenAIGPTTokenizer"]
diff --git a/src/transformers/models/openai/tokenization_openai_fast.py b/src/transformers/models/openai/tokenization_openai_fast.py
deleted file mode 100644
index 83edf5eafa94..000000000000
--- a/src/transformers/models/openai/tokenization_openai_fast.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Fast Tokenization classes for OpenAI GPT."""
-
-from typing import Optional
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .tokenization_openai import OpenAIGPTTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
-
-
-class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
-    the following peculiarities:
-
-    - lower case all inputs
-    - uses BERT's BasicTokenizer for pre-BPE tokenization
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        merges_file (`str`):
-            Path to the merges file.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = OpenAIGPTTokenizer
-
-    def __init__(self, vocab_file=None, merges_file=None, tokenizer_file=None, unk_token="", **kwargs):
-        super().__init__(vocab_file, merges_file, tokenizer_file=tokenizer_file, unk_token=unk_token, **kwargs)
-
-    @property
-    def do_lower_case(self):
-        return True
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-
-__all__ = ["OpenAIGPTTokenizerFast"]
diff --git a/src/transformers/models/parakeet/tokenization_parakeet_fast.py b/src/transformers/models/parakeet/tokenization_parakeet_fast.py
index d53eb9c68ad4..97eb286e1177 100644
--- a/src/transformers/models/parakeet/tokenization_parakeet_fast.py
+++ b/src/transformers/models/parakeet/tokenization_parakeet_fast.py
@@ -16,7 +16,7 @@
 import itertools
 from typing import Optional, Union
 
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...tokenization_utils_tokenizers import PreTrainedTokenizerFast
 
 
 class ParakeetTokenizerFast(PreTrainedTokenizerFast):
diff --git a/src/transformers/models/pegasus/__init__.py b/src/transformers/models/pegasus/__init__.py
index 4070d841ea3d..288776550986 100644
--- a/src/transformers/models/pegasus/__init__.py
+++ b/src/transformers/models/pegasus/__init__.py
@@ -21,7 +21,6 @@
     from .configuration_pegasus import *
     from .modeling_pegasus import *
     from .tokenization_pegasus import *
-    from .tokenization_pegasus_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/pegasus/tokenization_pegasus.py b/src/transformers/models/pegasus/tokenization_pegasus.py
index b8a4a1c737d1..cd6c7e6f5173 100644
--- a/src/transformers/models/pegasus/tokenization_pegasus.py
+++ b/src/transformers/models/pegasus/tokenization_pegasus.py
@@ -12,38 +12,30 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
-from shutil import copyfile
-from typing import Any, Optional
+"""Tokenization class for model PEGASUS."""
 
-import sentencepiece as spm
+from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
+from tokenizers.models import Unigram
 
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
-from ...utils.import_utils import requires
-
-
-SPIECE_UNDERLINE = "▁"
-
-VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
 
 
 logger = logging.get_logger(__name__)
 
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
 
-# TODO ArthurZ refactor this to only use the added_tokens_encoder
 
-
-@requires(backends=("sentencepiece",))
-class PegasusTokenizer(PreTrainedTokenizer):
+class PegasusTokenizer(TokenizersBackend):
     r"""
-    Construct a PEGASUS tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
+    Construct a PEGASUS tokenizer (backed by HuggingFace's *tokenizers* library). Based on
+    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
 
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
+        vocab_file (`str`, *optional*):
             [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
         pad_token (`str`, *optional*, defaults to `""`):
@@ -76,21 +68,10 @@ class PegasusTokenizer(PreTrainedTokenizer):
              are used as additional special tokens corresponding to the [original PEGASUS
             tokenizer](https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66)
             that uses the tokens 2 - 104 only for pretraining
-        sp_model_kwargs (`dict`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
-            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
-            to set:
-
-            - `enable_sampling`: Enable subword regularization.
-            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
-
-              - `nbest_size = {0,1}`: No sampling is performed.
-              - `nbest_size > 1`: samples from the nbest_size results.
-              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
-                using forward-filtering-and-backward-sampling algorithm.
-
-            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
-              BPE-dropout.
+        offset (`int`, *optional*, defaults to 103):
+            Offset for additional special tokens.
+        vocab (`dict`, *optional*):
+            Custom vocabulary dictionary. If not provided, a blank vocabulary is initialized.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -98,195 +79,79 @@ class PegasusTokenizer(PreTrainedTokenizer):
 
     def __init__(
         self,
-        vocab_file,
         pad_token="",
         eos_token="",
         unk_token="",
         mask_token="",
         mask_token_sent="",
         additional_special_tokens=None,
-        offset=103,  # entries 2 - 104 are only used for pretraining
-        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        offset=103,
+        vocab=None,
+        vocab_file=None,
         **kwargs,
-    ) -> None:
+    ):
         self.offset = offset
-        if additional_special_tokens is not None:
-            if not isinstance(additional_special_tokens, list):
-                raise TypeError(
-                    f"additional_special_tokens should be of type {type(list)}, but is"
-                    f" {type(additional_special_tokens)}"
-                )
-            additional_special_tokens_extended = (
-                ([mask_token_sent] + additional_special_tokens)
-                if mask_token_sent not in additional_special_tokens and mask_token_sent is not None
-                else additional_special_tokens
-            )
-            # fill additional tokens with ...,  in case not all additional tokens are already taken
-            additional_special_tokens_extended += [
-                f"" for i in range(len(additional_special_tokens_extended), self.offset - 1)
-            ]
+        self.vocab_file = vocab_file
 
-            if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended):
-                raise ValueError(
-                    "Please make sure that the provided additional_special_tokens do not contain an incorrectly"
-                    f" shifted list of  tokens. Found {additional_special_tokens_extended}."
-                )
-            additional_special_tokens = additional_special_tokens_extended
-        else:
-            additional_special_tokens_extended = []
+        if additional_special_tokens is None:
             additional_special_tokens = [mask_token_sent] if mask_token_sent is not None else []
             additional_special_tokens += [f"" for i in range(2, self.offset)]
 
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-        self.mask_token_sent = mask_token_sent
-        self.vocab_file = vocab_file
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(vocab_file)
+        if vocab is not None:
+            # For Pegasus, insert special tokens at the beginning
+            special_tokens_set = {pad_token, eos_token, mask_token_sent, mask_token, unk_token}
+            special_tokens_set.update(additional_special_tokens)
 
-        _added_tokens_decoder = {
-            0: AddedToken(str(pad_token), special=True),
-            1: AddedToken(str(eos_token), special=True),
-        }
+            # Build special tokens in correct order
+            _vocab_list = [
+                (str(pad_token), 0.0),
+                (str(eos_token), 0.0),
+            ]
+            if mask_token_sent:
+                _vocab_list.append((str(mask_token_sent), 0.0))
+            for token in additional_special_tokens:
+                if token not in [pad_token, eos_token, mask_token_sent]:
+                    _vocab_list.append((str(token), 0.0))
+            if mask_token not in [t for t, _ in _vocab_list]:
+                _vocab_list.append((str(mask_token), 0.0))
+            _vocab_list.append((str(unk_token), 0.0))
+
+            # Filter out special tokens from main vocab and combine
+            filtered_vocab = [(t, s) for t, s in vocab if t not in special_tokens_set]
+            _vocab_list = _vocab_list + filtered_vocab
+        else:
+            _vocab_list = [(str(unk_token), 0.0)]
+
+        self._vocab = {token: idx for idx, (token, _) in enumerate(_vocab_list)}
 
-        if self.mask_token_sent is not None:
-            _added_tokens_decoder[2] = AddedToken(mask_token_sent, special=True)
-            _added_tokens_decoder[3] = AddedToken(str(mask_token), special=True)
+        self._tokenizer = Tokenizer(Unigram(vocab=_vocab_list, unk_id=self._vocab.get(str(unk_token), 0)))
+
+        self._tokenizer.normalizer = normalizers.Sequence(
+            [normalizers.Replace(Regex(r"\n"), " "), normalizers.Replace(Regex(r" {2,}"), " ")]
+        )
 
-        for i in range(2, self.offset):
-            _added_tokens_decoder[len(_added_tokens_decoder)] = AddedToken(f"", special=True)
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"$A {eos_token}",
+            pair=f"$A $B {eos_token}",
+            special_tokens=[(str(eos_token), self._vocab.get(str(eos_token), 1))],
+        )
 
-        # Force update as we want to make sure vocab is enforced (same as fast)
-        self._added_tokens_decoder = kwargs.pop("added_tokens_decoder", {})
-        self._added_tokens_decoder.update(_added_tokens_decoder)
+        tokenizer_object = self._tokenizer
 
         super().__init__(
+            tokenizer_object=tokenizer_object,
+            pad_token=pad_token,
             eos_token=eos_token,
             unk_token=unk_token,
             mask_token=mask_token,
-            pad_token=pad_token,
             mask_token_sent=mask_token_sent,
             offset=offset,
             additional_special_tokens=additional_special_tokens,
-            sp_model_kwargs=self.sp_model_kwargs,
             **kwargs,
         )
 
-    @property
-    def vocab_size(self) -> int:
-        return len(self.sp_model) + self.offset
-
-    def get_vocab(self) -> dict[str, int]:
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__ = d
-
-        # for backward compatibility
-        if not hasattr(self, "sp_model_kwargs"):
-            self.sp_model_kwargs = {}
-
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(self.vocab_file)
-
-    def _tokenize(self, text: str) -> list[str]:
-        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
-        return self.sp_model.encode(text, out_type=str)
-
-    def _convert_token_to_id(self, token: str) -> int:
-        """Converts a token (str) to an id using the vocab."""
-        sp_id = self.sp_model.piece_to_id(token)
-        return sp_id + self.offset
-
-    def _convert_id_to_token(self, index: int) -> str:
-        """Converts an index (integer) to a token (str) using the vocab."""
-        if index < self.offset:
-            return self.sp_model.IdToPiece(index)
-        token = self.sp_model.IdToPiece(index - self.offset)
-        return token
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        current_sub_tokens = []
-        out_string = ""
-        for token in tokens:
-            # make sure that special tokens are not decoded using sentencepiece model
-            if token in self.all_special_tokens:
-                out_string += self.sp_model.decode(current_sub_tokens) + token
-                current_sub_tokens = []
-            else:
-                current_sub_tokens.append(token)
-        out_string += self.sp_model.decode(current_sub_tokens)
-        return out_string.strip()
-
-    def num_special_tokens_to_add(self, pair=False):
-        """Just EOS"""
-        return 1
-
-    def _special_token_mask(self, seq):
-        all_special_ids = set(self.all_special_ids)  # call it once instead of inside list comp
-        all_special_ids.remove(self.unk_token_id)  #  is only sometimes special
-
-        return [1 if x in all_special_ids else 0 for x in seq]
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list, token_ids_1: Optional[list] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """Get list where entries are [1] if a token is [eos] or [pad] else 0."""
-        if already_has_special_tokens:
-            return self._special_token_mask(token_ids_0)
-        elif token_ids_1 is None:
-            return self._special_token_mask(token_ids_0) + [1]
-        else:
-            return self._special_token_mask(token_ids_0 + token_ids_1) + [1]
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
-        and adding special tokens. A PEGASUS sequence has the following format, where `X` represents the sequence:
-
-        - single sequence: `X `
-        - pair of sequences: `A B ` (not intended use)
-
-        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
-        separator.
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return token_ids_0 + [self.eos_token_id]
-        # We don't expect to process pairs, but leave the pair logic for API consistency
-        return token_ids_0 + token_ids_1 + [self.eos_token_id]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (out_vocab_file,)
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True)
+        self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)
 
 
 __all__ = ["PegasusTokenizer"]
diff --git a/src/transformers/models/pegasus/tokenization_pegasus_fast.py b/src/transformers/models/pegasus/tokenization_pegasus_fast.py
deleted file mode 100644
index 92a37c44ff2e..000000000000
--- a/src/transformers/models/pegasus/tokenization_pegasus_fast.py
+++ /dev/null
@@ -1,215 +0,0 @@
-# coding=utf-8
-# Copyright 2020 Google and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization class for model PEGASUS."""
-
-import os
-from shutil import copyfile
-from typing import Optional
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import is_sentencepiece_available, logging
-
-
-if is_sentencepiece_available():
-    from .tokenization_pegasus import PegasusTokenizer
-else:
-    PegasusTokenizer = None
-
-
-logger = logging.get_logger(__name__)
-
-
-SPIECE_UNDERLINE = "▁"
-
-VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
-
-
-class PegasusTokenizerFast(PreTrainedTokenizerFast):
-    r"""
-    Construct a "fast" PEGASUS tokenizer (backed by HuggingFace's *tokenizers* library). Based on
-    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
-            contains the vocabulary necessary to instantiate a tokenizer.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        mask_token (`str`, *optional*, defaults to `""`):
-            The token used for masking single token values. This is the token used when training this model with masked
-            language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining.
-            It corresponds to *[MASK2]* in [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive
-            Summarization](https://huggingface.co/papers/1912.08777).
-        mask_token_sent (`str`, *optional*, defaults to `""`):
-            The token used for masking whole target sentences. This is the token used when training this model with gap
-            sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during
-            pretraining. It corresponds to *[MASK1]* in [PEGASUS: Pre-training with Extracted Gap-sentences for
-            Abstractive Summarization](https://huggingface.co/papers/1912.08777).
-        additional_special_tokens (`List[str]`, *optional*):
-            Additional special tokens used by the tokenizer. If no additional_special_tokens are provided  and
-             are used as additional special tokens corresponding to the [original PEGASUS
-            tokenizer](https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66)
-            that uses the tokens 2 - 104 only for pretraining
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = PegasusTokenizer
-    model_input_names = ["input_ids", "attention_mask"]
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        pad_token="",
-        eos_token="",
-        unk_token="",
-        mask_token="",
-        mask_token_sent="",
-        additional_special_tokens=None,
-        offset=103,  # entries 2 - 104 are only used for pretraining
-        **kwargs,
-    ):
-        self.offset = offset
-
-        if additional_special_tokens is not None:
-            if not isinstance(additional_special_tokens, list):
-                raise TypeError(
-                    f"additional_special_tokens should be of type {type(list)}, but is"
-                    f" {type(additional_special_tokens)}"
-                )
-
-            additional_special_tokens_extended = (
-                ([mask_token_sent] + additional_special_tokens)
-                if mask_token_sent not in additional_special_tokens and mask_token_sent is not None
-                else additional_special_tokens
-            )
-            # fill additional tokens with ...,  in case not all additional tokens are already taken
-            additional_special_tokens_extended += [
-                f"" for i in range(len(additional_special_tokens_extended), self.offset - 1)
-            ]
-
-            if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended):
-                raise ValueError(
-                    "Please make sure that the provided additional_special_tokens do not contain an incorrectly"
-                    f" shifted list of  tokens. Found {additional_special_tokens_extended}."
-                )
-            additional_special_tokens = additional_special_tokens_extended
-        else:
-            additional_special_tokens = [mask_token_sent] if mask_token_sent is not None else []
-            additional_special_tokens += [f"" for i in range(2, self.offset)]
-
-        # pegasus was design to support changing the index of the first tokens. If one of the padding/eos/unk/mask token
-        # is different from default, we must rebuild the vocab
-        from_slow = kwargs.pop("from_slow", None)
-        from_slow = from_slow or str(pad_token) != "" or str(eos_token) != "" or str(unk_token) != ""
-
-        kwargs.pop("added_tokens_decoder", {})
-
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            pad_token=pad_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            mask_token=mask_token,
-            mask_token_sent=mask_token_sent,
-            offset=offset,
-            additional_special_tokens=additional_special_tokens,
-            from_slow=from_slow,
-            **kwargs,
-        )
-        self.vocab_file = vocab_file
-
-    def _special_token_mask(self, seq):
-        all_special_ids = set(self.all_special_ids)  # call it once instead of inside list comp
-        all_special_ids.remove(self.unk_token_id)  #  is only sometimes special
-
-        if all_special_ids != set(range(len(self.additional_special_tokens) + 3)):
-            raise ValueError(
-                "There should be 3 special tokens: mask_token, pad_token, and eos_token +"
-                f" {len(self.additional_special_tokens)} additional_special_tokens, but got {all_special_ids}"
-            )
-
-        return [1 if x in all_special_ids else 0 for x in seq]
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list, token_ids_1: Optional[list] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """Get list where entries are [1] if a token is [eos] or [pad] else 0."""
-        if already_has_special_tokens:
-            return self._special_token_mask(token_ids_0)
-        elif token_ids_1 is None:
-            return self._special_token_mask(token_ids_0) + [1]
-        else:
-            return self._special_token_mask(token_ids_0 + token_ids_1) + [1]
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> list[int]:
-        """
-        Build model inputs from a sequence by adding eos to the end. no bos token is added to the front.
-
-        - single sequence: `X `
-        - pair of sequences: `A B ` (not intended use)
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return token_ids_0 + [self.eos_token_id]
-        # We don't expect to process pairs, but leave the pair logic for API consistency
-        return token_ids_0 + token_ids_1 + [self.eos_token_id]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not self.can_save_slow_tokenizer:
-            raise ValueError(
-                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
-                "tokenizer."
-            )
-
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
-
-
-__all__ = ["PegasusTokenizerFast"]
diff --git a/src/transformers/models/perceiver/tokenization_perceiver.py b/src/transformers/models/perceiver/tokenization_perceiver.py
index f17e7e99ac9d..c5d387ca3a7a 100644
--- a/src/transformers/models/perceiver/tokenization_perceiver.py
+++ b/src/transformers/models/perceiver/tokenization_perceiver.py
@@ -16,7 +16,7 @@
 
 from typing import Optional
 
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_python import AddedToken, PreTrainedTokenizer
 from ...utils import logging
 
 
diff --git a/src/transformers/models/phobert/tokenization_phobert.py b/src/transformers/models/phobert/tokenization_phobert.py
index 61ac8194b45c..ec31f8e96c58 100644
--- a/src/transformers/models/phobert/tokenization_phobert.py
+++ b/src/transformers/models/phobert/tokenization_phobert.py
@@ -20,7 +20,7 @@
 from shutil import copyfile
 from typing import Optional
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_python import PreTrainedTokenizer
 from ...utils import logging
 
 
diff --git a/src/transformers/models/pix2struct/processing_pix2struct.py b/src/transformers/models/pix2struct/processing_pix2struct.py
index b7446cb69684..1fe236339a7c 100644
--- a/src/transformers/models/pix2struct/processing_pix2struct.py
+++ b/src/transformers/models/pix2struct/processing_pix2struct.py
@@ -51,16 +51,20 @@ class Pix2StructProcessor(ProcessorMixin):
     Constructs a PIX2STRUCT processor which wraps a BERT tokenizer and PIX2STRUCT image processor into a single
     processor.
 
-    [`Pix2StructProcessor`] offers all the functionalities of [`Pix2StructImageProcessor`] and [`T5TokenizerFast`]. See
+    [`Pix2StructProcessor`] offers all the functionalities of [`Pix2StructImageProcessor`] and [`T5Tokenizer`]. See
     the docstring of [`~Pix2StructProcessor.__call__`] and [`~Pix2StructProcessor.decode`] for more information.
 
     Args:
         image_processor (`Pix2StructImageProcessor`):
             An instance of [`Pix2StructImageProcessor`]. The image processor is a required input.
-        tokenizer (Union[`T5TokenizerFast`, `T5Tokenizer`]):
-            An instance of ['T5TokenizerFast`] or ['T5Tokenizer`]. The tokenizer is a required input.
+        tokenizer (`T5Tokenizer`):
+            An instance of ['T5Tokenizer`]. The tokenizer is a required input.
     """
 
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "Pix2StructImageProcessor"
+    tokenizer_class = ("T5Tokenizer",)
+
     def __init__(self, image_processor, tokenizer):
         tokenizer.return_token_type_ids = False
         super().__init__(image_processor, tokenizer)
@@ -73,7 +77,7 @@ def __call__(
     ) -> Union[BatchEncoding, BatchFeature]:
         """
         This method uses [`Pix2StructImageProcessor.preprocess`] method to prepare image(s) for the model, and
-        [`T5TokenizerFast.__call__`] to prepare text for the model.
+        [`T5Tokenizer.__call__`] to prepare text for the model.
 
         Please refer to the docstring of the above two methods for more information.
         """
diff --git a/src/transformers/models/plbart/tokenization_plbart.py b/src/transformers/models/plbart/tokenization_plbart.py
index 1eed6dc18bdd..b2cfa67ea2b8 100644
--- a/src/transformers/models/plbart/tokenization_plbart.py
+++ b/src/transformers/models/plbart/tokenization_plbart.py
@@ -13,13 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-from shutil import copyfile
 from typing import Any, Optional
 
-import sentencepiece as spm
-
-from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
+from ...tokenization_python import BatchEncoding
+from ...tokenization_utils_base import AddedToken
+from ...tokenization_utils_sentencepiece import SentencePieceBackend
 from ...utils import logging
 from ...utils.import_utils import requires
 
@@ -48,7 +46,7 @@
 
 
 @requires(backends=("sentencepiece",))
-class PLBartTokenizer(PreTrainedTokenizer):
+class PLBartTokenizer(SentencePieceBackend):
     """
     Construct an PLBART tokenizer.
 
@@ -141,12 +139,7 @@ def __init__(
         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
         src_lang = self._convert_lang_code_special_format(src_lang)
         tgt_lang = self._convert_lang_code_special_format(tgt_lang)
-
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(str(vocab_file))
-        self.vocab_file = vocab_file
         self.language_codes = language_codes
-
         fairseq_language_codes = FAIRSEQ_LANGUAGE_CODES[self.language_codes]
 
         # Original fairseq vocab and spm vocab must be "aligned":
@@ -156,40 +149,21 @@ def __init__(
         # spm      | '' | ''   | '' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'
 
         # Mimic fairseq token-to-id alignment for the first 4 token
+        self.vocab_file = vocab_file
+        self.lang_code_to_id = {}
+        self.id_to_lang_code = {}
         self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3}
-
-        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
-        self.fairseq_offset = 1
-
-        self.sp_model_size = len(self.sp_model)
-        self.lang_code_to_id = {
-            code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(fairseq_language_codes)
-        }
-        self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
-
-        if self.language_codes == "base":
-            self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
-
-        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
         self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
-        _additional_special_tokens = list(self.lang_code_to_id.keys())
+        self.fairseq_offset = 1
+        _additional_special_tokens = list(fairseq_language_codes)
 
         if additional_special_tokens is not None:
-            # Only add those special tokens if they are not already there.
             _additional_special_tokens.extend(
                 [t for t in additional_special_tokens if t not in _additional_special_tokens]
             )
 
-        if self.language_codes == "base":
-            self._src_lang = src_lang
-            self.cur_lang_code_id = (
-                self.lang_code_to_id[self._src_lang] if self._src_lang is not None else self._src_lang
-            )
-        else:
-            self._src_lang = src_lang if src_lang is not None else "__en_XX__"
-            self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
-
         super().__init__(
+            vocab_file=vocab_file,
             bos_token=bos_token,
             eos_token=eos_token,
             unk_token=unk_token,
@@ -197,43 +171,89 @@ def __init__(
             cls_token=cls_token,
             pad_token=pad_token,
             mask_token=mask_token,
-            language_codes=language_codes,
             tokenizer_file=tokenizer_file,
             src_lang=src_lang,
             tgt_lang=tgt_lang,
             additional_special_tokens=_additional_special_tokens,
             sp_model_kwargs=self.sp_model_kwargs,
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            language_codes=language_codes,
+            special_tokens_pattern="prefix_suffix",
+            token_type_ids_pattern="all_zeros",
             **kwargs,
         )
 
-        self.tgt_lang = tgt_lang
-        self.set_src_lang_special_tokens(self._src_lang)
+        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
+        self.sp_model_size = len(self.sp_model)
+        self.lang_code_to_id = {
+            code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(fairseq_language_codes)
+        }
+        self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
+        self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3}
 
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
-        return state
+        if self.language_codes == "base":
+            self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
 
-    def __setstate__(self, d):
-        self.__dict__ = d
+        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
+        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+        reserved_tokens = {"", "", "", "", ""}
+        reserved_tokens.update(FAIRSEQ_LANGUAGE_CODES[self.language_codes])
+
+        removed = False
+        for token in reserved_tokens:
+            idx = self._added_tokens_encoder.pop(token, None)
+            if idx is not None:
+                self._added_tokens_decoder.pop(idx, None)
+                removed = True
+        if removed:
+            self._update_trie()
+            self._update_total_vocab_size()
+
+        synced = False
+        for token, idx in self._added_tokens_encoder.items():
+            if idx in self._added_tokens_decoder:
+                continue
+            self._added_tokens_decoder[idx] = AddedToken(
+                token, special=True, normalized=False, lstrip=False, rstrip=False
+            )
+            synced = True
+        if synced:
+            self._update_trie()
+            self._update_total_vocab_size()
 
-        # for backward compatibility
-        if not hasattr(self, "sp_model_kwargs"):
-            self.sp_model_kwargs = {}
+        if self.language_codes == "base":
+            self._src_lang = src_lang
+            self.cur_lang_code_id = (
+                self.lang_code_to_id[self._src_lang] if self._src_lang is not None else self._src_lang
+            )
+        else:
+            self._src_lang = src_lang if src_lang is not None else "__en_XX__"
+            self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
 
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
+        self.tgt_lang = tgt_lang
+        self.set_src_lang_special_tokens(self._src_lang)
 
     @property
     def vocab_size(self):
-        if self.language_codes == "base":
-            return (
-                len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1
-            )  # Plus 1 for the mask token
-        else:
-            return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
+        lang_code_count = len(getattr(self, "lang_code_to_id", {}))
+        fairseq_offset = getattr(self, "fairseq_offset", 1)
+        base_vocab = len(self.sp_model) if hasattr(self, "sp_model") else 0
+        if getattr(self, "language_codes", "base") == "base":
+            return base_vocab + lang_code_count + fairseq_offset + 1  # +1 for mask token
+        return base_vocab + lang_code_count + fairseq_offset
+
+    def get_vocab(self):
+        """Override to use fairseq vocabulary structure"""
+        vocab = self.fairseq_tokens_to_ids.copy()
+        for i in range(self.sp_model.get_piece_size()):
+            sp_token = self.sp_model.IdToPiece(i)
+            # Map SP token to fairseq ID: SP ID 0 maps to unk_token_id, others map to SP_ID + fairseq_offset
+            vocab_id = self.unk_token_id if i == 0 else (i + self.fairseq_offset)
+            if sp_token not in vocab:
+                vocab[sp_token] = vocab_id
+        # Add any additional tokens
+        vocab.update({token: idx for token, idx in self._added_tokens_encoder.items() if token not in vocab})
+        return vocab
 
     @property
     def src_lang(self) -> str:
@@ -245,87 +265,6 @@ def src_lang(self, new_src_lang: str) -> None:
         self._src_lang = new_src_lang
         self.set_src_lang_special_tokens(self._src_lang)
 
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        prefix_ones = [1] * len(self.prefix_tokens)
-        suffix_ones = [1] * len(self.suffix_tokens)
-        if token_ids_1 is None:
-            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
-        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An PLBART sequence has the following format, where `X` represents the sequence:
-
-        - `input_ids` (for encoder) `X [eos, src_lang_code]`
-        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
-
-        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
-        separator.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
-        # We don't expect to process pairs, but leave the pair logic for API consistency
-        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. PLBart does not
-        make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-        """
-
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
     def _build_translation_inputs(
         self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
     ):
@@ -339,14 +278,6 @@ def _build_translation_inputs(
         inputs["forced_bos_token_id"] = tgt_lang_id
         return inputs
 
-    def get_vocab(self):
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def _tokenize(self, text: str) -> list[str]:
-        return self.sp_model.encode(text, out_type=str)
-
     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""
         if token in self.fairseq_tokens_to_ids:
@@ -362,28 +293,6 @@ def _convert_id_to_token(self, index):
             return self.fairseq_ids_to_tokens[index]
         return self.sp_model.IdToPiece(index - self.fairseq_offset)
 
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (strings for sub-words) in a single string."""
-        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
-        return out_string
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (out_vocab_file,)
-
     def prepare_seq2seq_batch(
         self,
         src_texts: list[str],
@@ -428,5 +337,38 @@ def _convert_lang_code_special_format(self, lang: str) -> str:
         lang = FAIRSEQ_LANGUAGE_CODES_MAP.get(lang, lang)
         return lang
 
+    def clean_up_tokenization(self, out_string: str) -> str:
+        """
+        Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms.
+
+        Args:
+            out_string (`str`): The text to clean up.
+
+        Returns:
+            `str`: The cleaned-up string.
+        """
+        out_string = (
+            out_string.replace(" .", ".")
+            .replace(" ?", "?")
+            .replace(" !", "!")
+            .replace(" ,", ",")
+            .replace(" ' ", "'")
+            .replace(" n't", "n't")
+            .replace(" 'm", "'m")
+            .replace(" 's", "'s")
+            .replace(" 've", "'ve")
+            .replace(" 're", "'re")
+        )
+        return out_string
+
+    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=None, **kwargs):
+        """Override to use self.clean_up_tokenization_spaces as default for batched input."""
+        return super().decode(
+            token_ids=token_ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=self.clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
 
 __all__ = ["PLBartTokenizer"]
diff --git a/src/transformers/models/pop2piano/processing_pop2piano.py b/src/transformers/models/pop2piano/processing_pop2piano.py
index a68168e36739..bd3a2b8757e4 100644
--- a/src/transformers/models/pop2piano/processing_pop2piano.py
+++ b/src/transformers/models/pop2piano/processing_pop2piano.py
@@ -21,7 +21,7 @@
 
 from ...feature_extraction_utils import BatchFeature
 from ...processing_utils import ProcessorMixin
-from ...tokenization_utils import BatchEncoding, PaddingStrategy, TruncationStrategy
+from ...tokenization_python import BatchEncoding, PaddingStrategy, TruncationStrategy
 from ...utils import TensorType
 from ...utils.import_utils import requires
 
diff --git a/src/transformers/models/pop2piano/tokenization_pop2piano.py b/src/transformers/models/pop2piano/tokenization_pop2piano.py
index fc6dc266bc12..9d317fccfd1c 100644
--- a/src/transformers/models/pop2piano/tokenization_pop2piano.py
+++ b/src/transformers/models/pop2piano/tokenization_pop2piano.py
@@ -21,7 +21,7 @@
 import numpy as np
 
 from ...feature_extraction_utils import BatchFeature
-from ...tokenization_utils import AddedToken, BatchEncoding, PaddingStrategy, PreTrainedTokenizer, TruncationStrategy
+from ...tokenization_python import AddedToken, BatchEncoding, PaddingStrategy, PreTrainedTokenizer, TruncationStrategy
 from ...utils import TensorType, is_pretty_midi_available, logging, requires_backends, to_numpy
 from ...utils.import_utils import requires
 
diff --git a/src/transformers/models/prophetnet/tokenization_prophetnet.py b/src/transformers/models/prophetnet/tokenization_prophetnet.py
index 24401835c7fc..506006323583 100644
--- a/src/transformers/models/prophetnet/tokenization_prophetnet.py
+++ b/src/transformers/models/prophetnet/tokenization_prophetnet.py
@@ -19,7 +19,7 @@
 from collections.abc import Iterable
 from typing import Optional
 
-from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from ...tokenization_python import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
 from ...utils import logging
 
 
@@ -28,7 +28,6 @@
 VOCAB_FILES_NAMES = {"vocab_file": "prophetnet.tokenizer"}
 
 
-# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
 def whitespace_tokenize(text):
     """Runs basic whitespace cleaning and splitting on a piece of text."""
     text = text.strip()
@@ -38,7 +37,6 @@ def whitespace_tokenize(text):
     return tokens
 
 
-# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
 class BasicTokenizer:
     """
     Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
@@ -200,7 +198,6 @@ def _clean_text(self, text):
         return "".join(output)
 
 
-# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
 class WordpieceTokenizer:
     """Runs WordPiece tokenization."""
 
diff --git a/src/transformers/models/qwen2/tokenization_qwen2.py b/src/transformers/models/qwen2/tokenization_qwen2.py
index be121adb5442..48312e3cadf8 100644
--- a/src/transformers/models/qwen2/tokenization_qwen2.py
+++ b/src/transformers/models/qwen2/tokenization_qwen2.py
@@ -14,15 +14,11 @@
 # limitations under the License.
 """Tokenization classes for Qwen2."""
 
-import json
-import os
-import unicodedata
-from functools import lru_cache
-from typing import Optional
+from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers
+from tokenizers.models import BPE
 
-import regex as re
-
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_utils_base import generate_merges
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
 
 
@@ -31,312 +27,86 @@
 VOCAB_FILES_NAMES = {
     "vocab_file": "vocab.json",
     "merges_file": "merges.txt",
+    "tokenizer_file": "tokenizer.json",
 }
 
-
 MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
 
 PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
 
 
-@lru_cache
-# Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
-    characters the bpe code barfs on.
-
-    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
-    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
-    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
-    tables between utf-8 bytes and unicode strings.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-# Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs
-def get_pairs(word):
-    """
-    Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-
-class Qwen2Tokenizer(PreTrainedTokenizer):
-    """
-    Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding.
-
-    Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
-    be encoded differently whether it is at the beginning of the sentence (without space) or not:
-
-    ```python
-    >>> from transformers import Qwen2Tokenizer
-
-    >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")
-    >>> tokenizer("Hello world")["input_ids"]
-    [9707, 1879]
-
-    >>> tokenizer(" Hello world")["input_ids"]
-    [21927, 1879]
-    ```
-    This is expected.
-
-    You should not use GPT2Tokenizer instead, because of the different pretokenization rules.
-
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        merges_file (`str`):
-            Path to the merges file.
-        errors (`str`, *optional*, defaults to `"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See
-            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
-        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        bos_token (`str`, *optional*):
-            The beginning of sequence token. Not applicable for this tokenizer.
-        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The end of sequence token.
-        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
-            Whether or not the model should cleanup the spaces that were added when splitting the input text during the
-            tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
-        split_special_tokens (`bool`, *optional*, defaults to `False`):
-            Whether or not the special tokens should be split during the tokenization process. The default behavior is
-            to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then `tokenizer.tokenize("<|endoftext|>") =
-            ['<|endoftext|>`]. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will be give `['<',
-            '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment.
-    """
-
+class Qwen2Tokenizer(TokenizersBackend):
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
 
     def __init__(
         self,
-        vocab_file,
-        merges_file,
-        errors="replace",
+        vocab_file=None,
+        merges_file=None,
         unk_token="<|endoftext|>",
         bos_token=None,
         eos_token="<|endoftext|>",
         pad_token="<|endoftext|>",
-        clean_up_tokenization_spaces=False,
-        split_special_tokens=False,
+        add_prefix_space=None,
+        vocab=None,
+        merges=None,
         **kwargs,
     ):
-        # Qwen vocab does not contain control tokens; added tokens need to be special
-        bos_token = (
-            AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
-            if isinstance(bos_token, str)
-            else bos_token
-        )
-        eos_token = (
-            AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
-            if isinstance(eos_token, str)
-            else eos_token
-        )
-        unk_token = (
-            AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
-            if isinstance(unk_token, str)
-            else unk_token
-        )
-        pad_token = (
-            AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
-            if isinstance(pad_token, str)
-            else pad_token
-        )
-
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        bpe_merges = []
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            for i, line in enumerate(merges_handle):
-                line = line.strip()
-                if (i == 0 and line.startswith("#version:")) or not line:
-                    continue
-                bpe_merges.append(tuple(line.split()))
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        # NOTE: the cache can grow without bound and will get really large for long running processes
-        # (esp. for texts of language that do not use space between word, e.g. Chinese); technically
-        # not a memory leak but appears as one.
-        # GPT2Tokenizer has the same problem, so let's be consistent.
-        self.cache = {}
+        self.add_prefix_space = add_prefix_space if add_prefix_space is not None else False
 
-        self.pat = re.compile(PRETOKENIZE_REGEX)
-
-        if kwargs.get("add_prefix_space", False):
-            logger.warning_once(
-                f"{self.__class__.__name} does not support `add_prefix_space`, setting it to True has no effect."
+        if vocab is not None:
+            self._vocab = (
+                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
+            )
+        else:
+            self._vocab = {
+                "<|endoftext|>": 0,
+            }
+        self._merges = merges if merges is not None else generate_merges(self._vocab)
+
+        self._tokenizer = Tokenizer(
+            BPE(
+                vocab=self._vocab,
+                merges=self._merges,
+                dropout=None,
+                unk_token=None,
+                continuing_subword_prefix="",
+                end_of_word_suffix="",
+                fuse_unk=False,
+                byte_fallback=False,
             )
+        )
+        self._tokenizer.decoder = decoders.ByteLevel()
+        self._tokenizer.normalizer = normalizers.NFC()
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.Split(
+                    Regex(PRETOKENIZE_REGEX),
+                    behavior="isolated",
+                    invert=False,
+                ),
+                pre_tokenizers.ByteLevel(
+                    add_prefix_space=self.add_prefix_space,
+                    use_regex=False,
+                ),
+            ]
+        )
+        tokenizer_object = self._tokenizer
 
         super().__init__(
-            errors=errors,
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            tokenizer_object=tokenizer_object,
+            unk_token=unk_token,
             bos_token=bos_token,
             eos_token=eos_token,
             pad_token=pad_token,
-            unk_token=unk_token,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            split_special_tokens=split_special_tokens,
-            **kwargs,
-        )
-
-    @property
-    def vocab_size(self) -> int:
-        return len(self.encoder)
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.get_vocab
-    def get_vocab(self):
-        return dict(self.encoder, **self.added_tokens_encoder)
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                except ValueError:
-                    new_word.extend(word[i:])
-                    break
-                else:
-                    new_word.extend(word[i:j])
-                    i = j
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        self.cache[token] = word
-        return word
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize
-    def _tokenize(self, text):
-        """Tokenize a string."""
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            token = "".join(
-                self.byte_encoder[b] for b in token.encode("utf-8")
-            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
-            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
-        return bpe_tokens
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_id_to_token
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index)
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        text = "".join(tokens)
-        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
-        return text
-
-    def decode(
-        self,
-        token_ids,
-        skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: Optional[bool] = False,
-        spaces_between_special_tokens: bool = False,
-        **kwargs,
-    ) -> str:
-        # `spaces_between_special_tokens` defaults to True for _decode in slow tokenizers
-        # and cannot be configured elsewhere, but it should default to False for Qwen2Tokenizer
-        return super().decode(
-            token_ids,
-            skip_special_tokens=skip_special_tokens,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            spaces_between_special_tokens=spaces_between_special_tokens,
+            add_prefix_space=add_prefix_space,
             **kwargs,
         )
 
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-        merge_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
-        )
-
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
-
-        return vocab_file, merge_file
-
-    def prepare_for_tokenization(self, text, **kwargs):
-        text = unicodedata.normalize("NFC", text)
-        return (text, kwargs)
+        self.add_tokens([AddedToken(token, special=True) for token in self.all_special_tokens])
 
 
 __all__ = ["Qwen2Tokenizer"]
diff --git a/src/transformers/models/qwen2/tokenization_qwen2_fast.py b/src/transformers/models/qwen2/tokenization_qwen2_fast.py
deleted file mode 100644
index dda8123c7d6b..000000000000
--- a/src/transformers/models/qwen2/tokenization_qwen2_fast.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# coding=utf-8
-# Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for Qwen2."""
-
-from typing import Optional
-
-from ...tokenization_utils import AddedToken
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .tokenization_qwen2 import Qwen2Tokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {
-    "vocab_file": "vocab.json",
-    "merges_file": "merges.txt",
-    "tokenizer_file": "tokenizer.json",
-}
-
-
-MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
-
-
-class Qwen2TokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
-    Byte-Pair-Encoding.
-
-    Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
-    be encoded differently whether it is at the beginning of the sentence (without space) or not:
-
-    ```python
-    >>> from transformers import Qwen2TokenizerFast
-
-    >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
-    >>> tokenizer("Hello world")["input_ids"]
-    [9707, 1879]
-
-    >>> tokenizer(" Hello world")["input_ids"]
-    [21927, 1879]
-    ```
-    This is expected.
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`, *optional*):
-            Path to the vocabulary file.
-        merges_file (`str`, *optional*):
-            Path to the merges file.
-        tokenizer_file (`str`, *optional*):
-            Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
-            contains everything needed to load the tokenizer.
-        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead. Not applicable to this tokenizer.
-        bos_token (`str`, *optional*):
-            The beginning of sequence token. Not applicable for this tokenizer.
-        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The end of sequence token.
-        pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The token used for padding, for example when batching sequences of different lengths.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = Qwen2Tokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        merges_file=None,
-        tokenizer_file=None,
-        unk_token="<|endoftext|>",
-        bos_token=None,
-        eos_token="<|endoftext|>",
-        pad_token="<|endoftext|>",
-        **kwargs,
-    ):
-        # We need to at least pass vocab_file and merges_file to base class
-        # in case a slow tokenizer needs to be initialized; other can be
-        # configured through files.
-        # following GPT2TokenizerFast, also adding unk_token, bos_token, and eos_token
-
-        bos_token = (
-            AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
-            if isinstance(bos_token, str)
-            else bos_token
-        )
-        eos_token = (
-            AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
-            if isinstance(eos_token, str)
-            else eos_token
-        )
-        unk_token = (
-            AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
-            if isinstance(unk_token, str)
-            else unk_token
-        )
-        pad_token = (
-            AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
-            if isinstance(pad_token, str)
-            else pad_token
-        )
-
-        super().__init__(
-            vocab_file=vocab_file,
-            merges_file=merges_file,
-            tokenizer_file=tokenizer_file,
-            unk_token=unk_token,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            pad_token=pad_token,
-            **kwargs,
-        )
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-
-__all__ = ["Qwen2TokenizerFast"]
diff --git a/src/transformers/models/rag/retrieval_rag.py b/src/transformers/models/rag/retrieval_rag.py
index 49c782e5b3fa..ab66f34f9dfd 100644
--- a/src/transformers/models/rag/retrieval_rag.py
+++ b/src/transformers/models/rag/retrieval_rag.py
@@ -22,7 +22,7 @@
 
 import numpy as np
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_python import PreTrainedTokenizer
 from ...tokenization_utils_base import BatchEncoding
 from ...utils import cached_file, is_datasets_available, is_faiss_available, logging, requires_backends, strtobool
 from .configuration_rag import RagConfig
@@ -528,7 +528,7 @@ def cat_input_and_doc(doc_title, doc_text, input_string, prefix):
             for j in range(n_docs)
         ]
 
-        contextualized_inputs = self.generator_tokenizer.batch_encode_plus(
+        contextualized_inputs = self.generator_tokenizer(
             rag_input_strings,
             max_length=self.config.max_combined_length,
             return_tensors=return_tensors,
@@ -629,7 +629,7 @@ def __call__(
         prefix = prefix if prefix is not None else self.config.generator.prefix
         retrieved_doc_embeds, doc_ids, docs = self.retrieve(question_hidden_states, n_docs)
 
-        input_strings = self.question_encoder_tokenizer.batch_decode(question_input_ids, skip_special_tokens=True)
+        input_strings = self.question_encoder_tokenizer.decode(question_input_ids, skip_special_tokens=True)
         context_input_ids, context_attention_mask = self.postprocess_docs(
             docs, input_strings, prefix, n_docs, return_tensors=return_tensors
         )
diff --git a/src/transformers/models/reformer/__init__.py b/src/transformers/models/reformer/__init__.py
index cd6721b6ac23..0db4f927f140 100644
--- a/src/transformers/models/reformer/__init__.py
+++ b/src/transformers/models/reformer/__init__.py
@@ -21,7 +21,6 @@
     from .configuration_reformer import *
     from .modeling_reformer import *
     from .tokenization_reformer import *
-    from .tokenization_reformer_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/reformer/tokenization_reformer.py b/src/transformers/models/reformer/tokenization_reformer.py
index 458b72df4ff6..021ab2a88074 100644
--- a/src/transformers/models/reformer/tokenization_reformer.py
+++ b/src/transformers/models/reformer/tokenization_reformer.py
@@ -14,15 +14,13 @@
 # limitations under the License.
 """Tokenization class for model Reformer."""
 
-import os
-from shutil import copyfile
-from typing import Any, Optional
+from typing import Optional
 
-import sentencepiece as spm
+from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers
+from tokenizers.models import BPE
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
-from ...utils.import_utils import requires
 
 
 logger = logging.get_logger(__name__)
@@ -33,13 +31,13 @@
 VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
 
 
-@requires(backends=("sentencepiece",))
-class ReformerTokenizer(PreTrainedTokenizer):
+class ReformerTokenizer(TokenizersBackend):
     """
-    Construct a Reformer tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece) .
+    Construct a Reformer tokenizer (backed by HuggingFace's tokenizers library). Based on
+    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=bpe#models).
 
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     Args:
         vocab_file (`str`):
@@ -58,119 +56,79 @@ class ReformerTokenizer(PreTrainedTokenizer):
         unk_token (`str`, *optional*, defaults to `""`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        additional_special_tokens (`list[str]`, *optional*, defaults to `[]`):
+        pad_token (`str`, *optional*, defaults to `""`):
+            The token used for padding, for example when batching sequences of different lengths.
+        additional_special_tokens (`list[str]`, *optional*):
             Additional special tokens used by the tokenizer.
-        sp_model_kwargs (`dict`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
-            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
-            to set:
-
-            - `enable_sampling`: Enable subword regularization.
-            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
-
-              - `nbest_size = {0,1}`: No sampling is performed.
-              - `nbest_size > 1`: samples from the nbest_size results.
-              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
-                using forward-filtering-and-backward-sampling algorithm.
-
-            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
-              BPE-dropout.
+        vocab (`dict`, *optional*):
+            Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
+        merges (`list`, *optional*):
+            Custom merges list. If not provided, merges are loaded from vocab_file.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
 
     def __init__(
         self,
-        vocab_file,
-        eos_token="",
-        unk_token="",
-        additional_special_tokens=[],
-        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        vocab_file: Optional[str] = None,
+        eos_token: str = "",
+        unk_token: str = "",
+        additional_special_tokens: Optional[list] = None,
+        vocab: Optional[dict] = None,
+        merges: Optional[list] = None,
         **kwargs,
-    ) -> None:
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-
+    ):
         self.vocab_file = vocab_file
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(vocab_file)
+
+        if vocab is not None:
+            self._vocab = vocab
+        else:
+            self._vocab = {}
+
+        if merges is not None:
+            # Convert lists to tuples if necessary (happens when loading from JSON)
+            self._merges = [tuple(merge) if isinstance(merge, list) else merge for merge in merges]
+        else:
+            self._merges = []
+
+        self._tokenizer = Tokenizer(
+            BPE(
+                vocab=self._vocab,
+                merges=self._merges,
+                unk_token=str(unk_token),
+                fuse_unk=True,
+                byte_fallback=False,
+                dropout=None,
+            )
+        )
+
+        self._tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.Replace("\n", " "),
+                normalizers.Replace("\r", " "),
+                normalizers.Replace("\t", " "),
+                normalizers.Replace(Regex(r" {2,}"), " "),
+                normalizers.NFC(),
+                normalizers.Strip(left=False, right=True),
+            ]
+        )
+
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always")
+        self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always")
+
+        tokenizer_object = self._tokenizer
 
         super().__init__(
+            tokenizer_object=tokenizer_object,
             eos_token=eos_token,
             unk_token=unk_token,
-            additional_special_tokens=additional_special_tokens,
-            sp_model_kwargs=self.sp_model_kwargs,
+            additional_special_tokens=additional_special_tokens or [],
             **kwargs,
         )
 
-    @property
-    def vocab_size(self):
-        return self.sp_model.get_piece_size()
-
-    def get_vocab(self) -> dict[str, int]:
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__ = d
-
-        # for backward compatibility
-        if not hasattr(self, "sp_model_kwargs"):
-            self.sp_model_kwargs = {}
-
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(self.vocab_file)
-
-    def _tokenize(self, text: str) -> list[str]:
-        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
-        return self.sp_model.encode(text, out_type=str)
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.sp_model.piece_to_id(token)
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        if index < self.sp_model.get_piece_size():
-            token = self.sp_model.IdToPiece(index)
-        return token
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        current_sub_tokens = []
-        out_string = ""
-        for token in tokens:
-            # make sure that special tokens are not decoded using sentencepiece model
-            if token in self.all_special_tokens:
-                out_string += self.sp_model.decode(current_sub_tokens) + token
-                current_sub_tokens = []
-            else:
-                current_sub_tokens.append(token)
-        out_string += self.sp_model.decode(current_sub_tokens)
-        return out_string.strip()
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (out_vocab_file,)
+        super()._post_init()
 
 
 __all__ = ["ReformerTokenizer"]
diff --git a/src/transformers/models/reformer/tokenization_reformer_fast.py b/src/transformers/models/reformer/tokenization_reformer_fast.py
deleted file mode 100644
index d68528de5872..000000000000
--- a/src/transformers/models/reformer/tokenization_reformer_fast.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The Trax Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization class for model Reformer."""
-
-import os
-from shutil import copyfile
-from typing import Optional
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import is_sentencepiece_available, logging
-
-
-if is_sentencepiece_available():
-    from .tokenization_reformer import ReformerTokenizer
-else:
-    ReformerTokenizer = None
-
-
-logger = logging.get_logger(__name__)
-
-
-SPIECE_UNDERLINE = "▁"
-
-VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
-
-
-class ReformerTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" Reformer tokenizer (backed by HuggingFace's *tokenizers* library). Based on
-    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
-            contains the vocabulary necessary to instantiate a tokenizer.
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        additional_special_tokens (`list[str]`, *optional*):
-            Additional special tokens used by the tokenizer.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = ReformerTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        eos_token="",
-        unk_token="",
-        additional_special_tokens=[],
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            additional_special_tokens=additional_special_tokens,
-            **kwargs,
-        )
-
-        self.vocab_file = vocab_file
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not self.can_save_slow_tokenizer:
-            raise ValueError(
-                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
-                "tokenizer."
-            )
-
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
-
-
-__all__ = ["ReformerTokenizerFast"]
diff --git a/src/transformers/models/rembert/__init__.py b/src/transformers/models/rembert/__init__.py
index 23b308c7f13d..371d23ef13a7 100644
--- a/src/transformers/models/rembert/__init__.py
+++ b/src/transformers/models/rembert/__init__.py
@@ -21,7 +21,6 @@
     from .configuration_rembert import *
     from .modeling_rembert import *
     from .tokenization_rembert import *
-    from .tokenization_rembert_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/rembert/tokenization_rembert.py b/src/transformers/models/rembert/tokenization_rembert.py
index cf27a7b3bae6..205356678a76 100644
--- a/src/transformers/models/rembert/tokenization_rembert.py
+++ b/src/transformers/models/rembert/tokenization_rembert.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
+# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,36 +12,36 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization classes for RemBERT."""
+"""Tokenization classes for RemBert model."""
 
-import os
-from shutil import copyfile
 from typing import Optional
 
-import sentencepiece as spm
+from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
+from tokenizers.models import Unigram
 
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
-from ...utils.import_utils import requires
 
 
 logger = logging.get_logger(__name__)
 
-VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.model"}
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.model", "tokenizer_file": "tokenizer.json"}
 
 
-@requires(backends=("sentencepiece",))
-class RemBertTokenizer(PreTrainedTokenizer):
+class RemBertTokenizer(TokenizersBackend):
     """
-    Construct a RemBERT tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
-
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
+    Construct a "fast" RemBert tokenizer (backed by HuggingFace's *tokenizers* library). Based on
+    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This
+    tokenizer inherits from [`AlbertTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods
 
     Args:
-        vocab_file (`str`):
-            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
-            contains the vocabulary necessary to instantiate a tokenizer.
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        remove_space (`bool`, *optional*, defaults to `True`):
+            Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
+        keep_accents (`bool`, *optional*, defaults to `False`):
+            Whether or not to keep accents when tokenizing.
         bos_token (`str`, *optional*, defaults to `"[CLS]"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
@@ -53,15 +53,8 @@ class RemBertTokenizer(PreTrainedTokenizer):
             
 
         eos_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
+            The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token
+            that is used for the end of sequence. The token used is the `sep_token`.
         unk_token (`str`, *optional*, defaults to `""`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
@@ -77,161 +70,119 @@ class RemBertTokenizer(PreTrainedTokenizer):
         mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-
-    Attributes:
-        sp_model (`SentencePieceProcessor`):
-            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
 
     def __init__(
         self,
-        vocab_file,
-        do_lower_case=False,
-        remove_space=True,
-        keep_accents=True,
-        bos_token="[CLS]",
-        eos_token="[SEP]",
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
+        vocab_file: Optional[str] = None,
+        do_lower_case: bool = False,
+        keep_accents: bool = False,
+        bos_token: str = "[CLS]",
+        eos_token: str = "[SEP]",
+        unk_token: str = "",
+        sep_token: str = "[SEP]",
+        pad_token: str = "",
+        cls_token: str = "[CLS]",
+        mask_token: str = "[MASK]",
+        add_prefix_space: bool = True,
+        remove_space: bool = True,
+        vocab: Optional[dict] = None,
+        merges: Optional[list] = None,
         **kwargs,
     ):
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
-        self.do_lower_case = do_lower_case
+        self.vocab_file = vocab_file
         self.remove_space = remove_space
+        self.do_lower_case = do_lower_case
         self.keep_accents = keep_accents
-        self.vocab_file = vocab_file
 
-        self.sp_model = spm.SentencePieceProcessor()
-        self.sp_model.Load(vocab_file)
+        if vocab is not None:
+            self._vocab_scores = vocab
+        else:
+            self._vocab_scores = [
+                (str(pad_token), 0.0),
+                (str(unk_token), 0.0),
+                (str(cls_token), 0.0),
+                (str(sep_token), 0.0),
+                (str(mask_token), 0.0),
+            ]
+
+        self._tokenizer = Tokenizer(
+            Unigram(
+                self._vocab_scores,
+                unk_id=2,
+                byte_fallback=False,
+            )
+        )
+
+        # Build normalizer matching RemBertConverter behavior
+        # When loading from pretrained, this will be overridden by tokenizer.json config
+        # When creating from extractor (vocab), this provides equivalent behavior
+        list_normalizers = [
+            normalizers.Replace("``", '"'),
+            normalizers.Replace("''", '"'),
+            normalizers.Replace(Regex(" {2,}"), " "),
+        ]
+        if not self.keep_accents:
+            list_normalizers.append(normalizers.NFKD())
+            list_normalizers.append(normalizers.StripAccents())
+        if self.do_lower_case:
+            list_normalizers.append(normalizers.Lowercase())
+
+        # Add Precompiled equivalent (newline conversion + NFKC normalization)
+        list_normalizers.extend(
+            [
+                normalizers.Replace(Regex(r"[\n\r\t]"), " "),  # Precompiled converts newlines/tabs to spaces
+                normalizers.NFKC(),  # Precompiled does NFKC normalization
+            ]
+        )
+
+        self._tokenizer.normalizer = normalizers.Sequence(list_normalizers)
+
+        prepend_scheme = "always" if add_prefix_space else "never"
+        # Remove WhitespaceSplit - should only have Metaspace (matches SpmConverter)
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)
+
+        self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)
+
+        tokenizer_object = self._tokenizer
+
         super().__init__(
+            tokenizer_object=tokenizer_object,
+            add_prefix_space=add_prefix_space,
             do_lower_case=do_lower_case,
-            remove_space=remove_space,
             keep_accents=keep_accents,
             bos_token=bos_token,
             eos_token=eos_token,
-            unk_token=unk_token,
             sep_token=sep_token,
-            pad_token=pad_token,
             cls_token=cls_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
             mask_token=mask_token,
+            remove_space=remove_space,
             **kwargs,
         )
 
-    @property
-    def vocab_size(self):
-        return len(self.sp_model)
-
-    def get_vocab(self):
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__ = d
-        self.sp_model = spm.SentencePieceProcessor()
-        self.sp_model.Load(self.vocab_file)
-
-    def _tokenize(self, text, sample=False):
-        """Tokenize a string."""
-        pieces = self.sp_model.EncodeAsPieces(text)
-        return pieces
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.sp_model.PieceToId(token)
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.sp_model.IdToPiece(index)
-
-    def convert_tokens_to_string(self, tokens):
-        out_string = self.sp_model.decode_pieces(tokens)
-        return out_string
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A REMBERT sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return cls + token_ids_0 + sep
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return [1 if x in [self.sep_token_id, self.cls_token_id] else 0 for x in token_ids_0]
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        # Set post_processor after super().__init__() so we have token IDs available
+        # This matches RemBertConverter.post_processor()
+        cls_token_str = str(cls_token)
+        sep_token_str = str(sep_token)
+        cls_token_id = self.convert_tokens_to_ids(cls_token_str)
+        sep_token_id = self.convert_tokens_to_ids(sep_token_str)
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"{cls_token_str}:0 $A:0 {sep_token_str}:0",
+            pair=f"{cls_token_str}:0 $A:0 {sep_token_str}:0 $B:1 {sep_token_str}:1",
+            special_tokens=[
+                (cls_token_str, cls_token_id),
+                (sep_token_str, sep_token_id),
+            ],
         )
 
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (out_vocab_file,)
+        super()._post_init()
 
 
 __all__ = ["RemBertTokenizer"]
diff --git a/src/transformers/models/rembert/tokenization_rembert_fast.py b/src/transformers/models/rembert/tokenization_rembert_fast.py
deleted file mode 100644
index fb358746e6d2..000000000000
--- a/src/transformers/models/rembert/tokenization_rembert_fast.py
+++ /dev/null
@@ -1,198 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for RemBERT model."""
-
-import os
-from shutil import copyfile
-from typing import Optional
-
-from ...tokenization_utils import AddedToken
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import is_sentencepiece_available, logging
-
-
-if is_sentencepiece_available():
-    from .tokenization_rembert import RemBertTokenizer
-else:
-    RemBertTokenizer = None
-
-logger = logging.get_logger(__name__)
-VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.model", "tokenizer_file": "tokenizer.json"}
-
-
-SPIECE_UNDERLINE = "▁"
-
-
-class RemBertTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" RemBert tokenizer (backed by HuggingFace's *tokenizers* library). Based on
-    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This
-    tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods
-
-    Args:
-        vocab_file (`str`):
-            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
-            contains the vocabulary necessary to instantiate a tokenizer.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        remove_space (`bool`, *optional*, defaults to `True`):
-            Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
-        keep_accents (`bool`, *optional*, defaults to `False`):
-            Whether or not to keep accents when tokenizing.
-        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
-        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token
-            that is used for the end of sequence. The token used is the `sep_token`.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = RemBertTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        do_lower_case=True,
-        remove_space=True,
-        keep_accents=False,
-        bos_token="[CLS]",
-        eos_token="[SEP]",
-        unk_token="",
-        sep_token="[SEP]",
-        pad_token="",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        **kwargs,
-    ):
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            do_lower_case=do_lower_case,
-            remove_space=remove_space,
-            keep_accents=keep_accents,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            **kwargs,
-        )
-
-        self.do_lower_case = do_lower_case
-        self.remove_space = remove_space
-        self.keep_accents = keep_accents
-        self.vocab_file = vocab_file
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A RemBERT sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added
-            token_ids_1 (`List[int]`, *optional*, defaults to `None`):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return cls + token_ids_0 + sep
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of ids.
-            token_ids_1 (`List[int]`, *optional*, defaults to `None`):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Set to True if the token list is already formatted with special tokens for the model
-
-        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError(
-                    "You should not supply a second sequence if the provided sequence of "
-                    "ids is already formatted with special tokens for the model."
-                )
-            return [1 if x in [self.sep_token_id, self.cls_token_id] else 0 for x in token_ids_0]
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
-
-
-__all__ = ["RemBertTokenizerFast"]
diff --git a/src/transformers/models/roberta/__init__.py b/src/transformers/models/roberta/__init__.py
index a82d4c9bc617..2e8011675176 100644
--- a/src/transformers/models/roberta/__init__.py
+++ b/src/transformers/models/roberta/__init__.py
@@ -21,7 +21,6 @@
     from .configuration_roberta import *
     from .modeling_roberta import *
     from .tokenization_roberta import *
-    from .tokenization_roberta_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/roberta/tokenization_roberta.py b/src/transformers/models/roberta/tokenization_roberta.py
index 67cdcbbf488a..66eefc3a0caa 100644
--- a/src/transformers/models/roberta/tokenization_roberta.py
+++ b/src/transformers/models/roberta/tokenization_roberta.py
@@ -14,67 +14,23 @@
 # limitations under the License.
 """Tokenization classes for RoBERTa."""
 
-import json
-import os
-from functools import lru_cache
 from typing import Optional
 
-import regex as re
+from tokenizers import Tokenizer, decoders, pre_tokenizers, processors
+from tokenizers.models import BPE
 
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
 
 
 logger = logging.get_logger(__name__)
 
-VOCAB_FILES_NAMES = {
-    "vocab_file": "vocab.json",
-    "merges_file": "merges.txt",
-}
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
 
 
-@lru_cache
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
-    characters the bpe code barfs on.
-
-    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
-    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
-    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
-    tables between utf-8 bytes and unicode strings.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-def get_pairs(word):
-    """
-    Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-
-class RobertaTokenizer(PreTrainedTokenizer):
-    """
-    Constructs a RoBERTa tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding.
+class RobertaTokenizer(TokenizersBackend):
+    r"""
+    Construct a RoBERTa tokenizer (backed by HuggingFace's tokenizers library). Based on Byte-Pair-Encoding.
 
     This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
     be encoded differently whether it is at the beginning of the sentence (without space) or not:
@@ -95,18 +51,14 @@ class RobertaTokenizer(PreTrainedTokenizer):
 
     
 
-    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
+    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
 
     
 
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should refer to
     this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        merges_file (`str`):
-            Path to the merges file.
         errors (`str`, *optional*, defaults to `"replace"`):
             Paradigm to follow when decoding bytes to UTF-8. See
             [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
@@ -148,255 +100,91 @@ class RobertaTokenizer(PreTrainedTokenizer):
         add_prefix_space (`bool`, *optional*, defaults to `False`):
             Whether or not to add an initial space to the input. This allows to treat the leading word just as any
             other word. (RoBERTa tokenizer detect beginning of words by the preceding space).
+        trim_offsets (`bool`, *optional*, defaults to `True`):
+            Whether the post processing step should trim offsets to avoid including whitespaces.
+        vocab (`dict`, *optional*):
+            Custom vocabulary dictionary. If not provided, vocabulary is loaded from vocab_file.
+        merges (`list`, *optional*):
+            Custom merges list. If not provided, merges are loaded from merges_file.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
 
     def __init__(
         self,
-        vocab_file,
-        merges_file,
-        errors="replace",
-        bos_token="",
-        eos_token="",
-        sep_token="",
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        mask_token="",
-        add_prefix_space=False,
+        errors: str = "replace",
+        bos_token: str = "",
+        eos_token: str = "",
+        sep_token: str = "",
+        cls_token: str = "",
+        unk_token: str = "",
+        pad_token: str = "",
+        mask_token: str = "",
+        add_prefix_space: bool = False,
+        trim_offsets: bool = True,
+        vocab: Optional[dict] = None,
+        merges: Optional[list] = None,
         **kwargs,
     ):
-        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
-        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
-        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
-        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
-        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
-        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
-
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = (
-            AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False)
-            if isinstance(mask_token, str)
-            else mask_token
+        self.add_prefix_space = add_prefix_space
+        self.trim_offsets = trim_offsets
+
+        if vocab is not None:
+            self._vocab = (
+                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
+            )
+        else:
+            self._vocab = {
+                str(pad_token): 0,
+                str(unk_token): 1,
+                str(cls_token): 2,
+                str(sep_token): 3,
+                str(mask_token): 4,
+            }
+
+        if merges is not None:
+            self._merges = [tuple(merge) if isinstance(merge, list) else merge for merge in merges]
+        else:
+            self._merges = []
+
+        self._tokenizer = Tokenizer(
+            BPE(
+                vocab=self._vocab,
+                merges=self._merges,
+                dropout=None,
+                continuing_subword_prefix="",
+                end_of_word_suffix="",
+                fuse_unk=False,
+            )
         )
 
-        # these special tokens are not part of the vocab.json, let's add them in the correct order
-
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            bpe_merges = merges_handle.read().split("\n")[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        self.cache = {}
-        self.add_prefix_space = add_prefix_space
+        self._tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
+        self._tokenizer.decoder = decoders.ByteLevel()
+        self._tokenizer.post_processor = processors.RobertaProcessing(
+            sep=(str(sep_token), self._vocab.get(str(sep_token), 3)),
+            cls=(str(cls_token), self._vocab.get(str(cls_token), 2)),
+            add_prefix_space=add_prefix_space,
+            trim_offsets=trim_offsets,
+        )
 
-        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
-        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+        tokenizer_object = self._tokenizer
 
         super().__init__(
+            tokenizer_object=tokenizer_object,
             errors=errors,
             bos_token=bos_token,
             eos_token=eos_token,
-            unk_token=unk_token,
             sep_token=sep_token,
             cls_token=cls_token,
+            unk_token=unk_token,
             pad_token=pad_token,
             mask_token=mask_token,
             add_prefix_space=add_prefix_space,
+            trim_offsets=trim_offsets,
             **kwargs,
         )
 
-    @property
-    def vocab_size(self):
-        return len(self.encoder)
-
-    def get_vocab(self):
-        vocab = dict(self.encoder).copy()
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                except ValueError:
-                    new_word.extend(word[i:])
-                    break
-                else:
-                    new_word.extend(word[i:j])
-                    i = j
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        self.cache[token] = word
-        return word
-
-    def _tokenize(self, text):
-        """Tokenize a string."""
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            token = "".join(
-                self.byte_encoder[b] for b in token.encode("utf-8")
-            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
-            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
-        return bpe_tokens
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        text = "".join(tokens)
-        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
-        return text
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-        merge_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
-        )
-
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
-
-        return vocab_file, merge_file
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A RoBERTa sequence has the following format:
-
-        - single sequence: ` X `
-        - pair of sequences: ` A  B `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
-        make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
-        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
-        if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
-            text = " " + text
-        return (text, kwargs)
-
 
 __all__ = ["RobertaTokenizer"]
diff --git a/src/transformers/models/roberta/tokenization_roberta_fast.py b/src/transformers/models/roberta/tokenization_roberta_old.py
similarity index 99%
rename from src/transformers/models/roberta/tokenization_roberta_fast.py
rename to src/transformers/models/roberta/tokenization_roberta_old.py
index d9ddcfc82d49..782c1a0311f2 100644
--- a/src/transformers/models/roberta/tokenization_roberta_fast.py
+++ b/src/transformers/models/roberta/tokenization_roberta_old.py
@@ -20,7 +20,7 @@
 from tokenizers import processors
 
 from ...tokenization_utils_base import AddedToken, BatchEncoding
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...tokenization_utils_tokenizers import PreTrainedTokenizerFast
 from ...utils import logging
 from .tokenization_roberta import RobertaTokenizer
 
diff --git a/src/transformers/models/roc_bert/tokenization_roc_bert.py b/src/transformers/models/roc_bert/tokenization_roc_bert.py
index afe4e2992003..d6099896150d 100644
--- a/src/transformers/models/roc_bert/tokenization_roc_bert.py
+++ b/src/transformers/models/roc_bert/tokenization_roc_bert.py
@@ -21,7 +21,7 @@
 import unicodedata
 from typing import Optional, Union
 
-from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from ...tokenization_python import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
 from ...tokenization_utils_base import (
     ENCODE_KWARGS_DOCSTRING,
     ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
@@ -48,7 +48,6 @@
 }
 
 
-# Copied from transformers.models.bert.tokenization_bert.load_vocab
 def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
@@ -60,7 +59,6 @@ def load_vocab(vocab_file):
     return vocab
 
 
-# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
 def whitespace_tokenize(text):
     """Runs basic whitespace cleaning and splitting on a piece of text."""
     text = text.strip()
@@ -171,6 +169,243 @@ def __init__(
             **kwargs,
         )
 
+    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    def __call__(
+        self,
+        text: Union[TextInput, list[TextInput]],
+        text_pair: Optional[Union[TextInput, list[TextInput]]] = None,
+        text_target: Optional[Union[TextInput, list[TextInput]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        max_target_length: Optional[int] = None,
+        stride: int = 0,
+        is_split_into_words: bool = False,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        # Handle text_target for seq2seq tasks
+        if text_target is not None:
+            # Tokenize source text
+            encodings = self.__call__(
+                text=text,
+                text_pair=text_pair,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                is_split_into_words=is_split_into_words,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+            # Tokenize target text
+            target_length = max_target_length if max_target_length is not None else max_length
+            target_encodings = self.__call__(
+                text=text_target,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation if target_length is not None else False,
+                max_length=target_length,
+                stride=0,
+                is_split_into_words=is_split_into_words,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=False,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=False,
+                return_special_tokens_mask=False,
+                return_offsets_mapping=False,
+                return_length=False,
+                verbose=verbose,
+                **kwargs,
+            )
+            # Add labels from target input_ids
+            encodings["labels"] = target_encodings["input_ids"]
+            return encodings
+
+        # Detect batch vs single
+        is_batched = isinstance(text, (list, tuple)) and (
+            not is_split_into_words or (len(text) > 0 and isinstance(text[0], (list, tuple)))
+        )
+
+        if is_batched:
+            # Build batch tuples of (text, text_pair) if provided
+            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
+            return self.batch_encode_plus(
+                batch_text_or_text_pairs=batch_text_or_text_pairs,  # type: ignore[arg-type]
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                is_split_into_words=is_split_into_words,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+        else:
+            return self.encode_plus(
+                text=text,
+                text_pair=text_pair,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                is_split_into_words=is_split_into_words,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+
+    def encode_plus(
+        self,
+        text: Union[TextInput, PreTokenizedInput, EncodedInput],
+        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        is_split_into_words: bool = False,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        return self._encode_plus(
+            text=text,
+            text_pair=text_pair,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            is_split_into_words=is_split_into_words,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+    def batch_encode_plus(
+        self,
+        batch_text_or_text_pairs: Union[
+            list[TextInput],
+            list[TextInputPair],
+            list[PreTokenizedInput],
+            list[PreTokenizedInputPair],
+            list[EncodedInput],
+            list[EncodedInputPair],
+        ],
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        is_split_into_words: bool = False,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        return self._batch_encode_plus(
+            batch_text_or_text_pairs=batch_text_or_text_pairs,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            is_split_into_words=is_split_into_words,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
     @property
     def do_lower_case(self):
         return self.basic_tokenizer.do_lower_case
@@ -179,11 +414,9 @@ def do_lower_case(self):
     def vocab_size(self):
         return len(self.vocab)
 
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_vocab
     def get_vocab(self):
         return dict(self.vocab, **self.added_tokens_encoder)
 
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._tokenize
     def _tokenize(self, text, split_special_tokens=False):
         split_tokens = []
         if self.do_basic_tokenize:
@@ -418,9 +651,9 @@ def prepare_for_model(
                 stride=stride,
             )
 
-        if return_overflowing_tokens:
+        if return_overflowing_tokens and not return_tensors and overflowing_tokens:
             encoded_inputs["overflowing_tokens"] = overflowing_tokens
-            encoded_inputs["num_truncated_tokens"] = total_len - max_length
+            encoded_inputs["num_truncated_tokens"] = total_len - max_length if max_length else 0
 
         # Add special tokens
         if add_special_tokens:
@@ -644,7 +877,7 @@ def get_input_ids(text):
             verbose=verbose,
         )
 
-        return BatchEncoding(batch_outputs)
+        return batch_outputs
 
     @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def _batch_prepare_for_model(
@@ -720,11 +953,16 @@ def _batch_prepare_for_model(
             return_attention_mask=return_attention_mask,
         )
 
+        # Remove overflow-related keys before tensor conversion if return_tensors is set
+        # Slow tokenizers don't support returning these as tensors
+        if return_tensors and return_overflowing_tokens:
+            batch_outputs.pop("overflowing_tokens", None)
+            batch_outputs.pop("num_truncated_tokens", None)
+
         batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
 
         return batch_outputs
 
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_token_to_id
     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""
         return self.vocab.get(token, self.vocab.get(self.unk_token))
@@ -755,12 +993,10 @@ def convert_tokens_to_pronunciation_ids(self, tokens: Union[str, list[str]]) ->
             ids.append(self._convert_token_to_pronunciation_id(token))
         return ids
 
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer._convert_id_to_token
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
         return self.ids_to_tokens.get(index, self.unk_token)
 
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.convert_tokens_to_string
     def convert_tokens_to_string(self, tokens):
         """Converts a sequence of tokens (string) in a single string."""
         out_string = " ".join(tokens).replace(" ##", "").strip()
@@ -795,7 +1031,6 @@ def build_inputs_with_special_tokens(
             return cls + token_ids_0 + sep
         return cls + token_ids_0 + sep + token_ids_1 + sep
 
-    # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask
     def get_special_tokens_mask(
         self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
     ) -> list[int]:
@@ -869,7 +1104,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
         )
 
 
-# Copied from  transformers.models.bert.tokenization_bert.BasicTokenizer with BasicTokenizer->RoCBertBasicTokenizer
 class RoCBertBasicTokenizer:
     """
     Constructs a RoCBertBasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
@@ -1031,7 +1265,6 @@ def _clean_text(self, text):
         return "".join(output)
 
 
-# Copied from  transformers.models.bert.tokenization_bert.WordpieceTokenizer with WordpieceTokenizer->RoCBertWordpieceTokenizer
 class RoCBertWordpieceTokenizer:
     """Runs WordPiece tokenization."""
 
diff --git a/src/transformers/models/roformer/tokenization_roformer.py b/src/transformers/models/roformer/tokenization_roformer.py
index cf10a1b528a9..af44f4abea5e 100644
--- a/src/transformers/models/roformer/tokenization_roformer.py
+++ b/src/transformers/models/roformer/tokenization_roformer.py
@@ -19,7 +19,7 @@
 import unicodedata
 from typing import Optional
 
-from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from ...tokenization_python import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
 from ...utils import logging
 
 
@@ -28,7 +28,6 @@
 VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
 
 
-# Copied from transformers.models.bert.tokenization_bert.load_vocab
 def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
@@ -40,7 +39,6 @@ def load_vocab(vocab_file):
     return vocab
 
 
-# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
 def whitespace_tokenize(text):
     """Runs basic whitespace cleaning and splitting on a piece of text."""
     text = text.strip()
@@ -50,7 +48,6 @@ def whitespace_tokenize(text):
     return tokens
 
 
-# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
 class BasicTokenizer:
     """
     Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
@@ -212,7 +209,6 @@ def _clean_text(self, text):
         return "".join(output)
 
 
-# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
 class WordpieceTokenizer:
     """Runs WordPiece tokenization."""
 
diff --git a/src/transformers/models/roformer/tokenization_roformer_fast.py b/src/transformers/models/roformer/tokenization_roformer_fast.py
index c9704aa7190d..fb91a325419b 100644
--- a/src/transformers/models/roformer/tokenization_roformer_fast.py
+++ b/src/transformers/models/roformer/tokenization_roformer_fast.py
@@ -20,7 +20,7 @@
 from tokenizers import normalizers
 from tokenizers.pre_tokenizers import BertPreTokenizer, PreTokenizer
 
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...tokenization_utils_tokenizers import PreTrainedTokenizerFast
 from ...utils import logging
 from .tokenization_roformer import RoFormerTokenizer
 from .tokenization_utils import JiebaPreTokenizer
@@ -83,20 +83,26 @@ def __init__(
         )
 
         normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
-        if (
-            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
-            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
-        ):
-            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
-            normalizer_state["lowercase"] = do_lower_case
-            normalizer_state["strip_accents"] = strip_accents
-            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
-
-        # Make sure we correctly set the custom PreTokenizer
+        normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
+        normalizer_state["lowercase"] = do_lower_case
+        normalizer_state["strip_accents"] = strip_accents
+        self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
+
         vocab = self.backend_tokenizer.get_vocab()
         self.backend_tokenizer.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer(vocab))
 
         self.do_lower_case = do_lower_case
+        self.strip_accents = strip_accents
+
+    def _post_init(self):
+        super()._post_init()
+        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
+        normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
+        normalizer_state["lowercase"] = self.do_lower_case
+        normalizer_state["strip_accents"] = getattr(self, "strip_accents", None)
+        self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
+        vocab = self.backend_tokenizer.get_vocab()
+        self.backend_tokenizer.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer(vocab))
 
     def __getstate__(self):
         state = self.__dict__.copy()
@@ -145,7 +151,10 @@ def save_pretrained(
         **kwargs,
     ):
         self.backend_tokenizer.pre_tokenizer = BertPreTokenizer()
-        return super().save_pretrained(save_directory, legacy_format, filename_prefix, push_to_hub, **kwargs)
+        result = super().save_pretrained(save_directory, legacy_format, filename_prefix, push_to_hub, **kwargs)
+        vocab = self.backend_tokenizer.get_vocab()
+        self.backend_tokenizer.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer(vocab))
+        return result
 
 
 __all__ = ["RoFormerTokenizerFast"]
diff --git a/src/transformers/models/seamless_m4t/__init__.py b/src/transformers/models/seamless_m4t/__init__.py
index 0d289de0474a..27c08de501c1 100644
--- a/src/transformers/models/seamless_m4t/__init__.py
+++ b/src/transformers/models/seamless_m4t/__init__.py
@@ -23,7 +23,6 @@
     from .modeling_seamless_m4t import *
     from .processing_seamless_m4t import *
     from .tokenization_seamless_m4t import *
-    from .tokenization_seamless_m4t_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/seamless_m4t/tokenization_seamless_m4t.py b/src/transformers/models/seamless_m4t/tokenization_seamless_m4t.py
index 403922eee93c..5388ec2b95d8 100644
--- a/src/transformers/models/seamless_m4t/tokenization_seamless_m4t.py
+++ b/src/transformers/models/seamless_m4t/tokenization_seamless_m4t.py
@@ -12,42 +12,34 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization classes for SeamlessM4T."""
+"""Tokenization class for SeamlessM4T."""
 
-import os
-from shutil import copyfile
-from typing import Any, Optional, Union
+from typing import Optional, Union
 
-import sentencepiece as spm
+from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
+from tokenizers.models import BPE
 
-from ...convert_slow_tokenizer import import_protobuf
-from ...tokenization_utils import (
+from ...tokenization_python import (
     BatchEncoding,
     PreTokenizedInput,
-    PreTrainedTokenizer,
     TextInput,
 )
-from ...tokenization_utils_base import AddedToken
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import PaddingStrategy, logging
-from ...utils.import_utils import requires
 
 
 logger = logging.get_logger(__name__)
 
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
 
-SPIECE_UNDERLINE = "▁"
 
-
-VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
-
-
-@requires(backends=("sentencepiece",))
-class SeamlessM4TTokenizer(PreTrainedTokenizer):
+class SeamlessM4TTokenizer(TokenizersBackend):
     """
-    Construct a SeamlessM4T tokenizer.
+    Construct a SeamlessM4T tokenizer (backed by HuggingFace's *tokenizers* library). Based on
+    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=bpe#models).
 
-    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
-    [SentencePiece](https://github.com/google/sentencepiece).
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     The tokenization method is `  ` for source language documents, and `   ` for target language documents.
@@ -66,8 +58,10 @@ class SeamlessM4TTokenizer(PreTrainedTokenizer):
     ```
 
     Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
+        vocab (`list` or `dict`, *optional*):
+            List of (token, score) tuples or dict mapping tokens to indices. If not provided, uses default vocab.
+        merges (`list`, *optional*):
+            List of merge rules for BPE model. If not provided, uses empty list.
         bos_token (`str`, *optional*, defaults to `""`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
@@ -100,190 +94,158 @@ class SeamlessM4TTokenizer(PreTrainedTokenizer):
             token instead.
         pad_token (`str`, *optional*, defaults to `""`):
             The token used for padding, for example when batching sequences of different lengths.
-        tokenizer_file (`str`, *optional*):
-            The path to a tokenizer file to use instead of the vocab file.
         src_lang (`str`, *optional*, defaults to `"eng"`):
             The language to use as source language for translation.
         tgt_lang (`str`, *optional*, defaults to `"fra"`):
             The language to use as target language for translation.
-        sp_model_kwargs (`dict[str, Any]`, *optional*):
-            Additional keyword arguments to pass to the model initialization.
         additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
-            A tuple or a list of additional special tokens. Can be used to specify the list of languages that will be
-            supported by the tokenizer.
-        add_prefix_space (`bool`, *optional*, defaults to `True`):
-            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word.
+            A tuple or a list of additional special tokens.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
 
     prefix_tokens: list[int] = []
     suffix_tokens: list[int] = []
 
     def __init__(
         self,
-        vocab_file,
+        vocab: Optional[list] = None,
+        merges: Optional[list] = None,
         bos_token="",
         eos_token="",
         sep_token="",
         cls_token="",
         unk_token="",
         pad_token="",
-        tokenizer_file=None,
         src_lang="eng",
         tgt_lang="fra",
-        sp_model_kwargs: Optional[dict[str, Any]] = None,
         additional_special_tokens=None,
-        add_prefix_space=True,
+        keep_accents=None,
+        vocab_file=None,
         **kwargs,
     ):
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-        # Add this unused argument to keep some important Copied from statements
-        self.legacy = False
-        self.vocab_file = vocab_file
-
-        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
-
-        # Vocab    |    0    |    1    |   2    |    3    |  4   |  5   |  6   |   7  |   8  |  9
-        # -------- | ------- | ------- | ------ | ------- | ---- | ---- | ---- | ---- | ---- | ----
-        # spm  | ''   | '' | '' | 'an' | 'en' | '_d' | 'er' | 'in' | '_s' | '_a'
-        # fairseq  | ''   | '' | '' | '' | 'an' | 'en' | '▁d' | 'er' | 'in' | '▁s'
-
-        # Mimic fairseq token-to-id alignment for the first 4 token
-        self._added_tokens_decoder = {
-            0: AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token,
-            1: AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token,
-            2: AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token,
-            3: AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token,
-        }
+        if vocab is None:
+            vocab = {
+                str(pad_token): 0,
+                str(unk_token): 1,
+                str(bos_token): 2,
+                str(eos_token): 3,
+            }
+
+        # Process vocab - SeamlessM4T uses fairseq vocab alignment: =0, =1, =2, =3, then SPM pieces[3:]
+        if isinstance(vocab, list):
+            # Convert list of (token, score) tuples to dict {token: idx}
+            # Check if vocab is already in SeamlessM4T order (pad, unk, s, /s) or tokenizer.json order (unk, s, /s, ...)
+            first_tokens = [str(item[0]) if isinstance(item, (list, tuple)) else str(item) for item in vocab[:4]]
+            is_seamless_order = (
+                len(first_tokens) >= 4
+                and first_tokens[0] == str(pad_token)
+                and first_tokens[1] == str(unk_token)
+                and first_tokens[2] == str(bos_token)
+                and first_tokens[3] == str(eos_token)
+            )
 
-        # The first "real" token "an" has position 4 in the original fairseq vocab and position 3 in the spm vocab
-        self.fairseq_offset = 1
+            if is_seamless_order:
+                # Already in correct order, use list index directly as token ID
+                vocab_dict = {}
+                for idx, item in enumerate(vocab):
+                    token = str(item[0]) if isinstance(item, (list, tuple)) else str(item)
+                    vocab_dict[token] = idx
+                self._vocab = vocab_dict
+            else:
+                # Reorder to fairseq: , , , , ... (rest of vocab)
+                vocab_dict = {}
+                vocab_dict[str(pad_token)] = 0
+                vocab_dict[str(unk_token)] = 1
+                vocab_dict[str(bos_token)] = 2
+                vocab_dict[str(eos_token)] = 3
+
+                # Add rest of vocab starting from index 4, skipping tokens we already added
+                idx = 4
+                for item in vocab:
+                    token = str(item[0]) if isinstance(item, (list, tuple)) else str(item)
+                    if token not in vocab_dict:
+                        vocab_dict[token] = idx
+                        idx += 1
+
+                self._vocab = vocab_dict
+        else:
+            self._vocab = vocab
+
+        if merges is None:
+            self._merges = []
+        else:
+            self._merges = [tuple(merge) if isinstance(merge, list) else merge for merge in merges]
+
+        self._tokenizer = Tokenizer(
+            BPE(
+                vocab=self._vocab,
+                merges=self._merges,
+                dropout=None,
+                unk_token=str(unk_token),
+                fuse_unk=True,
+                byte_fallback=False,
+            )
+        )
+
+        self._tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.Replace(Regex(r"[\n\r\t]"), " "),
+                normalizers.NFKC(),
+                normalizers.Strip(left=False, right=True),
+                normalizers.Replace(Regex(r" +▁"), "▁"),
+                normalizers.Replace(Regex(r"^▁+$"), ""),
+                normalizers.Replace(Regex(r" {2,}"), "▁"),
+            ]
+        )
+
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="first", split=True)
 
-        self.sp_model_size = len(self.sp_model)
+        self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="first", split=True)
 
-        self._src_lang = f"__{src_lang}__" if "__" not in src_lang else src_lang
-        self._tgt_lang = f"__{tgt_lang}__" if "__" not in tgt_lang else tgt_lang
-        self.add_prefix_space = add_prefix_space
+        if "__" not in src_lang:
+            src_lang = f"__{src_lang}__"
+        if "__" not in tgt_lang:
+            tgt_lang = f"__{tgt_lang}__"
+
+        # V5: Convert additional_special_tokens parameter to extra_special_tokens for backward compatibility
+        # PreTrainedTokenizerBase.__init__() will handle the conversion, but we need to pass it via kwargs
+        if additional_special_tokens is not None:
+            kwargs.setdefault("additional_special_tokens", additional_special_tokens)
 
         super().__init__(
+            tokenizer_object=self._tokenizer,
             bos_token=bos_token,
             eos_token=eos_token,
-            unk_token=unk_token,
             sep_token=sep_token,
             cls_token=cls_token,
+            unk_token=unk_token,
             pad_token=pad_token,
-            tokenizer_file=tokenizer_file,
             src_lang=src_lang,
             tgt_lang=tgt_lang,
-            additional_special_tokens=additional_special_tokens,
-            sp_model_kwargs=self.sp_model_kwargs,
-            add_prefix_space=add_prefix_space,
+            keep_accents=keep_accents,
+            vocab_file=vocab_file,
             **kwargs,
         )
 
-        self.set_src_lang_special_tokens(self._src_lang)
-        self.set_tgt_lang_special_tokens(self._tgt_lang)
-
-    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.__getstate__
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
-        return state
-
-    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.__setstate__
-    def __setstate__(self, d):
-        self.__dict__ = d
-
-        # for backward compatibility
-        if not hasattr(self, "sp_model_kwargs"):
-            self.sp_model_kwargs = {}
-
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
-
-    @property
-    def vocab_size(self):
-        return len(self.sp_model)
-
-    def __call__(
-        self,
-        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
-        text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
-        text_target: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
-        text_pair_target: Optional[
-            Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]
-        ] = None,
-        padding: Union[bool, str, PaddingStrategy] = True,
-        pad_to_multiple_of: Optional[int] = 2,
-        src_lang: Optional[str] = None,
-        tgt_lang: Optional[str] = None,
-        **kwargs,
-    ):
-        """
-        Args:
-            text (`str`, `list[str]`, `list[list[str]]`, *optional*):
-                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
-                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
-                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            text_pair (`str`, `list[str]`, `list[list[str]]`, *optional*):
-                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
-                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
-                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            text_target (`str`, `list[str]`, `list[list[str]]`, *optional*):
-                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
-                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
-                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            text_pair_target (`str`, `list[str]`, `list[list[str]]`, *optional*):
-                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
-                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
-                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
-                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
-                 index) among:
-
-                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
-                  sequence if provided).
-                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-                  acceptable input length for the model if that argument is not provided.
-                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
-                  lengths).
-            pad_to_multiple_of (`int`, *optional*):
-                If set will pad the sequence to a multiple of the provided value.
-
-                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
-                `>= 7.5` (Volta).
-            src_lang (`str`, *optional*):
-                A string representing the source language. If not specified, the last `src_lang` specified (either
-                during initialization or when calling this tokenizer) will be used.
-            tgt_lang (`str`, *optional*):
-                A string representing the target language. If not specified, the last `tgt_lang` specified (either
-                during initialization or when calling this tokenizer) will be used.
-            kwargs (*optional*):
-                Remaining dictionary of keyword arguments that will be passed to [`PreTrainedTokenizer.__call__`].
-        """
-        if src_lang is not None:
-            self.src_lang = src_lang
-        if tgt_lang is not None:
-            self.tgt_lang = tgt_lang
+        # Build fairseq mappings
+        self.fairseq_offset = 1
+        self.fairseq_tokens_to_ids = {
+            "": 0,
+            "": 1,
+            "": 2,
+            "": 3,
+        }
+        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
 
-        output = super().__call__(
-            text=text,
-            text_pair=text_pair,
-            text_target=text_target,
-            text_pair_target=text_pair_target,
-            padding=padding,
-            pad_to_multiple_of=pad_to_multiple_of,
-            **kwargs,
-        )
+        self._src_lang = src_lang
+        self._tgt_lang = tgt_lang
 
-        return BatchEncoding(output, tensor_type=kwargs.get("return_tensors"))
+        self.set_tgt_lang_special_tokens(self._tgt_lang)
 
     @property
-    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.src_lang
     def src_lang(self) -> str:
         return self._src_lang
 
@@ -307,97 +269,12 @@ def tgt_lang(self, new_tgt_lang: str) -> None:
             self._tgt_lang = new_tgt_lang
         self.set_tgt_lang_special_tokens(self._tgt_lang)
 
-    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.get_special_tokens_mask
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        prefix_ones = [1] * len(self.prefix_tokens)
-        suffix_ones = [1] * len(self.suffix_tokens)
-        if token_ids_1 is None:
-            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
-        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
-
-    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.build_inputs_with_special_tokens
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An NLLB sequence has the following format, where `X` represents the sequence:
-
-        - `input_ids` (for encoder) `X [eos, src_lang_code]`
-        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
-
-        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
-        separator.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
-        # We don't expect to process pairs, but leave the pair logic for API consistency
-        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
-
-    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.create_token_type_ids_from_sequences
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. nllb does not
-        make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-
-        """
-
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
     def _build_translation_inputs(
         self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
     ):
         """Used by translation pipeline, to prepare inputs for the generate function"""
         if src_lang is None or tgt_lang is None:
-            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model.")
+            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
         self.src_lang = src_lang
         inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
         if "__" not in tgt_lang:
@@ -406,129 +283,61 @@ def _build_translation_inputs(
         inputs["forced_bos_token_id"] = tgt_lang_id
         return inputs
 
-    def get_vocab(self):
-        vocab = {
-            self.convert_ids_to_tokens(i): i for i in range(self.fairseq_offset, self.vocab_size + self.fairseq_offset)
-        }
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    @property
-    def unk_token_length(self):
-        return len(self.sp_model.encode(str(self.unk_token)))
-
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor
-    def get_spm_processor(self, from_slow=False):
-        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        if self.legacy or from_slow:  # no dependency on protobuf
-            tokenizer.Load(self.vocab_file)
-            return tokenizer
-
-        with open(self.vocab_file, "rb") as f:
-            sp_model = f.read()
-            model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
-            model = model_pb2.ModelProto.FromString(sp_model)
-            normalizer_spec = model_pb2.NormalizerSpec()
-            normalizer_spec.add_dummy_prefix = False
-            model.normalizer_spec.MergeFrom(normalizer_spec)
-            sp_model = model.SerializeToString()
-            tokenizer.LoadFromSerializedProto(sp_model)
-        return tokenizer
-
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
-    def tokenize(self, text: "TextInput", **kwargs) -> list[str]:
-        """
-        Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
-        first token is special.
-        """
-        if self.legacy or len(text) == 0:
-            return super().tokenize(text, **kwargs)
-
-        text = text.replace(SPIECE_UNDERLINE, " ")
-        if self.add_prefix_space:
-            text = SPIECE_UNDERLINE + text
-
-        tokens = super().tokenize(text, **kwargs)
-
-        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
-            tokens = tokens[1:]
-        return tokens
-
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
-    def _tokenize(self, text, **kwargs):
-        """
-        Returns a tokenized string.
-
-        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
-        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
-        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
-        `unk_token`. Here is an example with `unk_token = ""` and `unk_token_length = 4`.
-        `self.tokenizer.sp_model.encode(" Hey", out_type = str)[4:]`.
-        """
-        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
-            return self.sp_model.encode(text, out_type=str)
-
-        # 1. Encode string + prefix ex: " Hey"
-        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
-        # 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
-        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        spm_id = self.sp_model.PieceToId(token)
-
-        # Need to return unknown token if the SP model returned 0
-        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.sp_model.IdToPiece(index - self.fairseq_offset)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (strings for sub-words) in a single string."""
-        # since we manually add the prefix space, we have to remove it when decoding
-        if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space:
-            tokens[0] = tokens[0][1:]
-
-        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
-        return out_string
-
-    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (out_vocab_file,)
-
-    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.prepare_seq2seq_batch with eng_Latn->eng, fra_Latn->fra
     def prepare_seq2seq_batch(
         self,
         src_texts: list[str],
         src_lang: str = "eng",
         tgt_texts: Optional[list[str]] = None,
         tgt_lang: str = "fra",
+        max_length: Optional[int] = None,
+        max_target_length: Optional[int] = None,
+        padding: str = "longest",
+        return_tensors: Optional[str] = None,
+        truncation: bool = True,
         **kwargs,
     ) -> BatchEncoding:
         self.src_lang = src_lang
         self.tgt_lang = tgt_lang
-        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
 
-    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer._switch_to_input_mode
+        if max_length is None:
+            max_length = self.model_max_length
+
+        model_inputs = self(
+            src_texts,
+            add_special_tokens=True,
+            return_tensors=return_tensors,
+            max_length=max_length,
+            padding=padding,
+            truncation=truncation,
+            **kwargs,
+        )
+
+        if tgt_texts is None:
+            return model_inputs
+
+        # Process tgt_texts
+        if max_target_length is None:
+            max_target_length = max_length
+
+        self._switch_to_target_mode()
+        labels = self(
+            tgt_texts,
+            add_special_tokens=True,
+            return_tensors=return_tensors,
+            padding=padding,
+            max_length=max_target_length,
+            truncation=truncation,
+            **kwargs,
+        )
+        model_inputs["labels"] = labels["input_ids"]
+
+        self._switch_to_input_mode()
+
+        return model_inputs
+
     def _switch_to_input_mode(self):
         return self.set_src_lang_special_tokens(self.src_lang)
 
-    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer._switch_to_target_mode
     def _switch_to_target_mode(self):
         return self.set_tgt_lang_special_tokens(self.tgt_lang)
 
@@ -537,31 +346,119 @@ def set_src_lang_special_tokens(self, src_lang) -> None:
         Prefix=[src_lang_code], suffix = [eos]
         """
         self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
-        self.init_kwargs["src_lang"] = src_lang
 
         if self.cur_lang_code == self.unk_token_id:
             logger.warning_once(
-                f"`src_lang={src_lang}` has not be found in the vocabulary. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id."
+                f"`src_lang={src_lang}` has not be found in the `vocabulary`. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id."
             )
 
         self.prefix_tokens = [self.cur_lang_code]
         self.suffix_tokens = [self.eos_token_id]
 
-    # https://github.com/facebookresearch/fairseq2/blob/c53f18e6be6b8b46b722f2249b8397b7eccd7ad3/src/fairseq2/models/nllb/tokenizer.py#L112-L116
+        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
+        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
+            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
+            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
+        )
+
     def set_tgt_lang_special_tokens(self, lang: str) -> None:
         """Reset the special tokens to the target lang setting.
         Prefix=[eos, tgt_lang_code] and suffix=[eos].
         """
         self.cur_lang_code = self.convert_tokens_to_ids(lang)
-        self.init_kwargs["tgt_lang"] = lang
 
         if self.cur_lang_code == self.unk_token_id:
             logger.warning_once(
-                f"`tgt_lang={lang}` has not be found in the vocabulary. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id."
+                f"`tgt_lang={lang}` has not be found in the `vocabulary`. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id."
             )
 
         self.prefix_tokens = [self.eos_token_id, self.cur_lang_code]
         self.suffix_tokens = [self.eos_token_id]
 
+        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
+        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
+            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
+            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
+        )
+
+    def __call__(
+        self,
+        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        text_target: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        text_pair_target: Optional[
+            Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]
+        ] = None,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        pad_to_multiple_of: Optional[int] = None,
+        src_lang: Optional[str] = None,
+        tgt_lang: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Args:
+            text (`str`, `list[str]`, `list[list[str]]`, *optional*):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            text_pair (`str`, `list[str]`, `list[list[str]]`, *optional*):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            text_target (`str`, `list[str]`, `list[list[str]]`, *optional*):
+                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
+                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
+                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            text_pair_target (`str`, `list[str]`, `list[list[str]]`, *optional*):
+                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
+                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
+                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                 index) among:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            pad_to_multiple_of (`int`, *optional*, defaults to `None`):
+                If set will pad the sequence to a multiple of the provided value.
+
+                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta).
+            src_lang (`str`, *optional*):
+                A string representing the source language. If not specified, the last `src_lang` specified (either
+                during initialization or when calling this tokenizer) will be used.
+            tgt_lang (`str`, *optional*):
+                A string representing the target language. If not specified, the last `tgt_lang` specified (either
+                during initialization or when calling this tokenizer) will be used.
+            kwargs (*optional*):
+                Remaining dictionary of keyword arguments that will be passed to [`TokenizersBackend.__call__`].
+        """
+        if src_lang is not None:
+            self.src_lang = src_lang
+        if tgt_lang is not None:
+            self.tgt_lang = tgt_lang
+
+        output = super().__call__(
+            text=text,
+            text_pair=text_pair,
+            text_target=text_target,
+            text_pair_target=text_pair_target,
+            padding=padding,
+            pad_to_multiple_of=pad_to_multiple_of,
+            **kwargs,
+        )
+
+        return output
+
 
 __all__ = ["SeamlessM4TTokenizer"]
diff --git a/src/transformers/models/seamless_m4t/tokenization_seamless_m4t_fast.py b/src/transformers/models/seamless_m4t/tokenization_seamless_m4t_fast.py
deleted file mode 100644
index 081dcec7dd99..000000000000
--- a/src/transformers/models/seamless_m4t/tokenization_seamless_m4t_fast.py
+++ /dev/null
@@ -1,446 +0,0 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Fast Tokenization class for SeamlessM4T."""
-
-import os
-from shutil import copyfile
-from typing import Optional, Union
-
-from tokenizers import processors
-
-from ...tokenization_utils import (
-    BatchEncoding,
-    PreTokenizedInput,
-    TextInput,
-)
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import PaddingStrategy, is_sentencepiece_available, logging
-
-
-if is_sentencepiece_available():
-    from .tokenization_seamless_m4t import SeamlessM4TTokenizer
-else:
-    SeamlessM4TTokenizer = None
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
-
-
-class SeamlessM4TTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" SeamlessM4T tokenizer (backed by HuggingFace's *tokenizers* library). Based on
-    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    The tokenization method is `  ` for source language documents, and `   ` for target language documents.
-
-    Examples:
-
-    ```python
-    >>> from transformers import SeamlessM4TTokenizerFast
-
-    >>> tokenizer = SeamlessM4TTokenizerFast.from_pretrained(
-    ...     "facebook/hf-seamless-m4t-medium", src_lang="eng", tgt_lang="fra"
-    ... )
-    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
-    >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie."
-    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt")
-    ```
-
-    Args:
-        vocab_file (`str`, *optional*):
-            Path to the vocabulary file.
-        tokenizer_file (`str`, *optional*):
-            The path to a tokenizer file to use instead of the vocab file.
-        bos_token (`str`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        src_lang (`str`, *optional*, defaults to `"eng"`):
-            The language to use as source language for translation.
-        tgt_lang (`str`, *optional*, defaults to `"fra"`):
-            The language to use as target language for translation.
-        additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
-            A tuple or a list of additional special tokens.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = SeamlessM4TTokenizer
-    model_input_names = ["input_ids", "attention_mask"]
-
-    prefix_tokens: list[int] = []
-    suffix_tokens: list[int] = []
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        bos_token="",
-        eos_token="",
-        sep_token="",
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        src_lang="eng",
-        tgt_lang="fra",
-        additional_special_tokens=None,
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file=vocab_file,
-            tokenizer_file=tokenizer_file,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            src_lang=src_lang,
-            tgt_lang=tgt_lang,
-            additional_special_tokens=additional_special_tokens,
-            **kwargs,
-        )
-
-        self.vocab_file = vocab_file
-        self._src_lang = f"__{src_lang}__" if "__" not in src_lang else src_lang
-        self._tgt_lang = f"__{tgt_lang}__" if "__" not in tgt_lang else tgt_lang
-        self.set_src_lang_special_tokens(self._src_lang)
-        self.set_tgt_lang_special_tokens(self._tgt_lang)
-
-    @property
-    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.src_lang
-    def src_lang(self) -> str:
-        return self._src_lang
-
-    @src_lang.setter
-    def src_lang(self, new_src_lang: str) -> None:
-        if "__" not in new_src_lang:
-            self._src_lang = f"__{new_src_lang}__"
-        else:
-            self._src_lang = new_src_lang
-        self.set_src_lang_special_tokens(self._src_lang)
-
-    @property
-    def tgt_lang(self) -> str:
-        return self._tgt_lang
-
-    @tgt_lang.setter
-    def tgt_lang(self, new_tgt_lang: str) -> None:
-        if "__" not in new_tgt_lang:
-            self._tgt_lang = f"__{new_tgt_lang}__"
-        else:
-            self._tgt_lang = new_tgt_lang
-        self.set_tgt_lang_special_tokens(self._tgt_lang)
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. The special tokens depend on calling set_lang.
-
-        An SeamlessM4T sequence has the following format, where `X` represents the sequence:
-
-        - `input_ids` (for encoder) `[src_lang_code] X [eos]`
-        - `decoder_input_ids`: (for decoder) `[eos, tgt_lang_code] X [eos]`
-
-        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
-        separator.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
-        # We don't expect to process pairs, but leave the pair logic for API consistency
-        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
-
-    # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast.create_token_type_ids_from_sequences
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. nllb does not
-        make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-
-        """
-
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-    def _build_translation_inputs(
-        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
-    ):
-        """Used by translation pipeline, to prepare inputs for the generate function"""
-        if src_lang is None or tgt_lang is None:
-            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
-        self.src_lang = src_lang
-        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
-        if "__" not in tgt_lang:
-            tgt_lang = f"__{tgt_lang}__"
-        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
-        inputs["forced_bos_token_id"] = tgt_lang_id
-        return inputs
-
-    # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast.prepare_seq2seq_batch with "fra_Latn"->"fra", "eng_Latn"->"eng"
-    def prepare_seq2seq_batch(
-        self,
-        src_texts: list[str],
-        src_lang: str = "eng",
-        tgt_texts: Optional[list[str]] = None,
-        tgt_lang: str = "fra",
-        **kwargs,
-    ) -> BatchEncoding:
-        self.src_lang = src_lang
-        self.tgt_lang = tgt_lang
-        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
-
-    # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast._switch_to_input_mode
-    def _switch_to_input_mode(self):
-        return self.set_src_lang_special_tokens(self.src_lang)
-
-    # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast._switch_to_target_mode
-    def _switch_to_target_mode(self):
-        return self.set_tgt_lang_special_tokens(self.tgt_lang)
-
-    def set_src_lang_special_tokens(self, src_lang) -> None:
-        """Reset the special tokens to the source lang setting.
-        Prefix=[src_lang_code], suffix = [eos]
-        """
-        self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
-
-        if self.cur_lang_code == self.unk_token_id:
-            logger.warning_once(
-                f"`tgt_lang={src_lang}` has not be found in the `vocabulary`. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id."
-            )
-
-        self.init_kwargs["src_lang"] = src_lang
-
-        self.prefix_tokens = [self.cur_lang_code]
-        self.suffix_tokens = [self.eos_token_id]
-
-        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
-        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
-
-        self._tokenizer.post_processor = processors.TemplateProcessing(
-            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
-            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
-            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
-        )
-
-    def set_tgt_lang_special_tokens(self, lang: str) -> None:
-        """Reset the special tokens to the target lang setting.
-        Prefix=[eos, tgt_lang_code] and suffix=[eos].
-        """
-        self.cur_lang_code = self.convert_tokens_to_ids(lang)
-
-        if self.cur_lang_code == self.unk_token_id:
-            logger.warning_once(
-                f"`tgt_lang={lang}` has not be found in the `vocabulary`. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id."
-            )
-
-        self.init_kwargs["tgt_lang"] = lang
-
-        self.prefix_tokens = [self.eos_token_id, self.cur_lang_code]
-        self.suffix_tokens = [self.eos_token_id]
-
-        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
-        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
-
-        self._tokenizer.post_processor = processors.TemplateProcessing(
-            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
-            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
-            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
-        )
-
-    # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not self.can_save_slow_tokenizer:
-            raise ValueError(
-                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
-                "tokenizer."
-            )
-
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
-
-    @classmethod
-    def _from_pretrained(
-        cls,
-        resolved_vocab_files,
-        pretrained_model_name_or_path,
-        init_configuration,
-        *init_inputs,
-        token=None,
-        cache_dir=None,
-        local_files_only=False,
-        _commit_hash=None,
-        _is_local=False,
-        **kwargs,
-    ):
-        tokenizer = super()._from_pretrained(
-            resolved_vocab_files,
-            pretrained_model_name_or_path,
-            init_configuration,
-            *init_inputs,
-            token=token,
-            cache_dir=cache_dir,
-            local_files_only=local_files_only,
-            _commit_hash=_commit_hash,
-            _is_local=_is_local,
-            **kwargs,
-        )
-
-        # ensure also set after from pretrained
-        tokenizer.set_src_lang_special_tokens(tokenizer._src_lang)
-        tokenizer.set_tgt_lang_special_tokens(tokenizer._tgt_lang)
-
-        return tokenizer
-
-    def __call__(
-        self,
-        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
-        text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
-        text_target: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
-        text_pair_target: Optional[
-            Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]
-        ] = None,
-        padding: Union[bool, str, PaddingStrategy] = True,
-        pad_to_multiple_of: Optional[int] = 2,
-        src_lang: Optional[str] = None,
-        tgt_lang: Optional[str] = None,
-        **kwargs,
-    ):
-        """
-        Args:
-            text (`str`, `list[str]`, `list[list[str]]`, *optional*):
-                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
-                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
-                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            text_pair (`str`, `list[str]`, `list[list[str]]`, *optional*):
-                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
-                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
-                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            text_target (`str`, `list[str]`, `list[list[str]]`, *optional*):
-                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
-                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
-                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            text_pair_target (`str`, `list[str]`, `list[list[str]]`, *optional*):
-                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
-                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
-                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
-                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
-                 index) among:
-
-                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
-                  sequence if provided).
-                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-                  acceptable input length for the model if that argument is not provided.
-                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
-                  lengths).
-            pad_to_multiple_of (`int`, *optional*):
-                If set will pad the sequence to a multiple of the provided value.
-
-                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
-                `>= 7.5` (Volta).
-            src_lang (`str`, *optional*):
-                A string representing the source language. If not specified, the last `src_lang` specified (either
-                during initialization or when calling this tokenizer) will be used.
-            tgt_lang (`str`, *optional*):
-                A string representing the target language. If not specified, the last `tgt_lang` specified (either
-                during initialization or when calling this tokenizer) will be used.
-            kwargs (*optional*):
-                Remaining dictionary of keyword arguments that will be passed to [`PreTrainedTokenizerFast.__call__`].
-        """
-        if src_lang is not None:
-            self.src_lang = src_lang
-        if tgt_lang is not None:
-            self.tgt_lang = tgt_lang
-
-        output = super().__call__(
-            text=text,
-            text_pair=text_pair,
-            text_target=text_target,
-            text_pair_target=text_pair_target,
-            padding=padding,
-            pad_to_multiple_of=pad_to_multiple_of,
-            **kwargs,
-        )
-
-        return output
-
-
-__all__ = ["SeamlessM4TTokenizerFast"]
diff --git a/src/transformers/models/siglip/tokenization_siglip.py b/src/transformers/models/siglip/tokenization_siglip.py
index 7a82a7a411ab..95007c864f11 100644
--- a/src/transformers/models/siglip/tokenization_siglip.py
+++ b/src/transformers/models/siglip/tokenization_siglip.py
@@ -23,9 +23,8 @@
 
 import sentencepiece as spm
 
-from ...convert_slow_tokenizer import import_protobuf
-from ...tokenization_utils import PreTrainedTokenizer
 from ...tokenization_utils_base import AddedToken
+from ...tokenization_utils_sentencepiece import SentencePieceBackend
 
 
 if TYPE_CHECKING:
@@ -43,7 +42,7 @@
 
 
 @requires(backends=("sentencepiece",))
-class SiglipTokenizer(PreTrainedTokenizer):
+class SiglipTokenizer(SentencePieceBackend):
     """
     Construct a Siglip tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
 
@@ -118,14 +117,10 @@ def __init__(
         )
 
         self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-
         self.do_lower_case = do_lower_case
-        self.vocab_file = vocab_file
-
-        self.sp_model = self.get_spm_processor()
-        self.vocab_file = vocab_file
 
         super().__init__(
+            vocab_file=vocab_file,
             eos_token=eos_token,
             unk_token=unk_token,
             pad_token=pad_token,
@@ -136,31 +131,15 @@ def __init__(
             **kwargs,
         )
 
-    def get_spm_processor(self):
-        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        with open(self.vocab_file, "rb") as f:
-            sp_model = f.read()
-            model_pb2 = import_protobuf()
-            model = model_pb2.ModelProto.FromString(sp_model)
-            normalizer_spec = model_pb2.NormalizerSpec()
-            normalizer_spec.add_dummy_prefix = False
-            model.normalizer_spec.MergeFrom(normalizer_spec)
-            sp_model = model.SerializeToString()
-            tokenizer.LoadFromSerializedProto(sp_model)
-        return tokenizer
-
     @property
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.vocab_size
     def vocab_size(self):
         return self.sp_model.get_piece_size()
 
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_vocab
     def get_vocab(self):
         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
         vocab.update(self.added_tokens_encoder)
         return vocab
 
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_special_tokens_mask
     def get_special_tokens_mask(
         self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
     ) -> list[int]:
@@ -189,7 +168,6 @@ def get_special_tokens_mask(
             return ([0] * len(token_ids_0)) + [1]
         return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
 
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._add_eos_if_not_present
     def _add_eos_if_not_present(self, token_ids: list[int]) -> list[int]:
         """Do not add eos again if user already added it."""
         if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
@@ -201,7 +179,6 @@ def _add_eos_if_not_present(self, token_ids: list[int]) -> list[int]:
         else:
             return token_ids + [self.eos_token_id]
 
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.create_token_type_ids_from_sequences
     def create_token_type_ids_from_sequences(
         self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
     ) -> list[int]:
@@ -224,7 +201,6 @@ def create_token_type_ids_from_sequences(
             return len(token_ids_0 + eos) * [0]
         return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
 
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.build_inputs_with_special_tokens
     def build_inputs_with_special_tokens(
         self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
     ) -> list[int]:
@@ -251,13 +227,11 @@ def build_inputs_with_special_tokens(
             token_ids_1 = self._add_eos_if_not_present(token_ids_1)
             return token_ids_0 + token_ids_1
 
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.__getstate__
     def __getstate__(self):
         state = self.__dict__.copy()
         state["sp_model"] = None
         return state
 
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.__setstate__
     def __setstate__(self, d):
         self.__dict__ = d
 
@@ -282,6 +256,9 @@ def canonicalize_text(self, text, *, keep_punctuation_exact_string=None):
                 If provided, then this exact string is kept. For example providing '{}' will keep any occurrences of '{}'
                 (but will still remove '{' and '}' that appear separately).
         """
+        if self.do_lower_case:
+            text = text.lower()
+
         if keep_punctuation_exact_string:
             text = keep_punctuation_exact_string.join(
                 self.remove_punctuation(part) for part in text.split(keep_punctuation_exact_string)
@@ -304,7 +281,6 @@ def tokenize(self, text: "TextInput", add_special_tokens=False, **kwargs) -> lis
         return tokens
 
     @property
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.unk_token_length
     def unk_token_length(self):
         return len(self.sp_model.encode(str(self.unk_token)))
 
@@ -328,12 +304,10 @@ def _tokenize(self, text, **kwargs):
         # 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
         return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
 
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._convert_token_to_id
     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""
         return self.sp_model.piece_to_id(token)
 
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._convert_id_to_token
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
         token = self.sp_model.IdToPiece(index)
@@ -358,7 +332,6 @@ def convert_tokens_to_string(self, tokens):
         out_string += self.sp_model.decode(current_sub_tokens)
         return out_string.strip()
 
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.save_vocabulary
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
         if not os.path.isdir(save_directory):
             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
diff --git a/src/transformers/models/speech_to_text/tokenization_speech_to_text.py b/src/transformers/models/speech_to_text/tokenization_speech_to_text.py
index e1e694cb9583..f31a476102fe 100644
--- a/src/transformers/models/speech_to_text/tokenization_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/tokenization_speech_to_text.py
@@ -22,7 +22,7 @@
 
 import sentencepiece
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_python import PreTrainedTokenizer
 from ...utils import logging
 from ...utils.import_utils import requires
 
diff --git a/src/transformers/models/speecht5/convert_speecht5_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/speecht5/convert_speecht5_original_pytorch_checkpoint_to_pytorch.py
index 10a834f11a9e..cfc267486041 100644
--- a/src/transformers/models/speecht5/convert_speecht5_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/speecht5/convert_speecht5_original_pytorch_checkpoint_to_pytorch.py
@@ -28,7 +28,7 @@
     SpeechT5Tokenizer,
     logging,
 )
-from transformers.tokenization_utils import AddedToken
+from transformers.tokenization_python import AddedToken
 
 
 logging.set_verbosity_info()
diff --git a/src/transformers/models/speecht5/tokenization_speecht5.py b/src/transformers/models/speecht5/tokenization_speecht5.py
index de19db8568e5..6da12b3a3e6b 100644
--- a/src/transformers/models/speecht5/tokenization_speecht5.py
+++ b/src/transformers/models/speecht5/tokenization_speecht5.py
@@ -14,13 +14,9 @@
 # limitations under the License.
 """Tokenization class for SpeechT5."""
 
-import os
-from shutil import copyfile
 from typing import Any, Optional
 
-import sentencepiece as spm
-
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_utils_sentencepiece import SentencePieceBackend
 from ...utils import logging
 from ...utils.import_utils import requires
 from .number_normalizer import EnglishNumberNormalizer
@@ -32,7 +28,7 @@
 
 
 @requires(backends=("sentencepiece",))
-class SpeechT5Tokenizer(PreTrainedTokenizer):
+class SpeechT5Tokenizer(SentencePieceBackend):
     """
     Construct a SpeechT5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
 
@@ -77,6 +73,7 @@ class SpeechT5Tokenizer(PreTrainedTokenizer):
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    is_fast = False
 
     def __init__(
         self,
@@ -89,21 +86,21 @@ def __init__(
         sp_model_kwargs: Optional[dict[str, Any]] = None,
         **kwargs,
     ) -> None:
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-        self.vocab_file = vocab_file
         self.normalize = normalize
         self._normalizer = None
 
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(vocab_file)
+        # Prepare sp_model_kwargs for parent class
+        if sp_model_kwargs is not None:
+            kwargs["sp_model_kwargs"] = sp_model_kwargs
 
+        # Call parent init (which will load sp_model)
         super().__init__(
+            vocab_file=vocab_file,
             bos_token=bos_token,
             eos_token=eos_token,
             unk_token=unk_token,
             pad_token=pad_token,
             normalize=normalize,
-            sp_model_kwargs=self.sp_model_kwargs,
             **kwargs,
         )
 
@@ -115,10 +112,6 @@ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
             text = self.normalizer(text)
         return (text, kwargs)
 
-    @property
-    def vocab_size(self):
-        return self.sp_model.get_piece_size()
-
     @property
     def normalizer(self):
         if self._normalizer is None:
@@ -129,59 +122,6 @@ def normalizer(self):
     def normalizer(self, value):
         self._normalizer = value
 
-    def get_vocab(self):
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__ = d
-
-        # for backward compatibility
-        if not hasattr(self, "sp_model_kwargs"):
-            self.sp_model_kwargs = {}
-
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(self.vocab_file)
-
-    def _tokenize(self, text: str) -> list[str]:
-        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
-        return self.sp_model.encode(text, out_type=str)
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.sp_model.piece_to_id(token)
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
-        return token
-
-    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        current_sub_tokens = []
-        out_string = ""
-        prev_is_special = False
-        for token in tokens:
-            # make sure that special tokens are not decoded using sentencepiece model
-            if token in self.all_special_tokens:
-                if not prev_is_special:
-                    out_string += " "
-                out_string += self.sp_model.decode(current_sub_tokens) + token
-                prev_is_special = True
-                current_sub_tokens = []
-            else:
-                current_sub_tokens.append(token)
-                prev_is_special = False
-        out_string += self.sp_model.decode(current_sub_tokens)
-        return out_string.strip()
-
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> list[int]:
         """Build model inputs from a sequence by appending eos_token_id."""
         if token_ids_1 is None:
@@ -202,22 +142,26 @@ def get_special_tokens_mask(
             return ([0] * len(token_ids_0)) + suffix_ones
         return ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
 
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (out_vocab_file,)
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. SpeechT5 does not
+        make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of zeros.
+        """
+        eos = [self.eos_token_id]
+        if token_ids_1 is None:
+            return len(token_ids_0 + eos) * [0]
+        return len(token_ids_0 + token_ids_1 + eos) * [0]
 
 
 __all__ = ["SpeechT5Tokenizer"]
diff --git a/src/transformers/models/splinter/tokenization_splinter.py b/src/transformers/models/splinter/tokenization_splinter.py
index f2ccafe12d5a..757f063e6cee 100644
--- a/src/transformers/models/splinter/tokenization_splinter.py
+++ b/src/transformers/models/splinter/tokenization_splinter.py
@@ -16,11 +16,12 @@
 """Tokenization classes for Splinter."""
 
 import collections
-import os
-import unicodedata
 from typing import Optional
 
-from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
+from tokenizers.models import WordPiece
+
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
 
 
@@ -30,7 +31,6 @@
 
 
 def load_vocab(vocab_file):
-    """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
     with open(vocab_file, "r", encoding="utf-8") as reader:
         tokens = reader.readlines()
@@ -40,468 +40,157 @@ def load_vocab(vocab_file):
     return vocab
 
 
-def whitespace_tokenize(text):
-    """Runs basic whitespace cleaning and splitting on a piece of text."""
-    text = text.strip()
-    if not text:
-        return []
-    tokens = text.split()
-    return tokens
-
-
-class SplinterTokenizer(PreTrainedTokenizer):
+class SplinterTokenizer(TokenizersBackend):
     r"""
-    Construct a Splinter tokenizer. Based on WordPiece.
+    Construct a Splinter tokenizer (backed by HuggingFace's tokenizers library). Based on WordPiece.
 
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
+        vocab_file (`str`, *optional*):
+            Path to a vocabulary file.
+        tokenizer_file (`str`, *optional*):
+            Path to a tokenizers JSON file containing the serialization of a tokenizer.
         do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
-            Whether or not to do basic tokenization before WordPiece.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
         unk_token (`str`, *optional*, defaults to `"[UNK]"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
         sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
+            The separator token, which is used when building a sequence from multiple sequences.
         pad_token (`str`, *optional*, defaults to `"[PAD]"`):
             The token used for padding, for example when batching sequences of different lengths.
         cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+            The classifier token which is used when doing sequence classification.
         mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
+            The token used for masking values.
         question_token (`str`, *optional*, defaults to `"[QUESTION]"`):
             The token used for constructing question representations.
         tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
             Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
         strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original BERT).
+            value for `lowercase`.
+        vocab (`dict`, *optional*):
+            Custom vocabulary dictionary. If not provided, a minimal vocabulary is created.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
 
     def __init__(
         self,
-        vocab_file,
-        do_lower_case=True,
-        do_basic_tokenize=True,
-        never_split=None,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        question_token="[QUESTION]",
-        tokenize_chinese_chars=True,
-        strip_accents=None,
+        do_lower_case: bool = True,
+        unk_token: str = "[UNK]",
+        sep_token: str = "[SEP]",
+        pad_token: str = "[PAD]",
+        cls_token: str = "[CLS]",
+        mask_token: str = "[MASK]",
+        question_token: str = "[QUESTION]",
+        tokenize_chinese_chars: bool = True,
+        strip_accents: Optional[bool] = None,
+        vocab: Optional[dict] = None,
         **kwargs,
     ):
-        if not os.path.isfile(vocab_file):
-            raise ValueError(
-                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
-                " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
-            )
-        self.vocab = load_vocab(vocab_file)
-        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
-        self.do_basic_tokenize = do_basic_tokenize
-        if do_basic_tokenize:
-            self.basic_tokenizer = BasicTokenizer(
-                do_lower_case=do_lower_case,
-                never_split=never_split,
-                tokenize_chinese_chars=tokenize_chinese_chars,
-                strip_accents=strip_accents,
+        if vocab is not None:
+            self._vocab = (
+                {token: idx for idx, (token, _score) in enumerate(vocab)} if isinstance(vocab, list) else vocab
             )
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
-        self.question_token = question_token
+        else:
+            self._vocab = {
+                str(pad_token): 0,
+                str(unk_token): 1,
+                str(cls_token): 2,
+                str(sep_token): 3,
+                str(mask_token): 4,
+                str(question_token): 5,
+                ".": 6,
+            }
+
+        self._tokenizer = Tokenizer(WordPiece(self._vocab, unk_token=str(unk_token)))
+
+        self._tokenizer.normalizer = normalizers.BertNormalizer(
+            clean_text=True,
+            handle_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            lowercase=do_lower_case,
+        )
+        self._tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
+        self._tokenizer.decoder = decoders.WordPiece(prefix="##")
+
+        tokenizer_object = self._tokenizer
+
         super().__init__(
-            do_lower_case=do_lower_case,
-            do_basic_tokenize=do_basic_tokenize,
-            never_split=never_split,
+            tokenizer_object=tokenizer_object,
             unk_token=unk_token,
             sep_token=sep_token,
             pad_token=pad_token,
             cls_token=cls_token,
             mask_token=mask_token,
             question_token=question_token,
+            do_lower_case=do_lower_case,
             tokenize_chinese_chars=tokenize_chinese_chars,
             strip_accents=strip_accents,
             **kwargs,
         )
 
-    @property
-    def question_token_id(self):
-        """
-        `Optional[int]`: Id of the question token in the vocabulary, used to condition the answer on a question
-        representation.
-        """
-        return self.convert_tokens_to_ids(self.question_token)
-
-    @property
-    def do_lower_case(self):
-        return self.basic_tokenizer.do_lower_case
-
-    @property
-    def vocab_size(self):
-        return len(self.vocab)
-
-    def get_vocab(self):
-        return dict(self.vocab, **self.added_tokens_encoder)
-
-    def _tokenize(self, text):
-        split_tokens = []
-        if self.do_basic_tokenize:
-            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
-                # If the token is part of the never_split set
-                if token in self.basic_tokenizer.never_split:
-                    split_tokens.append(token)
-                else:
-                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
-        else:
-            split_tokens = self.wordpiece_tokenizer.tokenize(text)
-        return split_tokens
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.vocab.get(token, self.vocab.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.ids_to_tokens.get(index, self.unk_token)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        out_string = " ".join(tokens).replace(" ##", "").strip()
-        return out_string
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a pair of sequence for question answering tasks by concatenating and adding special
-        tokens. A Splinter sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences for question answering: `[CLS] question_tokens [QUESTION] . [SEP] context_tokens [SEP]`
-
-        Args:
-            token_ids_0 (`list[int]`):
-                The question token IDs if pad_on_right, else context tokens IDs
-            token_ids_1 (`list[int]`, *optional*):
-                The context token IDs if pad_on_right, else question token IDs
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        question_suffix = [self.question_token_id] + [self.convert_tokens_to_ids(".")]
-        if self.padding_side == "right":
-            # Input is question-then-context
-            return cls + token_ids_0 + question_suffix + sep + token_ids_1 + sep
-        else:
-            # Input is context-then-question
-            return cls + token_ids_0 + sep + token_ids_1 + question_suffix + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create the token type IDs corresponding to the sequences passed. [What are token type
-        IDs?](../glossary#token-type-ids)
-
-        Should be overridden in a subclass if the model has a special way of building those.
-
-        Args:
-            token_ids_0 (`list[int]`): The first tokenized sequence.
-            token_ids_1 (`list[int]`, *optional*): The second tokenized sequence.
-
-        Returns:
-            `list[int]`: The token type ids.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        question_suffix = [self.question_token_id] + [self.convert_tokens_to_ids(".")]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
+        if hasattr(self, "_tokenizer") and self._tokenizer.normalizer is not None:
+            import json
+
+            pre_tok_state = json.loads(self._tokenizer.normalizer.__getstate__())
+            if (
+                pre_tok_state.get("lowercase", do_lower_case) != do_lower_case
+                or pre_tok_state.get("strip_accents", strip_accents) != strip_accents
+                or pre_tok_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
+            ):
+                pre_tok_class = getattr(normalizers, pre_tok_state.pop("type"))
+                pre_tok_state["lowercase"] = do_lower_case
+                pre_tok_state["strip_accents"] = strip_accents
+                pre_tok_state["handle_chinese_chars"] = tokenize_chinese_chars
+                self._tokenizer.normalizer = pre_tok_class(**pre_tok_state)
 
-        if self.padding_side == "right":
-            # Input is question-then-context
-            return len(cls + token_ids_0 + question_suffix + sep) * [0] + len(token_ids_1 + sep) * [1]
-        else:
-            # Input is context-then-question
-            return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + question_suffix + sep) * [1]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        index = 0
-        if os.path.isdir(save_directory):
-            vocab_file = os.path.join(
-                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-            )
-        else:
-            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
-        with open(vocab_file, "w", encoding="utf-8") as writer:
-            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
-                        " Please check that the vocabulary is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(token + "\n")
-                index += 1
-        return (vocab_file,)
-
-
-class BasicTokenizer:
-    """
-    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
-
-    Args:
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original BERT).
-    """
-
-    def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
-        if never_split is None:
-            never_split = []
         self.do_lower_case = do_lower_case
-        self.never_split = set(never_split)
         self.tokenize_chinese_chars = tokenize_chinese_chars
         self.strip_accents = strip_accents
+        self.question_token = question_token
+        if self.question_token not in self.all_special_tokens:
+            self.add_tokens([self.question_token], special_tokens=True)
+        self.update_post_processor()
 
-    def tokenize(self, text, never_split=None):
-        """
-        Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
-        WordPieceTokenizer.
-
-        Args:
-            **never_split**: (*optional*) list of str
-                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
-        """
-        # union() returns a new set by concatenating the two sets.
-        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
-        text = self._clean_text(text)
-
-        # This was added on November 1st, 2018 for the multilingual and Chinese
-        # models. This is also applied to the English models now, but it doesn't
-        # matter since the English models were not trained on any Chinese data
-        # and generally don't have any Chinese data in them (there are Chinese
-        # characters in the vocabulary because Wikipedia does have some Chinese
-        # words in the English Wikipedia.).
-        if self.tokenize_chinese_chars:
-            text = self._tokenize_chinese_chars(text)
-        orig_tokens = whitespace_tokenize(text)
-        split_tokens = []
-        for token in orig_tokens:
-            if token not in never_split:
-                if self.do_lower_case:
-                    token = token.lower()
-                    if self.strip_accents is not False:
-                        token = self._run_strip_accents(token)
-                elif self.strip_accents:
-                    token = self._run_strip_accents(token)
-            split_tokens.extend(self._run_split_on_punc(token, never_split))
-
-        output_tokens = whitespace_tokenize(" ".join(split_tokens))
-        return output_tokens
-
-    def _run_strip_accents(self, text):
-        """Strips accents from a piece of text."""
-        text = unicodedata.normalize("NFD", text)
-        output = []
-        for char in text:
-            cat = unicodedata.category(char)
-            if cat == "Mn":
-                continue
-            output.append(char)
-        return "".join(output)
-
-    def _run_split_on_punc(self, text, never_split=None):
-        """Splits punctuation on a piece of text."""
-        if never_split is not None and text in never_split:
-            return [text]
-        chars = list(text)
-        i = 0
-        start_new_word = True
-        output = []
-        while i < len(chars):
-            char = chars[i]
-            if _is_punctuation(char):
-                output.append([char])
-                start_new_word = True
-            else:
-                if start_new_word:
-                    output.append([])
-                start_new_word = False
-                output[-1].append(char)
-            i += 1
-
-        return ["".join(x) for x in output]
-
-    def _tokenize_chinese_chars(self, text):
-        """Adds whitespace around any CJK character."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if self._is_chinese_char(cp):
-                output.append(" ")
-                output.append(char)
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-    def _is_chinese_char(self, cp):
-        """Checks whether CP is the codepoint of a CJK character."""
-        # This defines a "chinese character" as anything in the CJK Unicode block:
-        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-        #
-        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-        # despite its name. The modern Korean Hangul alphabet is a different block,
-        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-        # space-separated words, so they are not treated specially and handled
-        # like the all of the other languages.
-        if (
-            (cp >= 0x4E00 and cp <= 0x9FFF)
-            or (cp >= 0x3400 and cp <= 0x4DBF)
-            or (cp >= 0x20000 and cp <= 0x2A6DF)
-            or (cp >= 0x2A700 and cp <= 0x2B73F)
-            or (cp >= 0x2B740 and cp <= 0x2B81F)
-            or (cp >= 0x2B820 and cp <= 0x2CEAF)
-            or (cp >= 0xF900 and cp <= 0xFAFF)
-            or (cp >= 0x2F800 and cp <= 0x2FA1F)
-        ):
-            return True
-
-        return False
-
-    def _clean_text(self, text):
-        """Performs invalid character removal and whitespace cleanup on text."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if cp == 0 or cp == 0xFFFD or _is_control(char):
-                continue
-            if _is_whitespace(char):
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-
-class WordpieceTokenizer:
-    """Runs WordPiece tokenization."""
-
-    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
-        self.vocab = vocab
-        self.unk_token = unk_token
-        self.max_input_chars_per_word = max_input_chars_per_word
-
-    def tokenize(self, text):
-        """
-        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
-        tokenization using the given vocabulary.
-
-        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
-
-        Args:
-          text: A single token or whitespace separated tokens. This should have
-            already been passed through *BasicTokenizer*.
-
-        Returns:
-          A list of wordpiece tokens.
-        """
+    @property
+    def question_token_id(self):
+        return self.convert_tokens_to_ids(self.question_token)
 
-        output_tokens = []
-        for token in whitespace_tokenize(text):
-            chars = list(token)
-            if len(chars) > self.max_input_chars_per_word:
-                output_tokens.append(self.unk_token)
-                continue
+    def update_post_processor(self):
+        cls = self.cls_token
+        sep = self.sep_token
+        question = self.question_token
+        dot = "."
+        cls_token_id = self.cls_token_id
+        sep_token_id = self.sep_token_id
+        question_token_id = self.question_token_id
+        dot_token_id = self.convert_tokens_to_ids(".")
 
-            is_bad = False
-            start = 0
-            sub_tokens = []
-            while start < len(chars):
-                end = len(chars)
-                cur_substr = None
-                while start < end:
-                    substr = "".join(chars[start:end])
-                    if start > 0:
-                        substr = "##" + substr
-                    if substr in self.vocab:
-                        cur_substr = substr
-                        break
-                    end -= 1
-                if cur_substr is None:
-                    is_bad = True
-                    break
-                sub_tokens.append(cur_substr)
-                start = end
+        if cls is None or sep is None:
+            return
 
-            if is_bad:
-                output_tokens.append(self.unk_token)
-            else:
-                output_tokens.extend(sub_tokens)
-        return output_tokens
+        if self.padding_side == "right":
+            pair = f"{cls}:0 $A:0 {question} {dot} {sep}:0 $B:1 {sep}:1"
+        else:
+            pair = f"{cls}:0 $A:0 {sep}:0 $B:1 {question} {dot} {sep}:1"
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"{cls}:0 $A:0 {sep}:0",
+            pair=pair,
+            special_tokens=[
+                (cls, cls_token_id),
+                (sep, sep_token_id),
+                (question, question_token_id),
+                (dot, dot_token_id),
+            ],
+        )
 
 
 __all__ = ["SplinterTokenizer"]
diff --git a/src/transformers/models/splinter/tokenization_splinter_fast.py b/src/transformers/models/splinter/tokenization_splinter_fast.py
deleted file mode 100644
index 548d8e6c63b0..000000000000
--- a/src/transformers/models/splinter/tokenization_splinter_fast.py
+++ /dev/null
@@ -1,193 +0,0 @@
-# coding=utf-8
-# Copyright 2021 Tel AViv University, AllenAI and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Fast Tokenization classes for Splinter."""
-
-import json
-from typing import Optional
-
-from tokenizers import normalizers
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .tokenization_splinter import SplinterTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
-
-class SplinterTokenizerFast(PreTrainedTokenizerFast):
-    r"""
-    Construct a "fast" Splinter tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        question_token (`str`, *optional*, defaults to `"[QUESTION]"`):
-            The token used for constructing question representations.
-        clean_text (`bool`, *optional*, defaults to `True`):
-            Whether or not to clean the text before tokenization by removing any control characters and replacing all
-            whitespaces by the classic one.
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
-            issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original BERT).
-        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
-            The prefix for subwords.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = SplinterTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        do_lower_case=True,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        question_token="[QUESTION]",
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            do_lower_case=do_lower_case,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            tokenize_chinese_chars=tokenize_chinese_chars,
-            strip_accents=strip_accents,
-            additional_special_tokens=(question_token,),
-            **kwargs,
-        )
-
-        pre_tok_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
-        if (
-            pre_tok_state.get("lowercase", do_lower_case) != do_lower_case
-            or pre_tok_state.get("strip_accents", strip_accents) != strip_accents
-        ):
-            pre_tok_class = getattr(normalizers, pre_tok_state.pop("type"))
-            pre_tok_state["lowercase"] = do_lower_case
-            pre_tok_state["strip_accents"] = strip_accents
-            self.backend_tokenizer.normalizer = pre_tok_class(**pre_tok_state)
-
-        self.do_lower_case = do_lower_case
-
-    @property
-    def question_token_id(self):
-        """
-        `Optional[int]`: Id of the question token in the vocabulary, used to condition the answer on a question
-        representation.
-        """
-        return self.convert_tokens_to_ids(self.question_token)
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a pair of sequence for question answering tasks by concatenating and adding special
-        tokens. A Splinter sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences for question answering: `[CLS] question_tokens [QUESTION] . [SEP] context_tokens [SEP]`
-
-        Args:
-            token_ids_0 (`list[int]`):
-                The question token IDs if pad_on_right, else context tokens IDs
-            token_ids_1 (`list[int]`, *optional*):
-                The context token IDs if pad_on_right, else question token IDs
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        question_suffix = [self.question_token_id] + [self.convert_tokens_to_ids(".")]
-        if self.padding_side == "right":
-            # Input is question-then-context
-            return cls + token_ids_0 + question_suffix + sep + token_ids_1 + sep
-        else:
-            # Input is context-then-question
-            return cls + token_ids_0 + sep + token_ids_1 + question_suffix + sep
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create the token type IDs corresponding to the sequences passed. [What are token type
-        IDs?](../glossary#token-type-ids)
-
-        Should be overridden in a subclass if the model has a special way of building those.
-
-        Args:
-            token_ids_0 (`list[int]`): The first tokenized sequence.
-            token_ids_1 (`list[int]`, *optional*): The second tokenized sequence.
-
-        Returns:
-            `list[int]`: The token type ids.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        question_suffix = [self.question_token_id] + [self.convert_tokens_to_ids(".")]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-
-        if self.padding_side == "right":
-            # Input is question-then-context
-            return len(cls + token_ids_0 + question_suffix + sep) * [0] + len(token_ids_1 + sep) * [1]
-        else:
-            # Input is context-then-question
-            return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + question_suffix + sep) * [1]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-
-__all__ = ["SplinterTokenizerFast"]
diff --git a/src/transformers/models/squeezebert/__init__.py b/src/transformers/models/squeezebert/__init__.py
index e0a760d2b5ca..9008fbeae8b0 100644
--- a/src/transformers/models/squeezebert/__init__.py
+++ b/src/transformers/models/squeezebert/__init__.py
@@ -20,8 +20,7 @@
 if TYPE_CHECKING:
     from .configuration_squeezebert import *
     from .modeling_squeezebert import *
-    from .tokenization_squeezebert import *
-    from .tokenization_squeezebert_fast import *
+    from .tokenization_squeezebert import SqueezeBertTokenizer, SqueezeBertTokenizerFast
 else:
     import sys
 
diff --git a/src/transformers/models/squeezebert/tokenization_squeezebert.py b/src/transformers/models/squeezebert/tokenization_squeezebert.py
index 4834f6681667..01f812a8e510 100644
--- a/src/transformers/models/squeezebert/tokenization_squeezebert.py
+++ b/src/transformers/models/squeezebert/tokenization_squeezebert.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2020 The SqueezeBert authors and The HuggingFace Inc. team.
+# Copyright 2024 The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,471 +12,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization classes for SqueezeBERT."""
+"""Tokenization classes for SqueezeBERT model."""
 
-import collections
-import os
-import unicodedata
-from typing import Optional
+from ..bert.tokenization_bert import BertTokenizer
 
-from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
-from ...utils import logging
 
+# SqueezeBertTokenizer is an alias for BertTokenizer
+SqueezeBertTokenizer = BertTokenizer
 
-logger = logging.get_logger(__name__)
+# SqueezeBertTokenizerFast is an alias for SqueezeBertTokenizer (since BertTokenizer is already a fast tokenizer)
+SqueezeBertTokenizerFast = SqueezeBertTokenizer
 
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
-
-
-# Copied from transformers.models.bert.tokenization_bert.load_vocab
-def load_vocab(vocab_file):
-    """Loads a vocabulary file into a dictionary."""
-    vocab = collections.OrderedDict()
-    with open(vocab_file, "r", encoding="utf-8") as reader:
-        tokens = reader.readlines()
-    for index, token in enumerate(tokens):
-        token = token.rstrip("\n")
-        vocab[token] = index
-    return vocab
-
-
-# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize
-def whitespace_tokenize(text):
-    """Runs basic whitespace cleaning and splitting on a piece of text."""
-    text = text.strip()
-    if not text:
-        return []
-    tokens = text.split()
-    return tokens
-
-
-# Copied from transformers.models.bert.tokenization_bert.BertTokenizer with Bert->SqueezeBert,BERT->SqueezeBERT
-class SqueezeBertTokenizer(PreTrainedTokenizer):
-    r"""
-    Construct a SqueezeBERT tokenizer. Based on WordPiece.
-
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
-            Whether or not to do basic tokenization before WordPiece.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
-        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original SqueezeBERT).
-        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
-            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
-            extra spaces.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-
-    def __init__(
-        self,
-        vocab_file,
-        do_lower_case=True,
-        do_basic_tokenize=True,
-        never_split=None,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        clean_up_tokenization_spaces=True,
-        **kwargs,
-    ):
-        if not os.path.isfile(vocab_file):
-            raise ValueError(
-                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
-                " model use `tokenizer = SqueezeBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
-            )
-        self.vocab = load_vocab(vocab_file)
-        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
-        self.do_basic_tokenize = do_basic_tokenize
-        if do_basic_tokenize:
-            self.basic_tokenizer = BasicTokenizer(
-                do_lower_case=do_lower_case,
-                never_split=never_split,
-                tokenize_chinese_chars=tokenize_chinese_chars,
-                strip_accents=strip_accents,
-            )
-
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
-
-        super().__init__(
-            do_lower_case=do_lower_case,
-            do_basic_tokenize=do_basic_tokenize,
-            never_split=never_split,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            tokenize_chinese_chars=tokenize_chinese_chars,
-            strip_accents=strip_accents,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            **kwargs,
-        )
-
-    @property
-    def do_lower_case(self):
-        return self.basic_tokenizer.do_lower_case
-
-    @property
-    def vocab_size(self):
-        return len(self.vocab)
-
-    def get_vocab(self):
-        return dict(self.vocab, **self.added_tokens_encoder)
-
-    def _tokenize(self, text, split_special_tokens=False):
-        split_tokens = []
-        if self.do_basic_tokenize:
-            for token in self.basic_tokenizer.tokenize(
-                text, never_split=self.all_special_tokens if not split_special_tokens else None
-            ):
-                # If the token is part of the never_split set
-                if token in self.basic_tokenizer.never_split:
-                    split_tokens.append(token)
-                else:
-                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
-        else:
-            split_tokens = self.wordpiece_tokenizer.tokenize(text)
-        return split_tokens
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.vocab.get(token, self.vocab.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.ids_to_tokens.get(index, self.unk_token)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        out_string = " ".join(tokens).replace(" ##", "").strip()
-        return out_string
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A SqueezeBERT sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        index = 0
-        if os.path.isdir(save_directory):
-            vocab_file = os.path.join(
-                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-            )
-        else:
-            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
-        with open(vocab_file, "w", encoding="utf-8") as writer:
-            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
-                        " Please check that the vocabulary is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(token + "\n")
-                index += 1
-        return (vocab_file,)
-
-
-# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
-class BasicTokenizer:
-    """
-    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
-
-    Args:
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        never_split (`Iterable`, *optional*):
-            Collection of tokens which will never be split during tokenization. Only has an effect when
-            `do_basic_tokenize=True`
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters.
-
-            This should likely be deactivated for Japanese (see this
-            [issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original BERT).
-        do_split_on_punc (`bool`, *optional*, defaults to `True`):
-            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
-            the full context of the words, such as contractions.
-    """
-
-    def __init__(
-        self,
-        do_lower_case=True,
-        never_split=None,
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        do_split_on_punc=True,
-    ):
-        if never_split is None:
-            never_split = []
-        self.do_lower_case = do_lower_case
-        self.never_split = set(never_split)
-        self.tokenize_chinese_chars = tokenize_chinese_chars
-        self.strip_accents = strip_accents
-        self.do_split_on_punc = do_split_on_punc
-
-    def tokenize(self, text, never_split=None):
-        """
-        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
-
-        Args:
-            never_split (`List[str]`, *optional*)
-                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
-        """
-        # union() returns a new set by concatenating the two sets.
-        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
-        text = self._clean_text(text)
-
-        # This was added on November 1st, 2018 for the multilingual and Chinese
-        # models. This is also applied to the English models now, but it doesn't
-        # matter since the English models were not trained on any Chinese data
-        # and generally don't have any Chinese data in them (there are Chinese
-        # characters in the vocabulary because Wikipedia does have some Chinese
-        # words in the English Wikipedia.).
-        if self.tokenize_chinese_chars:
-            text = self._tokenize_chinese_chars(text)
-        # prevents treating the same character with different unicode codepoints as different characters
-        unicode_normalized_text = unicodedata.normalize("NFC", text)
-        orig_tokens = whitespace_tokenize(unicode_normalized_text)
-        split_tokens = []
-        for token in orig_tokens:
-            if token not in never_split:
-                if self.do_lower_case:
-                    token = token.lower()
-                    if self.strip_accents is not False:
-                        token = self._run_strip_accents(token)
-                elif self.strip_accents:
-                    token = self._run_strip_accents(token)
-            split_tokens.extend(self._run_split_on_punc(token, never_split))
-
-        output_tokens = whitespace_tokenize(" ".join(split_tokens))
-        return output_tokens
-
-    def _run_strip_accents(self, text):
-        """Strips accents from a piece of text."""
-        text = unicodedata.normalize("NFD", text)
-        output = []
-        for char in text:
-            cat = unicodedata.category(char)
-            if cat == "Mn":
-                continue
-            output.append(char)
-        return "".join(output)
-
-    def _run_split_on_punc(self, text, never_split=None):
-        """Splits punctuation on a piece of text."""
-        if not self.do_split_on_punc or (never_split is not None and text in never_split):
-            return [text]
-        chars = list(text)
-        i = 0
-        start_new_word = True
-        output = []
-        while i < len(chars):
-            char = chars[i]
-            if _is_punctuation(char):
-                output.append([char])
-                start_new_word = True
-            else:
-                if start_new_word:
-                    output.append([])
-                start_new_word = False
-                output[-1].append(char)
-            i += 1
-
-        return ["".join(x) for x in output]
-
-    def _tokenize_chinese_chars(self, text):
-        """Adds whitespace around any CJK character."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if self._is_chinese_char(cp):
-                output.append(" ")
-                output.append(char)
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-    def _is_chinese_char(self, cp):
-        """Checks whether CP is the codepoint of a CJK character."""
-        # This defines a "chinese character" as anything in the CJK Unicode block:
-        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
-        #
-        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
-        # despite its name. The modern Korean Hangul alphabet is a different block,
-        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
-        # space-separated words, so they are not treated specially and handled
-        # like the all of the other languages.
-        if (
-            (cp >= 0x4E00 and cp <= 0x9FFF)
-            or (cp >= 0x3400 and cp <= 0x4DBF)
-            or (cp >= 0x20000 and cp <= 0x2A6DF)
-            or (cp >= 0x2A700 and cp <= 0x2B73F)
-            or (cp >= 0x2B740 and cp <= 0x2B81F)
-            or (cp >= 0x2B820 and cp <= 0x2CEAF)
-            or (cp >= 0xF900 and cp <= 0xFAFF)
-            or (cp >= 0x2F800 and cp <= 0x2FA1F)
-        ):
-            return True
-
-        return False
-
-    def _clean_text(self, text):
-        """Performs invalid character removal and whitespace cleanup on text."""
-        output = []
-        for char in text:
-            cp = ord(char)
-            if cp == 0 or cp == 0xFFFD or _is_control(char):
-                continue
-            if _is_whitespace(char):
-                output.append(" ")
-            else:
-                output.append(char)
-        return "".join(output)
-
-
-class WordpieceTokenizer:
-    """Runs WordPiece tokenization."""
-
-    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
-        self.vocab = vocab
-        self.unk_token = unk_token
-        self.max_input_chars_per_word = max_input_chars_per_word
-
-    def tokenize(self, text):
-        """
-        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
-        tokenization using the given vocabulary.
-
-        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
-
-        Args:
-            text: A single token or whitespace separated tokens. This should have
-                already been passed through *BasicTokenizer*.
-
-        Returns:
-            A list of wordpiece tokens.
-        """
-
-        output_tokens = []
-        for token in whitespace_tokenize(text):
-            chars = list(token)
-            if len(chars) > self.max_input_chars_per_word:
-                output_tokens.append(self.unk_token)
-                continue
-
-            is_bad = False
-            start = 0
-            sub_tokens = []
-            while start < len(chars):
-                end = len(chars)
-                cur_substr = None
-                while start < end:
-                    substr = "".join(chars[start:end])
-                    if start > 0:
-                        substr = "##" + substr
-                    if substr in self.vocab:
-                        cur_substr = substr
-                        break
-                    end -= 1
-                if cur_substr is None:
-                    is_bad = True
-                    break
-                sub_tokens.append(cur_substr)
-                start = end
-
-            if is_bad:
-                output_tokens.append(self.unk_token)
-            else:
-                output_tokens.extend(sub_tokens)
-        return output_tokens
-
-
-__all__ = ["SqueezeBertTokenizer"]
+__all__ = ["SqueezeBertTokenizer", "SqueezeBertTokenizerFast"]
diff --git a/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py b/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py
deleted file mode 100644
index b6e460d43a64..000000000000
--- a/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The SqueezeBert authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for SqueezeBERT."""
-
-import json
-from typing import Optional
-
-from tokenizers import normalizers
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .tokenization_squeezebert import SqueezeBertTokenizer
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
-
-
-# Copied from transformers.models.bert.tokenization_bert_fast.BertTokenizerFast with Bert->SqueezeBert,BERT->SqueezeBERT
-class SqueezeBertTokenizerFast(PreTrainedTokenizerFast):
-    r"""
-    Construct a "fast" SqueezeBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            File containing the vocabulary.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether or not to lowercase the input when tokenizing.
-        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        clean_text (`bool`, *optional*, defaults to `True`):
-            Whether or not to clean the text before tokenization by removing any control characters and replacing all
-            whitespaces by the classic one.
-        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
-            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
-            issue](https://github.com/huggingface/transformers/issues/328)).
-        strip_accents (`bool`, *optional*):
-            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for `lowercase` (as in the original SqueezeBERT).
-        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
-            The prefix for subwords.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class = SqueezeBertTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        do_lower_case=True,
-        unk_token="[UNK]",
-        sep_token="[SEP]",
-        pad_token="[PAD]",
-        cls_token="[CLS]",
-        mask_token="[MASK]",
-        tokenize_chinese_chars=True,
-        strip_accents=None,
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            do_lower_case=do_lower_case,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            tokenize_chinese_chars=tokenize_chinese_chars,
-            strip_accents=strip_accents,
-            **kwargs,
-        )
-
-        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
-        if (
-            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
-            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
-            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
-        ):
-            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
-            normalizer_state["lowercase"] = do_lower_case
-            normalizer_state["strip_accents"] = strip_accents
-            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
-            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
-
-        self.do_lower_case = do_lower_case
-
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A SqueezeBERT sequence has the following format:
-
-        - single sequence: `[CLS] X [SEP]`
-        - pair of sequences: `[CLS] A [SEP] B [SEP]`
-
-        Args:
-            token_ids_0 (`List[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`List[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-
-        if token_ids_1 is not None:
-            output += token_ids_1 + [self.sep_token_id]
-
-        return output
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-        return tuple(files)
-
-
-__all__ = ["SqueezeBertTokenizerFast"]
diff --git a/src/transformers/models/t5/__init__.py b/src/transformers/models/t5/__init__.py
index cdbf8a9937a7..5ba44f774dba 100644
--- a/src/transformers/models/t5/__init__.py
+++ b/src/transformers/models/t5/__init__.py
@@ -21,7 +21,6 @@
     from .configuration_t5 import *
     from .modeling_t5 import *
     from .tokenization_t5 import *
-    from .tokenization_t5_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py
index 0a25271345cf..6b5b660d281a 100644
--- a/src/transformers/models/t5/tokenization_t5.py
+++ b/src/transformers/models/t5/tokenization_t5.py
@@ -14,43 +14,30 @@
 # limitations under the License.
 """Tokenization class for model T5."""
 
-import os
 import re
-import warnings
-from shutil import copyfile
-from typing import TYPE_CHECKING, Any, Optional
 
-import sentencepiece as spm
+from tokenizers import Tokenizer, decoders, pre_tokenizers, processors
+from tokenizers.models import Unigram
 
-from ...convert_slow_tokenizer import import_protobuf
-from ...tokenization_utils import PreTrainedTokenizer
-from ...tokenization_utils_base import AddedToken
-
-
-if TYPE_CHECKING:
-    from ...tokenization_utils_base import TextInput
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
-from ...utils.import_utils import requires
 
 
 logger = logging.get_logger(__name__)
 
-VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
-
-
-SPIECE_UNDERLINE = "▁"
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
 
 
-@requires(backends=("sentencepiece",))
-class T5Tokenizer(PreTrainedTokenizer):
+class T5Tokenizer(TokenizersBackend):
     """
-    Construct a T5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
+    Construct a T5 tokenizer (backed by HuggingFace's *tokenizers* library). Based on
+    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
 
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
+        vocab_file (`str`, *optional*):
             [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
         eos_token (`str`, *optional*, defaults to `""`):
@@ -69,86 +56,34 @@ class T5Tokenizer(PreTrainedTokenizer):
         pad_token (`str`, *optional*, defaults to `""`):
             The token used for padding, for example when batching sequences of different lengths.
         extra_ids (`int`, *optional*, defaults to 100):
-           Add a number of extra ids added to the vocabulary for use as sentinels. These tokens are
-            accessible as "" where "{%d}" is a number between 0 and extra_ids-1. These tokens can be
-            retrieved by calling get_sentinel_tokens method and token ids can be by calling get_sentinel_token_ids
-            method
-         additional_special_tokens (`list[str]`, *optional*):
+            Add a number of extra ids added to the vocabulary for use as sentinels. These tokens are accessible as
+            "" where "{%d}" is a number between 0 and extra_ids-1. These tokens can be retrieved by
+            calling get_sentinel_tokens method and token ids can be by calling get_sentinel_token_ids method
+        additional_special_tokens (`list[str]`, *optional*):
             Additional special tokens used by the tokenizer.
-        sp_model_kwargs (`dict`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
-            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
-            to set:
-
-            - `enable_sampling`: Enable subword regularization.
-            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
-
-              - `nbest_size = {0,1}`: No sampling is performed.
-              - `nbest_size > 1`: samples from the nbest_size results.
-              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
-                using forward-filtering-and-backward-sampling algorithm.
-
-            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
-              BPE-dropout.
-        legacy (`bool`, *optional*):
-            Whether or not the `legacy` behaviour of the tokenizer should be used. Legacy is before the merge of #24622
-            and #25224 which includes fixes to properly handle tokens that appear after special tokens. A simple
-            example:
-
-            - `legacy=True`:
-            ```python
-            >>> from transformers import T5Tokenizer
-
-            >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=True)
-            >>> tokenizer.encode("Hello .")
-            [8774, 32099, 3, 5, 1]
-            ```
-            - `legacy=False`:
-            ```python
-            >>> from transformers import T5Tokenizer
-
-            >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=False)
-            >>> tokenizer.encode("Hello .")  # the extra space `[3]` is no longer here
-            [8774, 32099, 5, 1]
-            ```
-            Checkout the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
-        add_prefix_space (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word.
-
-    Attributes:
-        sp_model (`SentencePieceProcessor`):
-            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
+        vocab (`dict`, *optional*):
+            Custom vocabulary dict. If not provided, a minimal vocabulary is created using the special tokens.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
 
     def __init__(
         self,
-        vocab_file,
         eos_token="",
         unk_token="",
         pad_token="",
         extra_ids=100,
         additional_special_tokens=None,
-        sp_model_kwargs: Optional[dict[str, Any]] = None,
-        legacy=None,
-        add_prefix_space=True,
+        vocab=None,
+        vocab_file=None,
         **kwargs,
-    ) -> None:
-        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
-        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
-        eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
-
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-
+    ):
         self.vocab_file = vocab_file
         self._extra_ids = extra_ids
 
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(vocab_file)
-
+        # Handle extra_ids and additional_special_tokens
         if additional_special_tokens is not None:
             extra_tokens = [x for x in additional_special_tokens if "" for i in range(extra_ids)]
             additional_special_tokens = extra_tokens
 
-        # for legacy purpose, we keep this. Will be removed and tests updated. (when `added_tokens_decoder` is not passed as kwargs)
-        self._added_tokens_decoder = {}
-        for i in range(len(extra_tokens)):
-            self._added_tokens_decoder[len(self.sp_model) - 1 + extra_ids - i] = AddedToken(
-                f"", single_word=False, lstrip=True, rstrip=True, special=True, normalized=False
+        # T5 vocab structure: =0, =1, =2, then regular vocab, then extra_ids in reverse
+        if vocab is not None:
+            self._vocab_scores = vocab
+        else:
+            self._vocab_scores = [
+                (str(pad_token), 0.0),
+                (str(eos_token), 0.0),
+                (str(unk_token), 0.0),
+                ("▁", -2.0),  # Space token
+            ]
+            for i in range(extra_ids - 1, -1, -1):
+                self._vocab_scores.append((f"", 0.0))
+
+        self._tokenizer = Tokenizer(
+            Unigram(
+                self._vocab_scores,
+                unk_id=2,
+                byte_fallback=False,
             )
+        )
 
-        if legacy is None:
-            logger.warning_once(
-                f"You are using the default legacy behaviour of the {self.__class__}. This is"
-                " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you."
-                " If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it"
-                " means, and thoroughly read the reason why this was added as explained in"
-                " https://github.com/huggingface/transformers/pull/24565"
-            )
-            legacy = True
+        self._tokenizer.normalizer = None
+
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.WhitespaceSplit(),
+                pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True),
+            ]
+        )
+
+        self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)
 
-        self.legacy = legacy
-        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
-        self.add_prefix_space = add_prefix_space
+        tokenizer_object = self._tokenizer
 
         super().__init__(
+            tokenizer_object=tokenizer_object,
             eos_token=eos_token,
             unk_token=unk_token,
             pad_token=pad_token,
             extra_ids=extra_ids,
             additional_special_tokens=additional_special_tokens,
-            sp_model_kwargs=self.sp_model_kwargs,
-            legacy=legacy,
-            add_prefix_space=add_prefix_space,
             **kwargs,
         )
 
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor
-    def get_spm_processor(self, from_slow=False):
-        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        if self.legacy or from_slow:  # no dependency on protobuf
-            tokenizer.Load(self.vocab_file)
-            return tokenizer
-
-        with open(self.vocab_file, "rb") as f:
-            sp_model = f.read()
-            model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
-            model = model_pb2.ModelProto.FromString(sp_model)
-            normalizer_spec = model_pb2.NormalizerSpec()
-            normalizer_spec.add_dummy_prefix = False
-            model.normalizer_spec.MergeFrom(normalizer_spec)
-            sp_model = model.SerializeToString()
-            tokenizer.LoadFromSerializedProto(sp_model)
-        return tokenizer
-
-    @staticmethod
-    def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
-        if pretrained_model_name_or_path in T5Tokenizer.max_model_input_sizes:
-            deprecated_max_model_length = T5Tokenizer.max_model_input_sizes[pretrained_model_name_or_path]
-            if init_max_model_length is not None and init_max_model_length != max_model_length:
-                return init_max_model_length
-            elif init_max_model_length is None:
-                warnings.warn(
-                    "This tokenizer was incorrectly instantiated with a model max length of"
-                    f" {deprecated_max_model_length} which will be corrected in Transformers v5.\nFor now, this"
-                    " behavior is kept to avoid breaking backwards compatibility when padding/encoding with"
-                    " `truncation is True`.\n- Be aware that you SHOULD NOT rely on"
-                    f" {pretrained_model_name_or_path} automatically truncating your input to"
-                    f" {deprecated_max_model_length} when padding/encoding.\n- If you want to encode/pad to sequences"
-                    f" longer than {deprecated_max_model_length} you can either instantiate this tokenizer with"
-                    " `model_max_length` or pass `max_length` when encoding/padding.\n- To avoid this warning, please"
-                    " instantiate this tokenizer with `model_max_length` set to your preferred value.",
-                    FutureWarning,
-                )
-
-        return max_model_length
-
-    @property
-    def vocab_size(self):
-        return self.sp_model.get_piece_size()
-
-    def get_vocab(self):
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        # normal case: some special tokens
-        if token_ids_1 is None:
-            return ([0] * len(token_ids_0)) + [1]
-        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=["$A", ""],
+            pair=["$A", "", "$B", ""],
+            special_tokens=[
+                ("", self.eos_token_id),
+            ],
+        )
 
     def get_sentinel_tokens(self):
+        """Get the list of sentinel tokens (extra_id tokens) from additional_special_tokens."""
         return list(
             set(filter(lambda x: bool(re.search(r"", x)) is not None, self.additional_special_tokens))
         )
 
     def get_sentinel_token_ids(self):
+        """Get the token IDs for sentinel tokens."""
         return [self.convert_tokens_to_ids(token) for token in self.get_sentinel_tokens()]
 
-    def _add_eos_if_not_present(self, token_ids: list[int]) -> list[int]:
-        """Do not add eos again if user already added it."""
-        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
-            warnings.warn(
-                f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
-                " eos tokens being added."
-            )
-            return token_ids
-        else:
-            return token_ids + [self.eos_token_id]
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
-        use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-        """
-        eos = [self.eos_token_id]
-
-        if token_ids_1 is None:
-            return len(token_ids_0 + eos) * [0]
-        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A sequence has the following format:
-
-        - single sequence: `X `
-        - pair of sequences: `A  B `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
-        if token_ids_1 is None:
-            return token_ids_0
-        else:
-            token_ids_1 = self._add_eos_if_not_present(token_ids_1)
-            return token_ids_0 + token_ids_1
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__ = d
-
-        # for backward compatibility
-        if not hasattr(self, "sp_model_kwargs"):
-            self.sp_model_kwargs = {}
-
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(self.vocab_file)
-
-    def tokenize(self, text: "TextInput", **kwargs) -> list[str]:
-        """
-        Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
-        first token is special.
-        """
-        if self.legacy or len(text) == 0:
-            return super().tokenize(text, **kwargs)
-
-        text = text.replace(SPIECE_UNDERLINE, " ")
-        if self.add_prefix_space:
-            text = SPIECE_UNDERLINE + text
-
-        tokens = super().tokenize(text, **kwargs)
-
-        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
-            tokens = tokens[1:]
-        return tokens
-
-    @property
-    def unk_token_length(self):
-        return len(self.sp_model.encode(str(self.unk_token)))
-
-    def _tokenize(self, text, **kwargs):
-        """
-        Returns a tokenized string.
-
-        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
-        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
-        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
-        `unk_token`. Here is an example with `unk_token = ""` and `unk_token_length = 4`.
-        `self.tokenizer.sp_model.encode(" Hey", out_type = str)[4:]`.
-        """
-        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
-            return self.sp_model.encode(text, out_type=str)
-
-        # 1. Encode string + prefix ex: " Hey"
-        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
-        # 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
-        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.sp_model.piece_to_id(token)
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
-        return token
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        # since we manually add the prefix space, we have to remove it when decoding
-        if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space:
-            tokens[0] = tokens[0][1:]
-
-        current_sub_tokens = []
-        out_string = ""
-        prev_is_special = False
-        for token in tokens:
-            # make sure that special tokens are not decoded using sentencepiece model
-            if token in self.all_special_tokens:
-                if not prev_is_special:
-                    out_string += " "
-                out_string += self.sp_model.decode(current_sub_tokens) + token
-                prev_is_special = True
-                current_sub_tokens = []
-            else:
-                current_sub_tokens.append(token)
-                prev_is_special = False
-        out_string += self.sp_model.decode(current_sub_tokens)
-        return out_string.strip()
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (out_vocab_file,)
-
 
 __all__ = ["T5Tokenizer"]
diff --git a/src/transformers/models/t5/tokenization_t5_fast.py b/src/transformers/models/t5/tokenization_t5_fast.py
deleted file mode 100644
index bdba1a7928c8..000000000000
--- a/src/transformers/models/t5/tokenization_t5_fast.py
+++ /dev/null
@@ -1,233 +0,0 @@
-# coding=utf-8
-# Copyright 2018 T5 Authors and HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization class for model T5."""
-
-import os
-import re
-import warnings
-from shutil import copyfile
-from typing import Optional
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import is_sentencepiece_available, logging
-
-
-if is_sentencepiece_available():
-    from .tokenization_t5 import T5Tokenizer
-else:
-    T5Tokenizer = None
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
-
-
-# TODO(PVP) - this should be removed in Transformers v5
-
-
-class T5TokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" T5 tokenizer (backed by HuggingFace's *tokenizers* library). Based on
-    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
-            contains the vocabulary necessary to instantiate a tokenizer.
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        extra_ids (`int`, *optional*, defaults to 100):
-            Add a number of extra ids added to the vocabulary for use as sentinels. These tokens are accessible as
-            "" where "{%d}" is a number between 0 and extra_ids-1. These tokens can be retrieved by
-            calling get_sentinel_tokens method and token ids can be by calling get_sentinel_token_ids method
-        additional_special_tokens (`list[str]`, *optional*):
-            Additional special tokens used by the tokenizer.
-        add_prefix_space (`bool`, *optional*):
-            Whether or not the tokenizer should automatically add a prefix space
-        from_slow (`book`, *optional*, defaults to `False`):
-            Whether or not the tokenizer should be converted from a slow one. If `add_prefix_space` is set, this will be set to `True`.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = T5Tokenizer
-
-    prefix_tokens: list[int] = []
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        eos_token="",
-        unk_token="",
-        pad_token="",
-        extra_ids=100,
-        additional_special_tokens=None,
-        add_prefix_space=None,
-        **kwargs,
-    ):
-        # Add extra_ids to the special token list
-        if additional_special_tokens is not None:
-            extra_tokens = [x for x in additional_special_tokens if "" for i in range(extra_ids)]
-            elif extra_ids > 0 and extra_ids != len(extra_tokens):
-                raise ValueError(
-                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
-                    " provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
-                    " tokens"
-                )
-        else:
-            extra_tokens = [f"" for i in range(extra_ids)]
-            additional_special_tokens = extra_tokens
-
-        if add_prefix_space is not None:
-            logger.warning_once(
-                "You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers"
-            )
-            kwargs["from_slow"] = True
-
-        super().__init__(
-            vocab_file=vocab_file,
-            tokenizer_file=tokenizer_file,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            extra_ids=extra_ids,
-            additional_special_tokens=additional_special_tokens,
-            add_prefix_space=add_prefix_space,
-            **kwargs,
-        )
-
-        self.vocab_file = vocab_file
-        self._extra_ids = extra_ids
-
-    @staticmethod
-    def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
-        if pretrained_model_name_or_path in T5TokenizerFast.max_model_input_sizes:
-            deprecated_max_model_length = T5TokenizerFast.max_model_input_sizes[pretrained_model_name_or_path]
-            if init_max_model_length is not None and init_max_model_length != max_model_length:
-                return init_max_model_length
-            elif init_max_model_length is None:
-                warnings.warn(
-                    "This tokenizer was incorrectly instantiated with a model max length of"
-                    f" {deprecated_max_model_length} which will be corrected in Transformers v5.\nFor now, this"
-                    " behavior is kept to avoid breaking backwards compatibility when padding/encoding with"
-                    " `truncation is True`.\n- Be aware that you SHOULD NOT rely on"
-                    f" {pretrained_model_name_or_path} automatically truncating your input to"
-                    f" {deprecated_max_model_length} when padding/encoding.\n- If you want to encode/pad to sequences"
-                    f" longer than {deprecated_max_model_length} you can either instantiate this tokenizer with"
-                    " `model_max_length` or pass `max_length` when encoding/padding.\n- To avoid this warning, please"
-                    " instantiate this tokenizer with `model_max_length` set to your preferred value.",
-                    FutureWarning,
-                )
-
-        return max_model_length
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not self.can_save_slow_tokenizer:
-            raise ValueError(
-                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
-                "tokenizer."
-            )
-
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-            logger.info(f"Copy vocab file to {out_vocab_file}")
-
-        return (out_vocab_file,)
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A sequence has the following format:
-
-        - single sequence: `X `
-        - pair of sequences: `A  B `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        token_ids_0 = token_ids_0 + [self.eos_token_id]
-        if token_ids_1 is None:
-            return self.prefix_tokens + token_ids_0
-        else:
-            token_ids_1 = token_ids_1 + [self.eos_token_id]
-            return self.prefix_tokens + token_ids_0 + token_ids_1
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
-        use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-        """
-        eos = [self.eos_token_id]
-
-        if token_ids_1 is None:
-            return len(token_ids_0 + eos) * [0]
-        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
-
-    def get_sentinel_tokens(self):
-        return list(
-            set(filter(lambda x: bool(re.search(r"", x)) is not None, self.additional_special_tokens))
-        )
-
-    def get_sentinel_token_ids(self):
-        return [self.convert_tokens_to_ids(token) for token in self.get_sentinel_tokens()]
-
-
-__all__ = ["T5TokenizerFast"]
diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py
index ec8548636d5d..df446826badb 100644
--- a/src/transformers/models/tapas/tokenization_tapas.py
+++ b/src/transformers/models/tapas/tokenization_tapas.py
@@ -28,7 +28,7 @@
 
 import numpy as np
 
-from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from ...tokenization_python import PreTrainedTokenizer, Trie, _is_control, _is_punctuation, _is_whitespace
 from ...tokenization_utils_base import (
     ENCODE_KWARGS_DOCSTRING,
     VERY_LARGE_INTEGER,
@@ -330,6 +330,20 @@ def __init__(
             **kwargs,
         )
 
+        # Tests override the vocab while reusing a tokenizer_config.json coming from a pretrained model.
+        # This can register base vocab tokens (like [UNK]) as added tokens with mismatched ids (e.g. 100)
+        # and breaks assumptions on token ordering. Drop any added-token entry that overlaps with the vocab
+        # so these tokens rely on the vocab-provided ids.
+        removed_overlap = False
+        for token, added_id in list(self._added_tokens_encoder.items()):
+            if token in self.vocab:
+                self._added_tokens_encoder.pop(token, None)
+                self._added_tokens_decoder.pop(added_id, None)
+                removed_overlap = True
+        if removed_overlap:
+            self.tokens_trie = Trie()
+            self._update_trie()
+
     @property
     def do_lower_case(self):
         return self.basic_tokenizer.do_lower_case
@@ -343,7 +357,7 @@ def get_vocab(self):
 
     def _tokenize(self, text):
         if format_text(text) == EMPTY_TEXT:
-            return [self.additional_special_tokens[0]]
+            return [self.extra_special_tokens[0]]
         split_tokens = []
         if self.do_basic_tokenize:
             for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
@@ -504,7 +518,7 @@ def get_special_tokens_mask(
     @add_end_docstrings(TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def __call__(
         self,
-        table: "pd.DataFrame",
+        table: Union["pd.DataFrame", TextInput, list[TextInput], None],
         queries: Optional[
             Union[
                 TextInput,
@@ -537,9 +551,10 @@ def __call__(
         Main method to tokenize and prepare for the model one or several sequence(s) related to a table.
 
         Args:
-            table (`pd.DataFrame`):
+            table (`pd.DataFrame` or `str` or `list[str]`):
                 Table containing tabular data. Note that all cell values must be text. Use *.astype(str)* on a Pandas
-                dataframe to convert it to string.
+                dataframe to convert it to string. When passing a string or list of strings, those will be interpreted
+                as queries with an empty table (to support generic tokenizer tests).
             queries (`str` or `list[str]`):
                 Question or batch of questions related to a table to be encoded. Note that in case of a batch, all
                 questions must refer to the **same** table.
@@ -557,7 +572,12 @@ def __call__(
                 then the answer_coordinates must be a list of lists of strings (each list corresponding to a single
                 table-question pair).
         """
-        assert isinstance(table, pd.DataFrame), "Table must be of type pd.DataFrame"
+        if not isinstance(table, pd.DataFrame):
+            if queries is not None:
+                raise AssertionError("Table must be of type pd.DataFrame when queries are provided separately.")
+            inferred_queries = table
+            table = pd.DataFrame.from_dict({})
+            queries = inferred_queries
 
         # Input type checking for clearer error
         valid_query = False
@@ -869,7 +889,7 @@ def _batch_prepare_for_model(
     @add_end_docstrings(ENCODE_KWARGS_DOCSTRING)
     def encode(
         self,
-        table: "pd.DataFrame",
+        table: Union["pd.DataFrame", TextInput, list[TextInput]],
         query: Optional[
             Union[
                 TextInput,
@@ -890,12 +910,19 @@ def encode(
         your own, otherwise refer to `__call__`.
 
         Args:
-            table (`pd.DataFrame`):
-                Table containing tabular data. Note that all cell values must be text. Use *.astype(str)* on a Pandas
-                dataframe to convert it to string.
+            table (`pd.DataFrame` or `str` or `list[str]`):
+                Table containing tabular data. When passing a string or list of strings, those will be interpreted as
+                queries with an empty table (to support generic tokenizer tests).
             query (`str` or `list[str]`):
                 Question related to a table to be encoded.
         """
+        if not isinstance(table, pd.DataFrame):
+            if query is not None:
+                raise AssertionError("Table must be of type pd.DataFrame when queries are provided separately.")
+            inferred_query = table
+            table = pd.DataFrame.from_dict({})
+            query = inferred_query
+
         encoded_inputs = self.encode_plus(
             table,
             query=query,
@@ -912,7 +939,7 @@ def encode(
     @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def encode_plus(
         self,
-        table: "pd.DataFrame",
+        table: Union["pd.DataFrame", TextInput, list[TextInput]],
         query: Optional[
             Union[
                 TextInput,
@@ -941,9 +968,9 @@ def encode_plus(
         Prepare a table and a string for the model.
 
         Args:
-            table (`pd.DataFrame`):
-                Table containing tabular data. Note that all cell values must be text. Use *.astype(str)* on a Pandas
-                dataframe to convert it to string.
+            table (`pd.DataFrame` or `str` or `list[str]`):
+                Table containing tabular data. When passing a string or list of strings, those will be interpreted as
+                queries with an empty table (to support generic tokenizer tests).
             query (`str` or `list[str]`):
                 Question related to a table to be encoded.
             answer_coordinates (`list[Tuple]` or `list[list[Tuple]]`, *optional*):
@@ -974,6 +1001,13 @@ def encode_plus(
                 "transformers.PreTrainedTokenizerFast."
             )
 
+        if not isinstance(table, pd.DataFrame):
+            if query is not None:
+                raise AssertionError("Table must be of type pd.DataFrame when queries are provided separately.")
+            inferred_query = table
+            table = pd.DataFrame.from_dict({})
+            query = inferred_query
+
         return self._encode_plus(
             table=table,
             query=query,
@@ -1985,7 +2019,6 @@ def convert_logits_to_predictions(self, data, logits, logits_agg=None, cell_clas
     # End of everything related to converting logits to predictions
 
 
-# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer
 class BasicTokenizer:
     """
     Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
@@ -2147,7 +2180,6 @@ def _clean_text(self, text):
         return "".join(output)
 
 
-# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer
 class WordpieceTokenizer:
     """Runs WordPiece tokenization."""
 
diff --git a/src/transformers/models/udop/__init__.py b/src/transformers/models/udop/__init__.py
index cf4c36f6363f..852ad5e3c81a 100644
--- a/src/transformers/models/udop/__init__.py
+++ b/src/transformers/models/udop/__init__.py
@@ -22,7 +22,6 @@
     from .modeling_udop import *
     from .processing_udop import *
     from .tokenization_udop import *
-    from .tokenization_udop_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/udop/processing_udop.py b/src/transformers/models/udop/processing_udop.py
index 5e37a021e6ab..f15bd28c69c6 100644
--- a/src/transformers/models/udop/processing_udop.py
+++ b/src/transformers/models/udop/processing_udop.py
@@ -58,7 +58,7 @@ class UdopProcessor(ProcessorMixin):
     [`UdopProcessor`] offers all the functionalities you need to prepare data for the model.
 
     It first uses [`LayoutLMv3ImageProcessor`] to resize, rescale and normalize document images, and optionally applies OCR
-    to get words and normalized bounding boxes. These are then provided to [`UdopTokenizer`] or [`UdopTokenizerFast`],
+    to get words and normalized bounding boxes. These are then provided to [`UdopTokenizer`],
     which turns the words and bounding boxes into token-level `input_ids`, `attention_mask`, `token_type_ids`, `bbox`.
     Optionally, one can provide integer `word_labels`, which are turned into token-level `labels` for token
     classification tasks (such as FUNSD, CORD).
@@ -69,8 +69,8 @@ class UdopProcessor(ProcessorMixin):
     Args:
         image_processor (`LayoutLMv3ImageProcessor`):
             An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input.
-        tokenizer (`UdopTokenizer` or `UdopTokenizerFast`):
-            An instance of [`UdopTokenizer`] or [`UdopTokenizerFast`]. The tokenizer is a required input.
+        tokenizer (`UdopTokenizer`):
+            An instance of [`UdopTokenizer`]. The tokenizer is a required input.
     """
 
     def __init__(self, image_processor, tokenizer):
diff --git a/src/transformers/models/udop/tokenization_udop.py b/src/transformers/models/udop/tokenization_udop.py
index 26eb7fa82e7a..1c13a43cce26 100644
--- a/src/transformers/models/udop/tokenization_udop.py
+++ b/src/transformers/models/udop/tokenization_udop.py
@@ -14,17 +14,12 @@
 # limitations under the License
 """Tokenization classes for UDOP model."""
 
-import os
-import re
-import warnings
-from shutil import copyfile
-from typing import Any, Optional, Union
+from typing import Optional, Union
 
-import sentencepiece as spm
+from tokenizers import Tokenizer, decoders, pre_tokenizers, processors
+from tokenizers.models import Unigram
 
-from ...tokenization_utils import PreTrainedTokenizer
 from ...tokenization_utils_base import (
-    AddedToken,
     BatchEncoding,
     EncodedInput,
     PreTokenizedInput,
@@ -32,15 +27,14 @@
     TextInputPair,
     TruncationStrategy,
 )
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging
-from ...utils.import_utils import requires
 
 
-logger = logging.get_logger(__name__)
-
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
 
-SPIECE_UNDERLINE = "▁"
 
+logger = logging.get_logger(__name__)
 
 UDOP_ENCODE_KWARGS_DOCSTRING = r"""
             add_special_tokens (`bool`, *optional*, defaults to `True`):
@@ -144,22 +138,17 @@
             - **length** -- The length of the inputs (when `return_length=True`).
 """
 
-VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
-
 
-@requires(backends=("sentencepiece",))
-class UdopTokenizer(PreTrainedTokenizer):
+class UdopTokenizer(TokenizersBackend):
     """
-    Adapted from [`LayoutXLMTokenizer`] and [`T5Tokenizer`]. Based on
-    [SentencePiece](https://github.com/google/sentencepiece).
+    Construct a "fast" UDOP tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
+    [`LayoutXLMTokenizer`] and [`T5Tokenizer`]. Based on
+    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
 
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-
         eos_token (`str`, *optional*, defaults to `""`):
             The end of sequence token.
 
@@ -170,15 +159,13 @@ class UdopTokenizer(PreTrainedTokenizer):
 
             
 
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-
         sep_token (`str`, *optional*, defaults to `""`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-
+        unk_token (`str`, *optional*, defaults to `""`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
         pad_token (`str`, *optional*, defaults to `""`):
             The token used for padding, for example when batching sequences of different lengths.
         sep_token_box (`list[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`):
@@ -190,329 +177,105 @@ class UdopTokenizer(PreTrainedTokenizer):
             CrossEntropyLoss.
         only_label_first_subword (`bool`, *optional*, defaults to `True`):
             Whether or not to only label the first subword, in case word labels are provided.
-        additional_special_tokens (`list[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`):
-            Additional special tokens used by the tokenizer.
-
-        sp_model_kwargs (`dict`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
-            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
-            to set:
-
-            - `enable_sampling`: Enable subword regularization.
-            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
-
-              - `nbest_size = {0,1}`: No sampling is performed.
-              - `nbest_size > 1`: samples from the nbest_size results.
-              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
-                using forward-filtering-and-backward-sampling algorithm.
-
-            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
-              BPE-dropout.
-        legacy (`bool`, *optional*, defaults to `True`):
-            Whether or not the `legacy` behaviour of the tokenizer should be used. Legacy is before the merge of #24622
-            which includes fixes to properly handle tokens that appear after special tokens. A simple example:
-            - `legacy=True`:
-            ```python
-            >>> from transformers import T5Tokenizer
-
-            >>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=True)
-            >>> tokenizer.encode("Hello .")
-            [8774, 32099, 3, 5, 1]
-            ```
-            - `legacy=False`:
-            ```python
-            >>> from transformers import T5Tokenizer
-
-            >>> tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
-            >>> tokenizer.encode("Hello .")  # the extra space `[3]` is no longer here
-            [8774, 32099, 5, 1]
-            ```
-            Checkout the pull request and the issue [here](https://github.com/huggingface/transformers/pull/24565) for
-            more details.
-        add_prefix_space (`bool`, *optional*, defaults to `True`):
-            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word.
-
-
-    Attributes:
-        sp_model (`SentencePieceProcessor`):
-            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
+        extra_special_tokens (`list[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`):
+            Extra special tokens used by the tokenizer.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
 
     def __init__(
         self,
-        vocab_file,
         eos_token="",
-        unk_token="",
         sep_token="",
+        unk_token="",
         pad_token="",
         sep_token_box=[1000, 1000, 1000, 1000],
         pad_token_box=[0, 0, 0, 0],
         pad_token_label=-100,
         only_label_first_subword=True,
-        additional_special_tokens=None,
-        sp_model_kwargs: Optional[dict[str, Any]] = None,
-        legacy=True,
-        add_prefix_space=True,
+        extra_special_tokens=None,
+        vocab=None,
         **kwargs,
-    ) -> None:
-        eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
-        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
-        sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
-        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
+    ):
+        if "additional_special_tokens" in kwargs and "extra_special_tokens" not in kwargs:
+            kwargs["extra_special_tokens"] = kwargs.pop("additional_special_tokens")
+        if extra_special_tokens is not None:
+            kwargs["extra_special_tokens"] = extra_special_tokens
+
+        if vocab is None:
+            vocab_scores = [(str(pad_token), 0.0), (str(eos_token), 0.0), (str(unk_token), 0.0), ("▁", -2.0)]
+        elif isinstance(vocab, dict):
+            vocab_scores = [(str(token), float(score)) for token, score in vocab.items()]
+        elif isinstance(vocab, list) and len(vocab) > 0:
+            if isinstance(vocab[0], (tuple, list)):
+                vocab_scores = [(str(token), float(score)) for token, score in vocab]
+            else:
+                vocab_scores = [(str(token), 0.0) for token in vocab]
+
+        unk_id = 2
+        for idx, (token, _) in enumerate(vocab_scores):
+            if token == str(unk_token):
+                unk_id = idx
+                break
+
+        self._tokenizer = Tokenizer(
+            Unigram(
+                vocab_scores,
+                unk_id=unk_id,
+                byte_fallback=False,
+            )
+        )
 
-        self.legacy = legacy
-        self.add_prefix_space = add_prefix_space
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        self._tokenizer.normalizer = None
 
-        self.vocab_file = vocab_file
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.WhitespaceSplit(),
+                pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always", split=True),
+            ]
+        )
 
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(vocab_file)
-
-        # additional properties
-        self.sep_token_box = sep_token_box
-        self.pad_token_box = pad_token_box
-        self.pad_token_label = pad_token_label
-        self.only_label_first_subword = only_label_first_subword
+        self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme="always", split=True)
 
         super().__init__(
+            tokenizer_object=self._tokenizer,
             eos_token=eos_token,
-            unk_token=unk_token,
             sep_token=sep_token,
+            unk_token=unk_token,
             pad_token=pad_token,
             sep_token_box=sep_token_box,
             pad_token_box=pad_token_box,
             pad_token_label=pad_token_label,
             only_label_first_subword=only_label_first_subword,
-            additional_special_tokens=additional_special_tokens,
-            sp_model_kwargs=self.sp_model_kwargs,
-            legacy=legacy,
-            add_prefix_space=add_prefix_space,
             **kwargs,
         )
 
-    @property
-    def vocab_size(self):
-        return len(self.sp_model)
-
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_vocab
-    def get_vocab(self):
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_special_tokens_mask
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        # normal case: some special tokens
-        if token_ids_1 is None:
-            return ([0] * len(token_ids_0)) + [1]
-        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_sentinel_tokens
-    def get_sentinel_tokens(self):
-        return list(
-            set(filter(lambda x: bool(re.search(r"", x)) is not None, self.additional_special_tokens))
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=["$A", ""],
+            pair=["$A", "", "$B", ""],
+            special_tokens=[
+                ("", self.eos_token_id),
+            ],
         )
 
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_sentinel_token_ids
-    def get_sentinel_token_ids(self):
-        return [self.convert_tokens_to_ids(token) for token in self.get_sentinel_tokens()]
-
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._add_eos_if_not_present
-    def _add_eos_if_not_present(self, token_ids: list[int]) -> list[int]:
-        """Do not add eos again if user already added it."""
-        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
-            warnings.warn(
-                f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
-                " eos tokens being added."
-            )
-            return token_ids
-        else:
-            return token_ids + [self.eos_token_id]
-
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.create_token_type_ids_from_sequences
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
-        use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-        """
-        eos = [self.eos_token_id]
-
-        if token_ids_1 is None:
-            return len(token_ids_0 + eos) * [0]
-        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
-
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.build_inputs_with_special_tokens
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. A sequence has the following format:
-
-        - single sequence: `X `
-        - pair of sequences: `A  B `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
-        if token_ids_1 is None:
-            return token_ids_0
-        else:
-            token_ids_1 = self._add_eos_if_not_present(token_ids_1)
-            return token_ids_0 + token_ids_1
-
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.__getstate__
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__.update(d)
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(self.vocab_file)
-
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
-    def tokenize(self, text: "TextInput", **kwargs) -> list[str]:
-        """
-        Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
-        first token is special.
-        """
-        if self.legacy or len(text) == 0:
-            return super().tokenize(text, **kwargs)
-
-        text = text.replace(SPIECE_UNDERLINE, " ")
-        if self.add_prefix_space:
-            text = SPIECE_UNDERLINE + text
-
-        tokens = super().tokenize(text, **kwargs)
-
-        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
-            tokens = tokens[1:]
-        return tokens
-
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
-    def _tokenize(self, text, **kwargs):
-        """
-        Returns a tokenized string.
-
-        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
-        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
-        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
-        `unk_token`. Here is an example with `unk_token = ""` and `unk_token_length = 4`.
-        `self.tokenizer.sp_model.encode(" Hey", out_type = str)[4:]`.
-        """
-        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
-            return self.sp_model.encode(text, out_type=str)
-
-        # 1. Encode string + prefix ex: " Hey"
-        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
-        # 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
-        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.sp_model.piece_to_id(token)
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.sp_model.IdToPiece(index)
-
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.convert_tokens_to_string
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        # since we manually add the prefix space, we have to remove it when decoding
-        if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space:
-            tokens[0] = tokens[0][1:]
-
-        current_sub_tokens = []
-        out_string = ""
-        prev_is_special = False
-        for token in tokens:
-            # make sure that special tokens are not decoded using sentencepiece model
-            if token in self.all_special_tokens:
-                if not prev_is_special:
-                    out_string += " "
-                out_string += self.sp_model.decode(current_sub_tokens) + token
-                prev_is_special = True
-                current_sub_tokens = []
-            else:
-                current_sub_tokens.append(token)
-                prev_is_special = False
-        out_string += self.sp_model.decode(current_sub_tokens)
-        return out_string.strip()
-
-    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
+        self.sep_token_box = sep_token_box
+        self.pad_token_box = pad_token_box
+        self.pad_token_label = pad_token_label
+        self.only_label_first_subword = only_label_first_subword
+        self.init_kwargs["vocab"] = vocab
 
-        return (out_vocab_file,)
+        self._tokenizer.encode_special_tokens = self.split_special_tokens
 
     @add_end_docstrings(UDOP_ENCODE_KWARGS_DOCSTRING)
     def __call__(
         self,
-        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
         text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
         boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
         word_labels: Optional[Union[list[int], list[list[int]]]] = None,
-        text_target: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        text_target: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
         text_pair_target: Optional[
             Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]
         ] = None,
@@ -523,14 +286,20 @@ def __call__(
         if text is not None:
             # The context manager will send the inputs as normal texts and not text_target, but we shouldn't change the
             # input mode in this case.
-            if not self._in_target_context_manager:
+            if not self._in_target_context_manager and hasattr(self, "_switch_to_input_mode"):
                 self._switch_to_input_mode()
             encodings = self.call_boxes(text=text, text_pair=text_pair, boxes=boxes, word_labels=word_labels, **kwargs)
         if text_target is not None:
-            self._switch_to_target_mode()
-            target_encodings = self._call_one(text=text_target, text_pair=text_pair_target, **kwargs)
+            if hasattr(self, "_switch_to_target_mode"):
+                self._switch_to_target_mode()
+            target_encodings = self._encode_plus(
+                text=text_target,
+                text_pair=text_pair_target,
+                **kwargs,
+            )
         # Leave back tokenizer in input mode
-        self._switch_to_input_mode()
+        if hasattr(self, "_switch_to_input_mode"):
+            self._switch_to_input_mode()
 
         if text_target is None:
             return encodings
@@ -540,6 +309,7 @@ def __call__(
             encodings["labels"] = target_encodings["input_ids"]
             return encodings
 
+    @add_end_docstrings(UDOP_ENCODE_KWARGS_DOCSTRING)
     def call_boxes(
         self,
         text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
@@ -691,6 +461,17 @@ def _is_valid_text_input(t):
                 **kwargs,
             )
 
+    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
+        batched_input = [(text, pair)] if pair else [text]
+
+        self._tokenizer.encode_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens)
+
+        encodings = self._tokenizer.encode_batch(
+            batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
+        )
+
+        return encodings[0].tokens
+
     def batch_encode_plus_boxes(
         self,
         batch_text_or_text_pairs: Union[
@@ -703,7 +484,7 @@ def batch_encode_plus_boxes(
         word_labels: Optional[list[list[int]]] = None,
         add_special_tokens: bool = True,
         padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Optional[Union[bool, str, TruncationStrategy]] = None,
+        truncation: Union[bool, str, TruncationStrategy] = None,
         max_length: Optional[int] = None,
         stride: int = 0,
         is_split_into_words: bool = False,
@@ -722,6 +503,12 @@ def batch_encode_plus_boxes(
         """
         Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.
 
+        
+
+        This method is deprecated, `__call__` should be used instead.
+
+        
+
         Args:
             batch_text_or_text_pairs (`list[str]`, `list[tuple[str, str]]`, `list[list[str]]`, `list[tuple[list[str], list[str]]]`, and for not-fast tokenizers, also `list[list[int]]`, `list[tuple[list[int], list[int]]]`):
                 Batch of sequences or pair of sequences to be encoded. This can be a list of
@@ -763,127 +550,6 @@ def batch_encode_plus_boxes(
             **kwargs,
         )
 
-    def encode_boxes(
-        self,
-        text: Union[TextInput, PreTokenizedInput, EncodedInput],
-        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
-        boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[list[int]]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Optional[Union[bool, str, TruncationStrategy]] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        **kwargs,
-    ) -> list[int]:
-        """
-        Args:
-        Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary. Same as doing
-        `self.convert_tokens_to_ids(self.tokenize(text))`.
-            text (`str`, `list[str]` or `list[int]`):
-                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
-                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
-                method).
-            text_pair (`str`, `list[str]` or `list[int]`, *optional*):
-                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
-                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
-                method).
-        """
-        encoded_inputs = self.encode_plus_boxes(
-            text,
-            text_pair=text_pair,
-            boxes=boxes,
-            word_labels=word_labels,
-            add_special_tokens=add_special_tokens,
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            stride=stride,
-            return_tensors=return_tensors,
-            **kwargs,
-        )
-
-        return encoded_inputs["input_ids"]
-
-    def encode_plus_boxes(
-        self,
-        text: Union[TextInput, PreTokenizedInput],
-        text_pair: Optional[PreTokenizedInput] = None,
-        boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[list[int]]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Optional[Union[bool, str, TruncationStrategy]] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        is_split_into_words: bool = False,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        """
-        Tokenize and prepare for the model a sequence or a pair of sequences.
-
-        
-
-        This method is deprecated, `__call__` should be used instead.
-
-        
-
-        Args:
-            text (`str`, `list[str]` or (for non-fast tokenizers) `list[int]`):
-                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
-                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
-                method).
-            text_pair (`str`, `list[str]` or `list[int]`, *optional*):
-                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
-                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
-                method).
-        """
-
-        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
-        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            pad_to_multiple_of=pad_to_multiple_of,
-            verbose=verbose,
-            **kwargs,
-        )
-
-        return self._encode_plus_boxes(
-            text=text,
-            text_pair=text_pair,
-            boxes=boxes,
-            word_labels=word_labels,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            is_split_into_words=is_split_into_words,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            **kwargs,
-        )
-
     def _batch_encode_plus_boxes(
         self,
         batch_text_or_text_pairs: Union[
@@ -901,7 +567,7 @@ def _batch_encode_plus_boxes(
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
         padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_tensors: Optional[str] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
         return_overflowing_tokens: bool = False,
@@ -911,109 +577,140 @@ def _batch_encode_plus_boxes(
         verbose: bool = True,
         **kwargs,
     ) -> BatchEncoding:
-        if return_offsets_mapping:
-            raise NotImplementedError(
-                "return_offset_mapping is not available when using Python tokenizers. "
-                "To use this feature, change your tokenizer to one deriving from "
-                "transformers.PreTrainedTokenizerFast."
-            )
+        if not isinstance(batch_text_or_text_pairs, list):
+            raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")
 
-        batch_outputs = self._batch_prepare_for_model_boxes(
-            batch_text_or_text_pairs=batch_text_or_text_pairs,
-            is_pair=is_pair,
-            boxes=boxes,
-            word_labels=word_labels,
-            add_special_tokens=add_special_tokens,
+        # Set the truncation and padding strategy and restore the initial configuration
+        self.set_truncation_and_padding(
             padding_strategy=padding_strategy,
             truncation_strategy=truncation_strategy,
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
             padding_side=padding_side,
-            return_attention_mask=return_attention_mask,
-            return_token_type_ids=return_token_type_ids,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_length=return_length,
-            return_tensors=return_tensors,
-            verbose=verbose,
         )
 
-        return BatchEncoding(batch_outputs)
+        if is_pair:
+            batch_text_or_text_pairs = [(text.split(), text_pair) for text, text_pair in batch_text_or_text_pairs]
 
-    @add_end_docstrings(UDOP_ENCODE_KWARGS_DOCSTRING)
-    def _batch_prepare_for_model_boxes(
-        self,
-        batch_text_or_text_pairs,
-        is_pair: Optional[bool] = None,
-        boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[list[int]]] = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[str] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-    ) -> BatchEncoding:
-        """
-        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
-        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
-        manages a moving window (with user defined stride) for overflowing tokens
-
-        Args:
-            batch_ids_pairs: list of tokenized input ids or input ids pairs
-        """
+        encodings = self._tokenizer.encode_batch(
+            batch_text_or_text_pairs,
+            add_special_tokens=add_special_tokens,
+            is_pretokenized=True,  # we set this to True as LayoutLMv2 always expects pretokenized inputs
+        )
 
-        batch_outputs = {}
-        for idx, example in enumerate(zip(batch_text_or_text_pairs, boxes)):
-            batch_text_or_text_pair, boxes_example = example
-            outputs = self.prepare_for_model_boxes(
-                batch_text_or_text_pair[0] if is_pair else batch_text_or_text_pair,
-                batch_text_or_text_pair[1] if is_pair else None,
-                boxes_example,
-                word_labels=word_labels[idx] if word_labels is not None else None,
-                add_special_tokens=add_special_tokens,
-                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
-                truncation=truncation_strategy.value,
-                max_length=max_length,
-                stride=stride,
-                pad_to_multiple_of=None,  # we pad in batch afterward
-                padding_side=None,  # we pad in batch afterward
-                return_attention_mask=False,  # we pad in batch afterward
+        # Convert encoding to dict
+        # `Tokens` has type: tuple[
+        #                       list[dict[str, list[list[int]]]] or list[dict[str, 2D-Tensor]],
+        #                       list[EncodingFast]
+        #                    ]
+        # with nested dimensions corresponding to batch, overflows, sequence length
+        tokens_and_encodings = [
+            self._convert_encoding(
+                encoding=encoding,
                 return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
                 return_overflowing_tokens=return_overflowing_tokens,
                 return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=True
+                if word_labels is not None
+                else return_offsets_mapping,  # we use offsets to create the labels
                 return_length=return_length,
-                return_tensors=None,  # We convert the whole batch to tensors at the end
-                prepend_batch_axis=False,
                 verbose=verbose,
             )
+            for encoding in encodings
+        ]
+
+        # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
+        # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
+        # (we say ~ because the number of overflow varies with the example in the batch)
+        #
+        # To match each overflowing sample with the original sample in the batch
+        # we add an overflow_to_sample_mapping array (see below)
+        sanitized_tokens = {}
+        for key in tokens_and_encodings[0][0]:
+            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
+            sanitized_tokens[key] = stack
+        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
+
+        # If returning overflowing tokens, we need to return a mapping
+        # from the batch idx to the original sample
+        if return_overflowing_tokens:
+            overflow_to_sample_mapping = []
+            for i, (toks, _) in enumerate(tokens_and_encodings):
+                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
+            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping
 
-            for key, value in outputs.items():
-                if key not in batch_outputs:
-                    batch_outputs[key] = []
-                batch_outputs[key].append(value)
+        for input_ids in sanitized_tokens["input_ids"]:
+            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
 
-        batch_outputs = self.pad(
-            batch_outputs,
-            padding=padding_strategy.value,
-            max_length=max_length,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_attention_mask=return_attention_mask,
-        )
+        # create the token boxes
+        token_boxes = []
+        for batch_index in range(len(sanitized_tokens["input_ids"])):
+            if return_overflowing_tokens:
+                original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
+            else:
+                original_index = batch_index
+            token_boxes_example = []
+            for id, sequence_id, word_id in zip(
+                sanitized_tokens["input_ids"][batch_index],
+                sanitized_encodings[batch_index].sequence_ids,
+                sanitized_encodings[batch_index].word_ids,
+            ):
+                if word_id is not None:
+                    if is_pair and sequence_id == 0:
+                        token_boxes_example.append(self.pad_token_box)
+                    else:
+                        token_boxes_example.append(boxes[original_index][word_id])
+                else:
+                    if id == self.sep_token_id:
+                        token_boxes_example.append(self.sep_token_box)
+                    elif id == self.pad_token_id:
+                        token_boxes_example.append(self.pad_token_box)
+                    else:
+                        raise ValueError("Id not recognized")
+            token_boxes.append(token_boxes_example)
 
-        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
+        sanitized_tokens["bbox"] = token_boxes
 
-        return batch_outputs
+        # optionally, create the labels
+        if word_labels is not None:
+            labels = []
+            for batch_index in range(len(sanitized_tokens["input_ids"])):
+                if return_overflowing_tokens:
+                    original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
+                else:
+                    original_index = batch_index
+                labels_example = []
+                previous_token_empty = False
+                for id, offset, word_id in zip(
+                    sanitized_tokens["input_ids"][batch_index],
+                    sanitized_tokens["offset_mapping"][batch_index],
+                    sanitized_encodings[batch_index].word_ids,
+                ):
+                    if word_id is not None:
+                        if self.only_label_first_subword:
+                            if offset[0] == 0 and not previous_token_empty:
+                                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
+                                labels_example.append(word_labels[original_index][word_id])
+                            else:
+                                labels_example.append(self.pad_token_label)
+                        else:
+                            labels_example.append(word_labels[original_index][word_id])
+                        if self.decode(id) == "":
+                            previous_token_empty = True
+                        else:
+                            previous_token_empty = False
+                    else:
+                        labels_example.append(self.pad_token_label)
+                labels.append(labels_example)
+
+            sanitized_tokens["labels"] = labels
+            # finally, remove offsets if the user didn't want them
+            if not return_offsets_mapping:
+                del sanitized_tokens["offset_mapping"]
+
+        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
 
     def _encode_plus_boxes(
         self,
@@ -1028,7 +725,7 @@ def _encode_plus_boxes(
         stride: int = 0,
         pad_to_multiple_of: Optional[int] = None,
         padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_tensors: Optional[bool] = None,
         return_token_type_ids: Optional[bool] = None,
         return_attention_mask: Optional[bool] = None,
         return_overflowing_tokens: bool = False,
@@ -1038,49 +735,106 @@ def _encode_plus_boxes(
         verbose: bool = True,
         **kwargs,
     ) -> BatchEncoding:
-        if return_offsets_mapping:
-            raise NotImplementedError(
-                "return_offset_mapping is not available when using Python tokenizers. "
-                "To use this feature, change your tokenizer to one deriving from "
-                "transformers.PreTrainedTokenizerFast. "
-                "More information on available tokenizers at "
-                "https://github.com/huggingface/transformers/pull/2674"
-            )
-
-        return self.prepare_for_model_boxes(
-            text=text,
-            text_pair=text_pair,
-            boxes=boxes,
-            word_labels=word_labels,
+        # make it a batched input
+        # 2 options:
+        # 1) only text, in case text must be a list of str
+        # 2) text + text_pair, in which case text = str and text_pair a list of str
+        batched_input = [(text, text_pair)] if text_pair else [text]
+        batched_boxes = [boxes]
+        batched_word_labels = [word_labels] if word_labels is not None else None
+        batched_output = self._batch_encode_plus_boxes(
+            batched_input,
+            is_pair=bool(text_pair is not None),
+            boxes=batched_boxes,
+            word_labels=batched_word_labels,
             add_special_tokens=add_special_tokens,
-            padding=padding_strategy.value,
-            truncation=truncation_strategy.value,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
             max_length=max_length,
             stride=stride,
             pad_to_multiple_of=pad_to_multiple_of,
             padding_side=padding_side,
             return_tensors=return_tensors,
-            prepend_batch_axis=True,
-            return_attention_mask=return_attention_mask,
             return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
             return_overflowing_tokens=return_overflowing_tokens,
             return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
             return_length=return_length,
             verbose=verbose,
+            **kwargs,
         )
 
-    @add_end_docstrings(UDOP_ENCODE_KWARGS_DOCSTRING)
-    def prepare_for_model_boxes(
+        # Return tensor is None, then we can remove the leading batch axis
+        # Overflowing tokens are returned as a batch of output so we keep them in this case
+        if return_tensors is None and not return_overflowing_tokens:
+            batched_output = BatchEncoding(
+                {
+                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
+                    for key, value in batched_output.items()
+                },
+                batched_output.encodings,
+            )
+
+        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
+
+        return batched_output
+
+    def encode_boxes(
+        self,
+        text: Union[TextInput, PreTokenizedInput, EncodedInput],
+        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[list[int]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ) -> list[int]:
+        """
+        Args:
+        Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary. Same as doing
+        `self.convert_tokens_to_ids(self.tokenize(text))`.
+            text (`str`, `list[str]` or `list[int]`):
+                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
+                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
+                method).
+            text_pair (`str`, `list[str]` or `list[int]`, *optional*):
+                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
+                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
+                method).
+        """
+        encoded_inputs = self.encode_plus_boxes(
+            text,
+            text_pair=text_pair,
+            boxes=boxes,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            stride=stride,
+            return_tensors=return_tensors,
+            **kwargs,
+        )
+
+        return encoded_inputs["input_ids"]
+
+    def encode_plus_boxes(
         self,
         text: Union[TextInput, PreTokenizedInput],
         text_pair: Optional[PreTokenizedInput] = None,
         boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[int]] = None,
+        word_labels: Optional[list[list[int]]] = None,
         add_special_tokens: bool = True,
         padding: Union[bool, str, PaddingStrategy] = False,
         truncation: Union[bool, str, TruncationStrategy] = None,
         max_length: Optional[int] = None,
         stride: int = 0,
+        is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
         padding_side: Optional[str] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
@@ -1091,24 +845,26 @@ def prepare_for_model_boxes(
         return_offsets_mapping: bool = False,
         return_length: bool = False,
         verbose: bool = True,
-        prepend_batch_axis: bool = False,
         **kwargs,
     ) -> BatchEncoding:
         """
-        Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens,
-        truncates sequences if overflowing while taking into account the special tokens and manages a moving window
-        (with user defined stride) for overflowing tokens.
+        Tokenize and prepare for the model a sequence or a pair of sequences.
 
-        Word-level `boxes` are turned into token-level `bbox`. If provided, word-level `word_labels` are turned into
-        token-level `labels`. The word label is used for the first token of the word, while remaining tokens are
-        labeled with -100, such that they will be ignored by the loss function.
+        
+
+        This method is deprecated, `__call__` should be used instead.
+
+        
 
         Args:
-            text (`str`, `list[str]`, `list[list[str]]`):
-                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
-            text_pair (`list[str]` or `list[int]`, *optional*):
-                Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
-                list of list of strings (words of a batch of examples).
+            text (`str`, `list[str]` or (for non-fast tokenizers) `list[int]`):
+                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
+                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
+                method).
+            text_pair (`str`, `list[str]` or `list[int]`, *optional*):
+                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
+                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
+                method).
         """
 
         # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
@@ -1121,281 +877,30 @@ def prepare_for_model_boxes(
             **kwargs,
         )
 
-        tokens = []
-        pair_tokens = []
-        token_boxes = []
-        pair_token_boxes = []
-        labels = []
-
-        if text_pair is None:
-            if word_labels is None:
-                # CASE 1: document image classification (training + inference) + CASE 2: token classification (inference)
-                for word, box in zip(text, boxes):
-                    if len(word) < 1:  # skip empty words
-                        continue
-                    word_tokens = self.tokenize(word)
-                    tokens.extend(word_tokens)
-                    token_boxes.extend([box] * len(word_tokens))
-            else:
-                # CASE 2: token classification (training)
-                for word, box, label in zip(text, boxes, word_labels):
-                    if len(word) < 1:  # skip empty words
-                        continue
-                    word_tokens = self.tokenize(word)
-                    tokens.extend(word_tokens)
-                    token_boxes.extend([box] * len(word_tokens))
-                    if self.only_label_first_subword:
-                        # Use the real label id for the first token of the word, and padding ids for the remaining tokens
-                        labels.extend([label] + [self.pad_token_label] * (len(word_tokens) - 1))
-                    else:
-                        labels.extend([label] * len(word_tokens))
-        else:
-            # CASE 3: document visual question answering (inference)
-            # text = question
-            # text_pair = words
-            tokens = self.tokenize(text)
-            token_boxes = [self.pad_token_box for _ in range(len(tokens))]
-
-            for word, box in zip(text_pair, boxes):
-                if len(word) < 1:  # skip empty words
-                    continue
-                word_tokens = self.tokenize(word)
-                pair_tokens.extend(word_tokens)
-                pair_token_boxes.extend([box] * len(word_tokens))
-
-        # Create ids + pair_ids
-        ids = self.convert_tokens_to_ids(tokens)
-        pair_ids = self.convert_tokens_to_ids(pair_tokens) if pair_tokens else None
-
-        # Compute the total size of the returned encodings
-        pair = bool(pair_ids is not None)
-        len_ids = len(ids)
-        len_pair_ids = len(pair_ids) if pair else 0
-        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)
-
-        # Truncation: Handle max sequence length
-        overflowing_tokens = []
-        overflowing_token_boxes = []
-        overflowing_labels = []
-        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
-            (
-                ids,
-                token_boxes,
-                pair_ids,
-                pair_token_boxes,
-                labels,
-                overflowing_tokens,
-                overflowing_token_boxes,
-                overflowing_labels,
-            ) = self.truncate_sequences(
-                ids,
-                token_boxes,
-                pair_ids=pair_ids,
-                pair_token_boxes=pair_token_boxes,
-                labels=labels,
-                num_tokens_to_remove=total_len - max_length,
-                truncation_strategy=truncation_strategy,
-                stride=stride,
-            )
-
-        if return_token_type_ids and not add_special_tokens:
-            raise ValueError(
-                "Asking to return token_type_ids while setting add_special_tokens to False "
-                "results in an undefined behavior. Please set add_special_tokens to True or "
-                "set return_token_type_ids to None."
-            )
-
-        # Load from model defaults
-        if return_token_type_ids is None:
-            return_token_type_ids = "token_type_ids" in self.model_input_names
-        if return_attention_mask is None:
-            return_attention_mask = "attention_mask" in self.model_input_names
-
-        encoded_inputs = {}
-
-        if return_overflowing_tokens:
-            encoded_inputs["overflowing_tokens"] = overflowing_tokens
-            encoded_inputs["overflowing_token_boxes"] = overflowing_token_boxes
-            encoded_inputs["overflowing_labels"] = overflowing_labels
-            encoded_inputs["num_truncated_tokens"] = total_len - max_length
-
-        # Add special tokens
-        if add_special_tokens:
-            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
-            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
-            token_boxes = token_boxes + [self.sep_token_box]
-            if pair_token_boxes:
-                pair_token_boxes = pair_token_boxes + [self.sep_token_box]
-            if labels:
-                labels = labels + [self.pad_token_label]
-        else:
-            sequence = ids + pair_ids if pair else ids
-            token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
-
-        # Build output dictionary
-        encoded_inputs["input_ids"] = sequence
-        encoded_inputs["bbox"] = token_boxes + pair_token_boxes
-        if return_token_type_ids:
-            encoded_inputs["token_type_ids"] = token_type_ids
-        if return_special_tokens_mask:
-            if add_special_tokens:
-                encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
-            else:
-                encoded_inputs["special_tokens_mask"] = [0] * len(sequence)
-
-        if labels:
-            encoded_inputs["labels"] = labels
-
-        # Check lengths
-        self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)
-
-        # Padding
-        if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
-            encoded_inputs = self.pad(
-                encoded_inputs,
-                max_length=max_length,
-                padding=padding_strategy.value,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_attention_mask=return_attention_mask,
-            )
-
-        if return_length:
-            encoded_inputs["length"] = len(encoded_inputs["input_ids"])
-
-        batch_outputs = BatchEncoding(
-            encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
-        )
-
-        return batch_outputs
-
-    # Copied from transformers.models.layoutxlm.tokenization_layoutxlm.LayoutXLMTokenizer.truncate_sequences
-    def truncate_sequences(
-        self,
-        ids: list[int],
-        token_boxes: list[list[int]],
-        pair_ids: Optional[list[int]] = None,
-        pair_token_boxes: Optional[list[list[int]]] = None,
-        labels: Optional[list[int]] = None,
-        num_tokens_to_remove: int = 0,
-        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
-        stride: int = 0,
-    ) -> tuple[list[int], list[int], list[int]]:
-        """
-        Truncates a sequence pair in-place following the strategy.
-
-        Args:
-            ids (`list[int]`):
-                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and
-                `convert_tokens_to_ids` methods.
-            token_boxes (`list[list[int]]`):
-                Bounding boxes of the first sequence.
-            pair_ids (`list[int]`, *optional*):
-                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
-                and `convert_tokens_to_ids` methods.
-            pair_token_boxes (`list[list[int]]`, *optional*):
-                Bounding boxes of the second sequence.
-            labels (`list[int]`, *optional*):
-                Labels of the first sequence (for token classification tasks).
-            num_tokens_to_remove (`int`, *optional*, defaults to 0):
-                Number of tokens to remove using the truncation strategy.
-            truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
-                The strategy to follow for truncation. Can be:
-
-                - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will truncate
-                  token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a
-                  batch of pairs) is provided.
-                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater
-                  than the model maximum admissible input size).
-            stride (`int`, *optional*, defaults to 0):
-                If set to a positive number, the overflowing tokens returned will contain some tokens from the main
-                sequence returned. The value of this argument defines the number of additional tokens.
-
-        Returns:
-            `tuple[list[int], list[int], list[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of
-            overflowing tokens.
-        """
-        if num_tokens_to_remove <= 0:
-            return ids, token_boxes, pair_ids, pair_token_boxes, labels, [], [], []
-
-        if not isinstance(truncation_strategy, TruncationStrategy):
-            truncation_strategy = TruncationStrategy(truncation_strategy)
-
-        overflowing_tokens = []
-        overflowing_token_boxes = []
-        overflowing_labels = []
-        if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
-            for _ in range(num_tokens_to_remove):
-                if pair_ids is None or len(ids) > len(pair_ids):
-                    if not overflowing_tokens:
-                        window_len = min(len(ids), stride + 1)
-                    else:
-                        window_len = 1
-                    overflowing_tokens.extend(ids[-window_len:])
-                    overflowing_token_boxes.extend(token_boxes[-window_len:])
-                    overflowing_labels.extend(labels[-window_len:])
-                    ids = ids[:-1]
-                    token_boxes = token_boxes[:-1]
-                    labels = labels[:-1]
-                else:
-                    if not overflowing_tokens:
-                        window_len = min(len(pair_ids), stride + 1)
-                    else:
-                        window_len = 1
-                    overflowing_tokens.extend(pair_ids[-window_len:])
-                    overflowing_token_boxes.extend(pair_token_boxes[-window_len:])
-                    pair_ids = pair_ids[:-1]
-                    pair_token_boxes = pair_token_boxes[:-1]
-        elif truncation_strategy == TruncationStrategy.ONLY_FIRST:
-            if len(ids) > num_tokens_to_remove:
-                window_len = min(len(ids), stride + num_tokens_to_remove)
-                overflowing_tokens = ids[-window_len:]
-                overflowing_token_boxes = token_boxes[-window_len:]
-                overflowing_labels = labels[-window_len:]
-                ids = ids[:-num_tokens_to_remove]
-                token_boxes = token_boxes[:-num_tokens_to_remove]
-                labels = labels[:-num_tokens_to_remove]
-            else:
-                logger.error(
-                    f"We need to remove {num_tokens_to_remove} to truncate the input "
-                    f"but the first sequence has a length {len(ids)}. "
-                    f"Please select another truncation strategy than {truncation_strategy}, "
-                    "for instance 'longest_first' or 'only_second'."
-                )
-        elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
-            if len(pair_ids) > num_tokens_to_remove:
-                window_len = min(len(pair_ids), stride + num_tokens_to_remove)
-                overflowing_tokens = pair_ids[-window_len:]
-                overflowing_token_boxes = pair_token_boxes[-window_len:]
-                pair_ids = pair_ids[:-num_tokens_to_remove]
-                pair_token_boxes = pair_token_boxes[:-num_tokens_to_remove]
-            else:
-                logger.error(
-                    f"We need to remove {num_tokens_to_remove} to truncate the input "
-                    f"but the second sequence has a length {len(pair_ids)}. "
-                    f"Please select another truncation strategy than {truncation_strategy}, "
-                    "for instance 'longest_first' or 'only_first'."
-                )
-
-        return (
-            ids,
-            token_boxes,
-            pair_ids,
-            pair_token_boxes,
-            labels,
-            overflowing_tokens,
-            overflowing_token_boxes,
-            overflowing_labels,
+        return self._encode_plus_boxes(
+            text=text,
+            text_pair=text_pair,
+            boxes=boxes,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            is_split_into_words=is_split_into_words,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
         )
 
-    # Copied from transformers.models.layoutxlm.tokenization_layoutxlm.LayoutXLMTokenizer._pad
     def _pad(
         self,
         encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
@@ -1485,5 +990,63 @@ def _pad(
 
         return encoded_inputs
 
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. An XLM-RoBERTa sequence has the following format:
+
+        - single sequence: ` X `
+        - pair of sequences: ` A  B `
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+
+        if token_ids_1 is None:
+            return token_ids_0 + [self.sep_token_id]
+        sep = [self.sep_token_id]
+        return token_ids_0 + sep + token_ids_1 + sep
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
+        not make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of zeros.
+
+        """
+
+        sep = [self.sep_token_id]
+
+        if token_ids_1 is None:
+            return len(token_ids_0 + sep) * [0]
+        return len(token_ids_0 + sep + token_ids_1 + sep) * [0]
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        """
+        Save the tokenizer vocabulary files. For TokenizersBackend, the tokenizer.json file is saved
+        by the base class. This method returns an empty tuple since we only use tokenizer.json.
+        """
+        # The base class handles saving tokenizer.json in _save_pretrained
+        # We don't need to save vocab_file since we only use tokenizer.json
+        return ()
+
 
 __all__ = ["UdopTokenizer"]
diff --git a/src/transformers/models/udop/tokenization_udop_fast.py b/src/transformers/models/udop/tokenization_udop_fast.py
deleted file mode 100644
index 9751f5d65ddf..000000000000
--- a/src/transformers/models/udop/tokenization_udop_fast.py
+++ /dev/null
@@ -1,1026 +0,0 @@
-# coding=utf-8
-# Copyright 2024 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License
-"""Tokenization classes for UDOP model."""
-
-import os
-from shutil import copyfile
-from typing import Optional, Union
-
-from ...tokenization_utils_base import (
-    BatchEncoding,
-    EncodedInput,
-    PreTokenizedInput,
-    TextInput,
-    TextInputPair,
-    TruncationStrategy,
-)
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import PaddingStrategy, TensorType, add_end_docstrings, is_sentencepiece_available, logging
-
-
-if is_sentencepiece_available():
-    from .tokenization_udop import UdopTokenizer
-else:
-    UdopTokenizer = None
-
-
-VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
-
-
-logger = logging.get_logger(__name__)
-
-UDOP_ENCODE_KWARGS_DOCSTRING = r"""
-            add_special_tokens (`bool`, *optional*, defaults to `True`):
-                Whether or not to encode the sequences with the special tokens relative to their model.
-            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
-                Activates and controls padding. Accepts the following values:
-
-                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
-                  sequence if provided).
-                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-                  acceptable input length for the model if that argument is not provided.
-                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
-                  lengths).
-            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
-                Activates and controls truncation. Accepts the following values:
-
-                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
-                  to the maximum acceptable input length for the model if that argument is not provided. This will
-                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
-                  sequences (or a batch of pairs) is provided.
-                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
-                  greater than the model maximum admissible input size).
-            max_length (`int`, *optional*):
-                Controls the maximum length to use by one of the truncation/padding parameters.
-
-                If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
-                is required by one of the truncation/padding parameters. If the model has no specific maximum input
-                length (like XLNet) truncation/padding to a maximum length will be deactivated.
-            stride (`int`, *optional*, defaults to 0):
-                If set to a number along with `max_length`, the overflowing tokens returned when
-                `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
-                returned to provide some overlap between truncated and overflowing sequences. The value of this
-                argument defines the number of overlapping tokens.
-            pad_to_multiple_of (`int`, *optional*):
-                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
-                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
-            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
-                If set, will return tensors instead of list of python integers. Acceptable values are:
-
-                - `'pt'`: Return PyTorch `torch.Tensor` objects.
-                - `'np'`: Return Numpy `np.ndarray` objects.
-            return_token_type_ids (`bool`, *optional*):
-                Whether to return token type IDs. If left to the default, will return the token type IDs according to
-                the specific tokenizer's default, defined by the `return_outputs` attribute.
-
-                [What are token type IDs?](../glossary#token-type-ids)
-            return_attention_mask (`bool`, *optional*):
-                Whether to return the attention mask. If left to the default, will return the attention mask according
-                to the specific tokenizer's default, defined by the `return_outputs` attribute.
-
-                [What are attention masks?](../glossary#attention-mask)
-            return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
-                of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead
-                of returning overflowing tokens.
-            return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
-                Whether or not to return special tokens mask information.
-            return_offsets_mapping (`bool`, *optional*, defaults to `False`):
-                Whether or not to return `(char_start, char_end)` for each token.
-
-                This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using
-                Python's tokenizer, this method will raise `NotImplementedError`.
-            return_length  (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the lengths of the encoded inputs.
-            verbose (`bool`, *optional*, defaults to `True`):
-                Whether or not to print more information and warnings.
-            **kwargs: passed to the `self.tokenize()` method
-
-        Return:
-            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
-
-            - **input_ids** -- List of token ids to be fed to a model.
-
-              [What are input IDs?](../glossary#input-ids)
-
-            - **bbox** -- List of bounding boxes to be fed to a model.
-
-            - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True` or
-              if *"token_type_ids"* is in `self.model_input_names`).
-
-              [What are token type IDs?](../glossary#token-type-ids)
-
-            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
-              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
-
-              [What are attention masks?](../glossary#attention-mask)
-
-            - **labels** -- List of labels to be fed to a model. (when `word_labels` is specified).
-            - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
-              `return_overflowing_tokens=True`).
-            - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
-              `return_overflowing_tokens=True`).
-            - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
-              regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
-            - **length** -- The length of the inputs (when `return_length=True`).
-"""
-
-
-class UdopTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" UDOP tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
-    [`LayoutXLMTokenizer`] and [`T5Tokenizer`]. Based on
-    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`, *optional*):
-            Path to the vocabulary file.
-
-        tokenizer_file (`str`, *optional*):
-            Path to the tokenizer file.
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        sep_token_box (`list[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`):
-            The bounding box to use for the special [SEP] token.
-        pad_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
-            The bounding box to use for the special [PAD] token.
-        pad_token_label (`int`, *optional*, defaults to -100):
-            The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
-            CrossEntropyLoss.
-        only_label_first_subword (`bool`, *optional*, defaults to `True`):
-            Whether or not to only label the first subword, in case word labels are provided.
-        additional_special_tokens (`list[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`):
-            Additional special tokens used by the tokenizer.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = UdopTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        eos_token="",
-        sep_token="",
-        unk_token="",
-        pad_token="",
-        sep_token_box=[1000, 1000, 1000, 1000],
-        pad_token_box=[0, 0, 0, 0],
-        pad_token_label=-100,
-        only_label_first_subword=True,
-        additional_special_tokens=None,
-        **kwargs,
-    ):
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            eos_token=eos_token,
-            sep_token=sep_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            sep_token_box=sep_token_box,
-            pad_token_box=pad_token_box,
-            pad_token_label=pad_token_label,
-            only_label_first_subword=only_label_first_subword,
-            additional_special_tokens=additional_special_tokens,
-            **kwargs,
-        )
-
-        self.vocab_file = vocab_file
-
-        # additional properties
-        self.sep_token_box = sep_token_box
-        self.pad_token_box = pad_token_box
-        self.pad_token_label = pad_token_label
-        self.only_label_first_subword = only_label_first_subword
-
-    @add_end_docstrings(UDOP_ENCODE_KWARGS_DOCSTRING)
-    def __call__(
-        self,
-        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
-        text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
-        boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
-        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
-        text_target: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
-        text_pair_target: Optional[
-            Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]
-        ] = None,
-        **kwargs,
-    ) -> BatchEncoding:
-        if text is None and text_target is None:
-            raise ValueError("You need to specify either `text` or `text_target`.")
-        if text is not None:
-            # The context manager will send the inputs as normal texts and not text_target, but we shouldn't change the
-            # input mode in this case.
-            if not self._in_target_context_manager:
-                self._switch_to_input_mode()
-            encodings = self.call_boxes(text=text, text_pair=text_pair, boxes=boxes, word_labels=word_labels, **kwargs)
-        if text_target is not None:
-            self._switch_to_target_mode()
-            target_encodings = self._call_one(text=text_target, text_pair=text_pair_target, **kwargs)
-        # Leave back tokenizer in input mode
-        self._switch_to_input_mode()
-
-        if text_target is None:
-            return encodings
-        elif text is None:
-            return target_encodings
-        else:
-            encodings["labels"] = target_encodings["input_ids"]
-            return encodings
-
-    @add_end_docstrings(UDOP_ENCODE_KWARGS_DOCSTRING)
-    def call_boxes(
-        self,
-        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
-        text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
-        boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
-        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        """
-        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
-        sequences with word-level normalized bounding boxes and optional labels.
-
-        Args:
-            text (`str`, `list[str]`, `list[list[str]]`):
-                The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
-                (words of a single example or questions of a batch of examples) or a list of list of strings (batch of
-                words).
-            text_pair (`list[str]`, `list[list[str]]`):
-                The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
-                (pretokenized string).
-            boxes (`list[list[int]]`, `list[list[list[int]]]`):
-                Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
-            word_labels (`list[int]`, `list[list[int]]`, *optional*):
-                Word-level integer labels (for token classification tasks such as FUNSD, CORD).
-        """
-
-        # Input type checking for clearer error
-        def _is_valid_text_input(t):
-            if isinstance(t, str):
-                # Strings are fine
-                return True
-            elif isinstance(t, (list, tuple)):
-                # List are fine as long as they are...
-                if len(t) == 0:
-                    # ... empty
-                    return True
-                elif isinstance(t[0], str):
-                    # ... list of strings
-                    return True
-                elif isinstance(t[0], (list, tuple)):
-                    # ... list with an empty list or with a list of strings
-                    return len(t[0]) == 0 or isinstance(t[0][0], str)
-                else:
-                    return False
-            else:
-                return False
-
-        if text_pair is not None:
-            # in case text + text_pair are provided, text = questions, text_pair = words
-            if not _is_valid_text_input(text):
-                raise ValueError("text input must of type `str` (single example) or `list[str]` (batch of examples). ")
-            if not isinstance(text_pair, (list, tuple)):
-                raise ValueError(
-                    "words must of type `list[str]` (single pretokenized example), "
-                    "or `list[list[str]]` (batch of pretokenized examples)."
-                )
-        else:
-            # in case only text is provided => must be words
-            if not isinstance(text, (list, tuple)):
-                raise ValueError(
-                    "Words must of type `list[str]` (single pretokenized example), "
-                    "or `list[list[str]]` (batch of pretokenized examples)."
-                )
-
-        if text_pair is not None:
-            is_batched = isinstance(text, (list, tuple))
-        else:
-            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
-
-        words = text if text_pair is None else text_pair
-        if boxes is None:
-            raise ValueError("You must provide corresponding bounding boxes")
-        if is_batched:
-            if len(words) != len(boxes):
-                raise ValueError("You must provide words and boxes for an equal amount of examples")
-            for words_example, boxes_example in zip(words, boxes):
-                if len(words_example) != len(boxes_example):
-                    raise ValueError("You must provide as many words as there are bounding boxes")
-        else:
-            if len(words) != len(boxes):
-                raise ValueError("You must provide as many words as there are bounding boxes")
-
-        if is_batched:
-            if text_pair is not None and len(text) != len(text_pair):
-                raise ValueError(
-                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
-                    f" {len(text_pair)}."
-                )
-            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
-            is_pair = bool(text_pair is not None)
-            return self.batch_encode_plus_boxes(
-                batch_text_or_text_pairs=batch_text_or_text_pairs,
-                is_pair=is_pair,
-                boxes=boxes,
-                word_labels=word_labels,
-                add_special_tokens=add_special_tokens,
-                padding=padding,
-                truncation=truncation,
-                max_length=max_length,
-                stride=stride,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_tensors=return_tensors,
-                return_token_type_ids=return_token_type_ids,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_offsets_mapping=return_offsets_mapping,
-                return_length=return_length,
-                verbose=verbose,
-                **kwargs,
-            )
-        else:
-            return self.encode_plus_boxes(
-                text=text,
-                text_pair=text_pair,
-                boxes=boxes,
-                word_labels=word_labels,
-                add_special_tokens=add_special_tokens,
-                padding=padding,
-                truncation=truncation,
-                max_length=max_length,
-                stride=stride,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_tensors=return_tensors,
-                return_token_type_ids=return_token_type_ids,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_offsets_mapping=return_offsets_mapping,
-                return_length=return_length,
-                verbose=verbose,
-                **kwargs,
-            )
-
-    # Copied from transformers.models.layoutxlm.tokenization_layoutxlm_fast.LayoutXLMTokenizerFast.tokenize
-    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
-        batched_input = [(text, pair)] if pair else [text]
-
-        self._tokenizer.encode_special_tokens = kwargs.pop(
-            "split_special_tokens", self._tokenizer.encode_special_tokens
-        )
-
-        encodings = self._tokenizer.encode_batch(
-            batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
-        )
-
-        return encodings[0].tokens
-
-    def batch_encode_plus_boxes(
-        self,
-        batch_text_or_text_pairs: Union[
-            list[TextInput],
-            list[TextInputPair],
-            list[PreTokenizedInput],
-        ],
-        is_pair: Optional[bool] = None,
-        boxes: Optional[list[list[list[int]]]] = None,
-        word_labels: Optional[list[list[int]]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        is_split_into_words: bool = False,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        """
-        Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.
-
-        
-
-        This method is deprecated, `__call__` should be used instead.
-
-        
-
-        Args:
-            batch_text_or_text_pairs (`list[str]`, `list[tuple[str, str]]`, `list[list[str]]`, `list[tuple[list[str], list[str]]]`, and for not-fast tokenizers, also `list[list[int]]`, `list[tuple[list[int], list[int]]]`):
-                Batch of sequences or pair of sequences to be encoded. This can be a list of
-                string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see
-                details in `encode_plus`).
-        """
-
-        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
-        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            pad_to_multiple_of=pad_to_multiple_of,
-            verbose=verbose,
-            **kwargs,
-        )
-
-        return self._batch_encode_plus_boxes(
-            batch_text_or_text_pairs=batch_text_or_text_pairs,
-            is_pair=is_pair,
-            boxes=boxes,
-            word_labels=word_labels,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            is_split_into_words=is_split_into_words,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            **kwargs,
-        )
-
-    def _batch_encode_plus_boxes(
-        self,
-        batch_text_or_text_pairs: Union[
-            list[TextInput],
-            list[TextInputPair],
-            list[PreTokenizedInput],
-        ],
-        is_pair: Optional[bool] = None,
-        boxes: Optional[list[list[list[int]]]] = None,
-        word_labels: Optional[list[list[int]]] = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[str] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        if not isinstance(batch_text_or_text_pairs, list):
-            raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")
-
-        # Set the truncation and padding strategy and restore the initial configuration
-        self.set_truncation_and_padding(
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-        )
-
-        if is_pair:
-            batch_text_or_text_pairs = [(text.split(), text_pair) for text, text_pair in batch_text_or_text_pairs]
-
-        encodings = self._tokenizer.encode_batch(
-            batch_text_or_text_pairs,
-            add_special_tokens=add_special_tokens,
-            is_pretokenized=True,  # we set this to True as LayoutLMv2 always expects pretokenized inputs
-        )
-
-        # Convert encoding to dict
-        # `Tokens` has type: tuple[
-        #                       list[dict[str, list[list[int]]]] or list[dict[str, 2D-Tensor]],
-        #                       list[EncodingFast]
-        #                    ]
-        # with nested dimensions corresponding to batch, overflows, sequence length
-        tokens_and_encodings = [
-            self._convert_encoding(
-                encoding=encoding,
-                return_token_type_ids=return_token_type_ids,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_offsets_mapping=True
-                if word_labels is not None
-                else return_offsets_mapping,  # we use offsets to create the labels
-                return_length=return_length,
-                verbose=verbose,
-            )
-            for encoding in encodings
-        ]
-
-        # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
-        # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
-        # (we say ~ because the number of overflow varies with the example in the batch)
-        #
-        # To match each overflowing sample with the original sample in the batch
-        # we add an overflow_to_sample_mapping array (see below)
-        sanitized_tokens = {}
-        for key in tokens_and_encodings[0][0]:
-            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
-            sanitized_tokens[key] = stack
-        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
-
-        # If returning overflowing tokens, we need to return a mapping
-        # from the batch idx to the original sample
-        if return_overflowing_tokens:
-            overflow_to_sample_mapping = []
-            for i, (toks, _) in enumerate(tokens_and_encodings):
-                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
-            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping
-
-        for input_ids in sanitized_tokens["input_ids"]:
-            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
-
-        # create the token boxes
-        token_boxes = []
-        for batch_index in range(len(sanitized_tokens["input_ids"])):
-            if return_overflowing_tokens:
-                original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
-            else:
-                original_index = batch_index
-            token_boxes_example = []
-            for id, sequence_id, word_id in zip(
-                sanitized_tokens["input_ids"][batch_index],
-                sanitized_encodings[batch_index].sequence_ids,
-                sanitized_encodings[batch_index].word_ids,
-            ):
-                if word_id is not None:
-                    if is_pair and sequence_id == 0:
-                        token_boxes_example.append(self.pad_token_box)
-                    else:
-                        token_boxes_example.append(boxes[original_index][word_id])
-                else:
-                    if id == self.sep_token_id:
-                        token_boxes_example.append(self.sep_token_box)
-                    elif id == self.pad_token_id:
-                        token_boxes_example.append(self.pad_token_box)
-                    else:
-                        raise ValueError("Id not recognized")
-            token_boxes.append(token_boxes_example)
-
-        sanitized_tokens["bbox"] = token_boxes
-
-        # optionally, create the labels
-        if word_labels is not None:
-            labels = []
-            for batch_index in range(len(sanitized_tokens["input_ids"])):
-                if return_overflowing_tokens:
-                    original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
-                else:
-                    original_index = batch_index
-                labels_example = []
-                previous_token_empty = False
-                for id, offset, word_id in zip(
-                    sanitized_tokens["input_ids"][batch_index],
-                    sanitized_tokens["offset_mapping"][batch_index],
-                    sanitized_encodings[batch_index].word_ids,
-                ):
-                    if word_id is not None:
-                        if self.only_label_first_subword:
-                            if offset[0] == 0 and not previous_token_empty:
-                                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
-                                labels_example.append(word_labels[original_index][word_id])
-                            else:
-                                labels_example.append(self.pad_token_label)
-                        else:
-                            labels_example.append(word_labels[original_index][word_id])
-                        if self.decode(id) == "":
-                            previous_token_empty = True
-                        else:
-                            previous_token_empty = False
-                    else:
-                        labels_example.append(self.pad_token_label)
-                labels.append(labels_example)
-
-            sanitized_tokens["labels"] = labels
-            # finally, remove offsets if the user didn't want them
-            if not return_offsets_mapping:
-                del sanitized_tokens["offset_mapping"]
-
-        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
-
-    def _encode_plus_boxes(
-        self,
-        text: Union[TextInput, PreTokenizedInput],
-        text_pair: Optional[PreTokenizedInput] = None,
-        boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[int]] = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[bool] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        # make it a batched input
-        # 2 options:
-        # 1) only text, in case text must be a list of str
-        # 2) text + text_pair, in which case text = str and text_pair a list of str
-        batched_input = [(text, text_pair)] if text_pair else [text]
-        batched_boxes = [boxes]
-        batched_word_labels = [word_labels] if word_labels is not None else None
-        batched_output = self._batch_encode_plus_boxes(
-            batched_input,
-            is_pair=bool(text_pair is not None),
-            boxes=batched_boxes,
-            word_labels=batched_word_labels,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            **kwargs,
-        )
-
-        # Return tensor is None, then we can remove the leading batch axis
-        # Overflowing tokens are returned as a batch of output so we keep them in this case
-        if return_tensors is None and not return_overflowing_tokens:
-            batched_output = BatchEncoding(
-                {
-                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
-                    for key, value in batched_output.items()
-                },
-                batched_output.encodings,
-            )
-
-        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
-
-        return batched_output
-
-    def encode_boxes(
-        self,
-        text: Union[TextInput, PreTokenizedInput, EncodedInput],
-        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
-        boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[list[int]]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        **kwargs,
-    ) -> list[int]:
-        """
-        Args:
-        Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary. Same as doing
-        `self.convert_tokens_to_ids(self.tokenize(text))`.
-            text (`str`, `list[str]` or `list[int]`):
-                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
-                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
-                method).
-            text_pair (`str`, `list[str]` or `list[int]`, *optional*):
-                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
-                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
-                method).
-        """
-        encoded_inputs = self.encode_plus_boxes(
-            text,
-            text_pair=text_pair,
-            boxes=boxes,
-            word_labels=word_labels,
-            add_special_tokens=add_special_tokens,
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            stride=stride,
-            return_tensors=return_tensors,
-            **kwargs,
-        )
-
-        return encoded_inputs["input_ids"]
-
-    def encode_plus_boxes(
-        self,
-        text: Union[TextInput, PreTokenizedInput],
-        text_pair: Optional[PreTokenizedInput] = None,
-        boxes: Optional[list[list[int]]] = None,
-        word_labels: Optional[list[list[int]]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        is_split_into_words: bool = False,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        """
-        Tokenize and prepare for the model a sequence or a pair of sequences.
-
-        
-
-        This method is deprecated, `__call__` should be used instead.
-
-        
-
-        Args:
-            text (`str`, `list[str]` or (for non-fast tokenizers) `list[int]`):
-                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
-                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
-                method).
-            text_pair (`str`, `list[str]` or `list[int]`, *optional*):
-                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
-                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
-                method).
-        """
-
-        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
-        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            pad_to_multiple_of=pad_to_multiple_of,
-            verbose=verbose,
-            **kwargs,
-        )
-
-        return self._encode_plus_boxes(
-            text=text,
-            text_pair=text_pair,
-            boxes=boxes,
-            word_labels=word_labels,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            is_split_into_words=is_split_into_words,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            **kwargs,
-        )
-
-    # Copied from transformers.models.layoutxlm.tokenization_layoutxlm_fast.LayoutXLMTokenizerFast._pad
-    def _pad(
-        self,
-        encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
-        max_length: Optional[int] = None,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """
-        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
-
-        Args:
-            encoded_inputs:
-                Dictionary of tokenized inputs (`list[int]`) or batch of tokenized inputs (`list[list[int]]`).
-            max_length: maximum length of the returned list and optionally padding length (see below).
-                Will truncate by taking into account the special tokens.
-            padding_strategy: PaddingStrategy to use for padding.
-
-                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
-                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
-                - PaddingStrategy.DO_NOT_PAD: Do not pad
-                The tokenizer padding sides are defined in self.padding_side:
-
-                    - 'left': pads on the left of the sequences
-                    - 'right': pads on the right of the sequences
-            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
-                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
-                `>= 7.5` (Volta).
-            padding_side (`str`, *optional*):
-                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
-                Default value is picked from the class attribute of the same name.
-            return_attention_mask:
-                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
-        """
-        # Load from model defaults
-        if return_attention_mask is None:
-            return_attention_mask = "attention_mask" in self.model_input_names
-
-        required_input = encoded_inputs[self.model_input_names[0]]
-
-        if padding_strategy == PaddingStrategy.LONGEST:
-            max_length = len(required_input)
-
-        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
-            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
-
-        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
-
-        # Initialize attention mask if not present.
-        if return_attention_mask and "attention_mask" not in encoded_inputs:
-            encoded_inputs["attention_mask"] = [1] * len(required_input)
-
-        if needs_to_be_padded:
-            difference = max_length - len(required_input)
-            padding_side = padding_side if padding_side is not None else self.padding_side
-            if padding_side == "right":
-                if return_attention_mask:
-                    encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
-                if "token_type_ids" in encoded_inputs:
-                    encoded_inputs["token_type_ids"] = (
-                        encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
-                    )
-                if "bbox" in encoded_inputs:
-                    encoded_inputs["bbox"] = encoded_inputs["bbox"] + [self.pad_token_box] * difference
-                if "labels" in encoded_inputs:
-                    encoded_inputs["labels"] = encoded_inputs["labels"] + [self.pad_token_label] * difference
-                if "special_tokens_mask" in encoded_inputs:
-                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
-                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
-            elif padding_side == "left":
-                if return_attention_mask:
-                    encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
-                if "token_type_ids" in encoded_inputs:
-                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
-                        "token_type_ids"
-                    ]
-                if "bbox" in encoded_inputs:
-                    encoded_inputs["bbox"] = [self.pad_token_box] * difference + encoded_inputs["bbox"]
-                if "labels" in encoded_inputs:
-                    encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["labels"]
-                if "special_tokens_mask" in encoded_inputs:
-                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
-                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
-            else:
-                raise ValueError("Invalid padding strategy:" + str(padding_side))
-
-        return encoded_inputs
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An XLM-RoBERTa sequence has the following format:
-
-        - single sequence: ` X `
-        - pair of sequences: ` A  B `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-
-        if token_ids_1 is None:
-            return token_ids_0 + [self.sep_token_id]
-        sep = [self.sep_token_id]
-        return token_ids_0 + sep + token_ids_1 + sep
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
-        not make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-
-        """
-
-        sep = [self.sep_token_id]
-
-        if token_ids_1 is None:
-            return len(token_ids_0 + sep) * [0]
-        return len(token_ids_0 + sep + token_ids_1 + sep) * [0]
-
-    # Copied from transformers.models.layoutxlm.tokenization_layoutxlm_fast.LayoutXLMTokenizerFast.save_vocabulary
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not self.can_save_slow_tokenizer:
-            raise ValueError(
-                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
-                "tokenizer."
-            )
-
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
-
-
-__all__ = ["UdopTokenizerFast"]
diff --git a/src/transformers/models/vits/tokenization_vits.py b/src/transformers/models/vits/tokenization_vits.py
index cf2d47bc4e88..3e78ed436349 100644
--- a/src/transformers/models/vits/tokenization_vits.py
+++ b/src/transformers/models/vits/tokenization_vits.py
@@ -19,7 +19,7 @@
 import re
 from typing import Any, Optional, Union
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_python import PreTrainedTokenizer
 from ...utils import is_phonemizer_available, is_uroman_available, logging
 
 
@@ -100,6 +100,7 @@ def __init__(
             normalize=normalize,
             phonemize=phonemize,
             is_uroman=is_uroman,
+            special_tokens_pattern="none",
             **kwargs,
         )
 
@@ -222,7 +223,9 @@ def convert_tokens_to_string(self, tokens: list[str]) -> str:
 
     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
+        if token in self.encoder:
+            return self.encoder[token]
+        return self.unk_token_id
 
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
diff --git a/src/transformers/models/voxtral/convert_voxtral_weights_to_hf.py b/src/transformers/models/voxtral/convert_voxtral_weights_to_hf.py
index ab28526ba198..887794dd99b4 100644
--- a/src/transformers/models/voxtral/convert_voxtral_weights_to_hf.py
+++ b/src/transformers/models/voxtral/convert_voxtral_weights_to_hf.py
@@ -23,7 +23,7 @@
 from safetensors.torch import load_file
 
 from transformers import (
-    MistralCommonTokenizer,
+    MistralCommonBackend,
     VoxtralConfig,
     VoxtralForConditionalGeneration,
     VoxtralProcessor,
@@ -235,7 +235,7 @@ def write_model(
 
 
 def write_processor(input_path_or_repo: str, feature_extractor_path_or_repo: str, output_dir: str):
-    tokenizer = MistralCommonTokenizer.from_pretrained(input_path_or_repo)
+    tokenizer = MistralCommonBackend.from_pretrained(input_path_or_repo)
     feature_extractor = WhisperFeatureExtractor.from_pretrained(feature_extractor_path_or_repo)
 
     print("Creating the processor...")
diff --git a/src/transformers/models/voxtral/processing_voxtral.py b/src/transformers/models/voxtral/processing_voxtral.py
index f07904a89690..a6a28048de59 100644
--- a/src/transformers/models/voxtral/processing_voxtral.py
+++ b/src/transformers/models/voxtral/processing_voxtral.py
@@ -64,13 +64,13 @@ class VoxtralProcessorKwargs(ProcessingKwargs, total=False):
 class VoxtralProcessor(ProcessorMixin):
     r"""
     Constructs a Voxtral processor which wraps [`WhisperFeatureExtractor`] and
-    [`MistralCommonTokenizer`] into a single processor that inherits both the audio feature extraction and
+    [`MistralCommonBackend`] into a single processor that inherits both the audio feature extraction and
     tokenizer functionalities.
 
     Args:
         feature_extractor ([`WhisperFeatureExtractor`]):
             The feature extractor is a required input.
-        tokenizer ([`MistralCommonTokenizer`]):
+        tokenizer ([`MistralCommonBackend`]):
             The tokenizer is a required input.
     """
 
@@ -107,8 +107,8 @@ def apply_chat_template(
         **kwargs: Unpack[AllKwargsForChatTemplate],
     ) -> str:
         """
-        This method applies the model's chat completion template given a conversation. It relies on MistralCommonTokenizer's
-        [`~MistralCommonTokenizer.apply_chat_template`] to prepare input ids to the model and on WhisperFeatureExtractor's
+        This method applies the model's chat completion template given a conversation. It relies on MistralCommonBackend's
+        [`~MistralCommonBackend.apply_chat_template`] to prepare input ids to the model and on WhisperFeatureExtractor's
         [`~WhisperFeatureExtractor.__call__`] to prepare input features to the model.
 
         Note that audio is padded to the nearest 30-second multiple prior to mel feature extraction.
@@ -233,7 +233,7 @@ def __call__(
     ):
         r"""
         Method to prepare text to be fed as input to the model. This method forwards the `text`
-        arguments to MistralCommonTokenizer's [`~MistralCommonTokenizer.__call__`] to encode
+        arguments to MistralCommonBackend's [`~MistralCommonBackend.__call__`] to encode
         the text. Please refer to the docstring of the above methods for more information.
         This methods does not support audio. To prepare the audio, please use:
         1. `apply_chat_template` [`~VoxtralProcessor.apply_chat_template`] method.
@@ -282,7 +282,7 @@ def apply_transcription_request(
     ):
         """
         This method applies the model's transcription request template given a language and audio.
-        It relies on MistralCommonTokenizer and WhisperFeatureExtractor to prepare input ids and input features to the model.
+        It relies on MistralCommonBackend and WhisperFeatureExtractor to prepare input ids and input features to the model.
 
         ```python
         from transformers import VoxtralProcessor
diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
index e9f9ce04b1ba..104cdf92a099 100644
--- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
@@ -23,7 +23,7 @@
 
 import numpy as np
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_python import PreTrainedTokenizer
 from ...tokenization_utils_base import AddedToken, BatchEncoding
 from ...utils import (
     ModelOutput,
@@ -177,9 +177,9 @@ def __init__(
             word_delimiter_token=word_delimiter_token,
             replace_word_delimiter_char=replace_word_delimiter_char,
             target_lang=target_lang,
+            special_tokens_pattern="none",
             **kwargs,
         )
-
         # make sure that tokens made of several
         # characters are not split at tokenization
         for token in self.encoder:
@@ -201,6 +201,11 @@ def set_target_lang(self, target_lang: str):
         self.encoder = self.vocab[target_lang]
         self.decoder = {v: k for k, v in self.encoder.items()}
 
+        # Remove conflicting entries from _added_tokens_decoder so vocabulary tokens take precedence
+        for token_id in list(self._added_tokens_decoder.keys()):
+            if token_id in self.decoder:
+                del self._added_tokens_decoder[token_id]
+
         # make sure that tokens made of several
         # characters are not split at tokenization
         for token in self.encoder:
@@ -273,6 +278,28 @@ def _convert_id_to_token(self, index: int) -> str:
         result = self.decoder.get(index, self.unk_token)
         return result
 
+    def convert_ids_to_tokens(
+        self, ids: Union[int, list[int]], skip_special_tokens: bool = False
+    ) -> Union[str, list[str]]:
+        """Overridden to prioritize vocabulary tokens over added tokens for nested vocabularies."""
+        if isinstance(ids, int):
+            if ids in self.decoder:
+                return self.decoder[ids]
+            return self._added_tokens_decoder[ids].content if ids in self._added_tokens_decoder else self.unk_token
+
+        tokens = []
+        for index in ids:
+            index = int(index)
+            if skip_special_tokens and index in self.all_special_ids:
+                continue
+            if index in self.decoder:
+                tokens.append(self.decoder[index])
+            elif index in self._added_tokens_decoder:
+                tokens.append(self._added_tokens_decoder[index].content)
+            else:
+                tokens.append(self.unk_token)
+        return tokens
+
     def convert_tokens_to_string(
         self,
         tokens: list[str],
@@ -336,6 +363,31 @@ def convert_tokens_to_string(
 
         return {"text": string, "char_offsets": char_offsets, "word_offsets": word_offsets}
 
+    @staticmethod
+    def clean_up_tokenization(out_string: str) -> str:
+        """
+        Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms.
+
+        Args:
+            out_string (`str`): The text to clean up.
+
+        Returns:
+            `str`: The cleaned-up string.
+        """
+        out_string = (
+            out_string.replace(" .", ".")
+            .replace(" ?", "?")
+            .replace(" !", "!")
+            .replace(" ,", ",")
+            .replace(" ' ", "'")
+            .replace(" n't", "n't")
+            .replace(" 'm", "'m")
+            .replace(" 's", "'s")
+            .replace(" 've", "'ve")
+            .replace(" 're", "'re")
+        )
+        return out_string
+
     @staticmethod
     def _compute_offsets(
         char_repetitions: list[int], chars: list[str], ctc_token: int
@@ -406,13 +458,12 @@ def _decode(
         same as tokens of the base vocabulary and therefore the function `convert_tokens_to_string` has to be called on
         the whole token list and not individually on added tokens
         """
-        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
+        # Don't skip special tokens in convert_ids_to_tokens so we can handle word_delimiter_token specially
+        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=False)
 
         result = []
         for token in filtered_tokens:
-            if skip_special_tokens and (
-                token in self.all_special_ids or (token != self.pad_token and token in self.all_special_tokens)
-            ):
+            if skip_special_tokens and token in self.all_special_tokens and token != self.word_delimiter_token:
                 continue
             result.append(token)
 
@@ -863,6 +914,31 @@ def convert_tokens_to_string(self, tokens: list[str]) -> str:
 
         return string
 
+    @staticmethod
+    def clean_up_tokenization(out_string: str) -> str:
+        """
+        Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms.
+
+        Args:
+            out_string (`str`): The text to clean up.
+
+        Returns:
+            `str`: The cleaned-up string.
+        """
+        out_string = (
+            out_string.replace(" .", ".")
+            .replace(" ?", "?")
+            .replace(" !", "!")
+            .replace(" ,", ",")
+            .replace(" ' ", "'")
+            .replace(" n't", "n't")
+            .replace(" 'm", "'m")
+            .replace(" 's", "'s")
+            .replace(" 've", "'ve")
+            .replace(" 're", "'re")
+        )
+        return out_string
+
     def _decode(
         self,
         token_ids: list[int],
@@ -875,13 +951,12 @@ def _decode(
         same as tokens of the base vocabulary and therefore the function `convert_tokens_to_string` has to be called on
         the whole token list and not individually on added tokens
         """
-        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
+        # Don't skip special tokens in convert_ids_to_tokens so we can handle word_delimiter_token specially
+        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=False)
 
         result = []
         for token in filtered_tokens:
-            if skip_special_tokens and (
-                token in self.all_special_ids or (token != self.pad_token and token in self.all_special_tokens)
-            ):
+            if skip_special_tokens and token in self.all_special_tokens and token != self.word_delimiter_token:
                 continue
             result.append(token)
 
diff --git a/src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py b/src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py
index 780dedd8ac27..fc6daaeee71d 100644
--- a/src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py
+++ b/src/transformers/models/wav2vec2_bert/convert_wav2vec2_seamless_checkpoint.py
@@ -150,7 +150,7 @@ def convert_wav2vec2_bert_checkpoint(
 
     # save feature extractor
     fe = SeamlessM4TFeatureExtractor(padding_value=1)
-    fe._set_processor_class("Wav2Vec2BertProcessor")
+    fe._processor_class = "Wav2Vec2BertProcessor"
     fe.save_pretrained(pytorch_dump_folder_path)
 
     if repo_id:
diff --git a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py
index c819e63fd6cf..67eaf2239e6f 100644
--- a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py
+++ b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py
@@ -22,7 +22,7 @@
 
 import numpy as np
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_python import PreTrainedTokenizer
 from ...tokenization_utils_base import AddedToken
 from ...utils import (
     ModelOutput,
diff --git a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
index 71973334dfd6..3d8f451a2963 100644
--- a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
+++ b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
@@ -37,7 +37,7 @@
     from pyctcdecode import BeamSearchDecoderCTC
 
     from ...feature_extraction_utils import FeatureExtractionMixin
-    from ...tokenization_utils import PreTrainedTokenizerBase
+    from ...tokenization_python import PreTrainedTokenizerBase
 
 
 ListOfDict = list[dict[str, Union[int, str]]]
diff --git a/src/transformers/models/whisper/__init__.py b/src/transformers/models/whisper/__init__.py
index ec73ac2b8fe9..50aec31b3e9f 100644
--- a/src/transformers/models/whisper/__init__.py
+++ b/src/transformers/models/whisper/__init__.py
@@ -23,7 +23,6 @@
     from .modeling_whisper import *
     from .processing_whisper import *
     from .tokenization_whisper import *
-    from .tokenization_whisper_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py
index d2a9914aa822..889fab69af74 100644
--- a/src/transformers/models/whisper/tokenization_whisper.py
+++ b/src/transformers/models/whisper/tokenization_whisper.py
@@ -16,17 +16,22 @@
 
 import json
 import os
+import re
+import warnings
 from functools import lru_cache
-from typing import Optional, Union
+from typing import Optional
 
 import numpy as np
-import regex as re
+from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, processors
+from tokenizers.models import BPE
 
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
 from .english_normalizer import BasicTextNormalizer, EnglishTextNormalizer
 
 
+logger = logging.get_logger(__name__)
+
 VOCAB_FILES_NAMES = {
     "vocab_file": "vocab.json",
     "tokenizer_file": "tokenizer.json",
@@ -35,54 +40,6 @@
 }
 
 
-MAX_MODEL_INPUT_SIZES = {
-    "openai/whisper-base": 448,
-}
-
-
-# Copied from transformers.models.gpt2.tokenization_gpt2.bytes_to_unicode
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
-    characters the bpe code barfs on.
-
-    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
-    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
-    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
-    tables between utf-8 bytes and unicode strings.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-logger = logging.get_logger(__name__)
-
-
-# Copied from transformers.models.gpt2.tokenization_gpt2.get_pairs
-def get_pairs(word):
-    """
-    Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-
 LANGUAGES = {
     "en": "english",
     "zh": "chinese",
@@ -206,23 +163,23 @@ def get_pairs(word):
 TASK_IDS = ["translate", "transcribe"]
 
 
-class WhisperTokenizer(PreTrainedTokenizer):
+class WhisperTokenizer(TokenizersBackend):
     """
-    Construct a Whisper tokenizer.
+    Construct a "fast" Whisper tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods. Users should refer to
-    the superclass for more information regarding such methods.
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
+        vocab_file (`str`, *optional*):
             Path to the vocabulary file.
-        merges_file (`str`):
+        merges_file (`str`, *optional*):
             Path to the merges file.
         normalizer_file (`str`, *optional*):
             Path to the normalizer_file file.
-        errors (`str`, *optional*, defaults to `"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See
-            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        tokenizer_file (`str`, *optional*):
+            Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
+            contains everything needed to load the tokenizer.
         unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
@@ -231,11 +188,9 @@ class WhisperTokenizer(PreTrainedTokenizer):
             `"<|startoftranscript|>"` when generating.
         eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
             The end of sequence token.
-        pad_token (`str`, *optional*):
-            The token used for padding, for example when batching sequences of different lengths.
         add_prefix_space (`bool`, *optional*, defaults to `False`):
             Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word.
+            other word. (Whisper tokenizer detect beginning of words by the preceding space).
         language (`str`, *optional*):
             The language of the transcription text. The corresponding language id token is appended to the start of the
             sequence for multilingual speech recognition and speech translation tasks, e.g. for Spanish the token
@@ -252,14 +207,12 @@ class WhisperTokenizer(PreTrainedTokenizer):
 
     def __init__(
         self,
-        vocab_file,
-        merges_file,
+        vocab=None,
+        merges=None,
         normalizer_file=None,
-        errors="replace",
         unk_token="<|endoftext|>",
         bos_token="<|endoftext|>",
         eos_token="<|endoftext|>",
-        pad_token=None,
         add_prefix_space=False,
         language=None,
         task=None,
@@ -281,24 +234,36 @@ def __init__(
             if isinstance(unk_token, str)
             else unk_token
         )
-        pad_token = (
-            AddedToken(pad_token, lstrip=False, rstrip=False, normalized=False, special=True)
-            if isinstance(pad_token, str)
-            else pad_token
+
+        self._vocab = vocab if vocab is not None else {}
+        self._merges = merges if merges is not None else []
+
+        self._tokenizer = Tokenizer(
+            BPE(
+                vocab=self._vocab,
+                merges=self._merges,
+                dropout=None,
+                continuing_subword_prefix="",
+                end_of_word_suffix="",
+                fuse_unk=False,
+            )
         )
 
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            bpe_merges = merges_handle.read().split("\n")[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        self.cache = {}
-        self.add_prefix_space = add_prefix_space
+        self._tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
+        self._tokenizer.decoder = decoders.ByteLevel()
+
+        super().__init__(
+            tokenizer_object=self._tokenizer,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            add_prefix_space=add_prefix_space,
+            normalizer_file=normalizer_file,
+            language=language,
+            task=task,
+            predict_timestamps=predict_timestamps,
+            **kwargs,
+        )
 
         if normalizer_file is not None:
             with open(normalizer_file, encoding="utf-8") as vocab_handle:
@@ -306,215 +271,25 @@ def __init__(
         else:
             self.english_spelling_normalizer = None
 
-        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
-        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
         self.timestamp_pat = re.compile(r"<\|(\d+\.\d+)\|>")
 
         self.language = language
-        super().__init__(
-            errors=errors,
-            unk_token=unk_token,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            pad_token=pad_token,
-            add_prefix_space=add_prefix_space,
-            **kwargs,
-        )
-
         self.task = task
         self.predict_timestamps = predict_timestamps
 
-    @property
-    def vocab_size(self) -> int:
-        return len(self.encoder)
-
-    def get_vocab(self):
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.bpe with GPT2 -> Whisper
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                except ValueError:
-                    new_word.extend(word[i:])
-                    break
-                else:
-                    new_word.extend(word[i:j])
-                    i = j
-
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        self.cache[token] = word
-        return word
-
-    def set_prefix_tokens(
-        self, language: Optional[str] = None, task: Optional[str] = None, predict_timestamps: Optional[bool] = None
-    ):
-        """
-        Override the prefix tokens appended to the start of the label sequence. This method can be used standalone to
-        update the prefix tokens as required when fine-tuning. Example:
-
-        ```python
-        >>> # instantiate the tokenizer and set the prefix token to Spanish
-        >>> tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny", language="spanish")
-        >>> # now switch the prefix token from Spanish to French
-        >>> tokenizer.set_prefix_tokens(language="french")
-        ```
-
-        Args:
-            language (`str`, *optional*, defaults to `None`):
-                The language of the transcription text.
-            task (`str`, *optional*, defaults to `None`):
-                Task identifier to append at the start of sequence (if any).
-            predict_timestamps (`bool`, *optional*, defaults to `None`):
-                Whether to omit the `<|notimestamps|>` token at the start of the sequence.
-        """
-        self.language = language if language is not None else self.language
-        self.task = task if task is not None else self.task
-        self.predict_timestamps = predict_timestamps if predict_timestamps is not None else self.predict_timestamps
+        self._post_init()
 
-    @property
-    def prefix_tokens(self) -> list[int]:
-        bos_token_id = self.convert_tokens_to_ids("<|startoftranscript|>")
-        translate_token_id = self.convert_tokens_to_ids("<|translate|>")
-        transcribe_token_id = self.convert_tokens_to_ids("<|transcribe|>")
-        notimestamps_token_id = self.convert_tokens_to_ids("<|notimestamps|>")
-        langs = tuple(LANGUAGES.keys())
-
-        if self.language is not None:
-            self.language = self.language.lower()
-            if self.language in TO_LANGUAGE_CODE:
-                language_id = TO_LANGUAGE_CODE[self.language]
-            elif self.language in TO_LANGUAGE_CODE.values():
-                language_id = self.language
-            else:
-                is_language_code = len(self.language) == 2
-                raise ValueError(
-                    f"Unsupported language: {self.language}. Language should be one of:"
-                    f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}."
+    def _post_init(self):
+        """Post-initialization hook to set up prefix tokens after the tokenizer is fully loaded."""
+        super()._post_init()
+        # Set up prefix tokens if language or task is specified (may be set from config in from_pretrained)
+        if hasattr(self, "language") and hasattr(self, "task") and hasattr(self, "predict_timestamps"):
+            if self.language is not None or self.task is not None:
+                self.set_prefix_tokens(
+                    language=self.language, task=self.task, predict_timestamps=self.predict_timestamps
                 )
 
-        if self.task is not None:
-            if self.task not in TASK_IDS:
-                raise ValueError(f"Unsupported task: {self.task}. Task should be in: {TASK_IDS}")
-
-        bos_sequence = [bos_token_id]
-        if self.language is not None:
-            bos_sequence.append(bos_token_id + 1 + langs.index(language_id))
-        if self.task is not None:
-            bos_sequence.append(transcribe_token_id if self.task == "transcribe" else translate_token_id)
-        if not self.predict_timestamps:
-            bos_sequence.append(notimestamps_token_id)
-        return bos_sequence
-
-    # Copied from transformers.models.speech_to_text.tokenization_speech_to_text.Speech2TextTokenizer.build_inputs_with_special_tokens
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> list[int]:
-        """Build model inputs from a sequence by appending eos_token_id."""
-        if token_ids_1 is None:
-            return self.prefix_tokens + token_ids_0 + [self.eos_token_id]
-        # We don't expect to process pairs, but leave the pair logic for API consistency
-        return self.prefix_tokens + token_ids_0 + token_ids_1 + [self.eos_token_id]
-
-    # Copied from transformers.models.speech_to_text.tokenization_speech_to_text.Speech2TextTokenizer.get_special_tokens_mask
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        prefix_ones = [1] * len(self.prefix_tokens)
-        suffix_ones = [1]
-        if token_ids_1 is None:
-            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
-        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._tokenize with GPT2 -> Whisper
-    def _tokenize(self, text):
-        """Tokenize a string."""
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            token = "".join(
-                self.byte_encoder[b] for b in token.encode("utf-8")
-            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
-            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
-        return bpe_tokens
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer._convert_token_to_id with GPT2 -> Whisper
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    def _convert_id_to_token(self, index):
-        """
-        Converts an index (integer) in a token (str) using the vocab. Whisper's base tokenizer always decodes OOV
-        tokens as "", thus we do not use the `unk_token` here.
-        """
-        return self.decoder.get(index, "")
-
-    def normalize(self, text):
-        """
-        Normalize a given string using the `EnglishTextNormalizer` class, which performs commons transformation on
-        english text.
-        """
-        normalizer = EnglishTextNormalizer(self.english_spelling_normalizer)
-        return normalizer(text)
-
-    @staticmethod
-    def basic_normalize(text, remove_diacritics=False):
-        """
-        Normalize a given string using the `BasicTextNormalizer` class, which performs commons transformation on
-        multilingual text.
-        """
-        normalizer = BasicTextNormalizer(remove_diacritics=remove_diacritics)
-        return normalizer(text)
-
+    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._decode_with_timestamps
     def _decode_with_timestamps(
         self, token_ids, skip_special_tokens=False, time_precision=0.02, segment_size=1500
     ) -> str:
@@ -552,11 +327,18 @@ def _decode_with_timestamps(
                 outputs.append([])
             else:
                 outputs[-1].append(token)
-        outputs = [
-            s if isinstance(s, str) else self.decode(s, skip_special_tokens=skip_special_tokens) for s in outputs
-        ]
-        return "".join(outputs)
+        # Decode token sequences outside list comprehension to avoid super() resolution issues
+        decoded_outputs = []
+        for s in outputs:
+            if isinstance(s, str):
+                decoded_outputs.append(s)
+            elif s:
+                decoded_outputs.append(super().decode(s, skip_special_tokens=skip_special_tokens))
+            else:
+                decoded_outputs.append("")
+        return "".join(decoded_outputs)
 
+    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._compute_offsets
     def _compute_offsets(self, token_ids, time_precision=0.02, segment_size=1500):
         """
         Compute offsets for a given tokenized input
@@ -626,6 +408,7 @@ def _compute_offsets(self, token_ids, time_precision=0.02, segment_size=1500):
         return offsets
 
     @lru_cache
+    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.timestamp_ids
     def timestamp_ids(self, time_precision=0.02):
         """
         Compute the timestamp token ids for a given precision and save to least-recently used (LRU) cache.
@@ -636,6 +419,7 @@ def timestamp_ids(self, time_precision=0.02):
         """
         return self.convert_tokens_to_ids([("<|%.2f|>" % (i * time_precision)) for i in range(1500 + 1)])
 
+    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._preprocess_token_ids
     def _preprocess_token_ids(self, token_ids, skip_special_tokens: bool = False):
         """
         Pre-process the token ids for decoding by removing the prompt tokens ids and timestamp token ids.
@@ -654,9 +438,11 @@ def _preprocess_token_ids(self, token_ids, skip_special_tokens: bool = False):
 
         return token_ids
 
-    def _filter_timestamp_ids(self, token_ids):
-        return re.sub(self.timestamp_pat, "", token_ids)
+    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._filter_timestamp_ids
+    def _filter_timestamp_ids(self, text):
+        return re.sub(self.timestamp_pat, "", text)
 
+    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.decode
     def decode(
         self,
         token_ids,
@@ -727,7 +513,11 @@ def decode(
                 filtered_ids, time_precision=time_precision, skip_special_tokens=skip_special_tokens
             )
         else:
-            text = self._filter_timestamp_ids(text)
+            # Handle both single string and batch (list of strings) outputs
+            if isinstance(text, list):
+                text = [self._filter_timestamp_ids(t) for t in text]
+            else:
+                text = self._filter_timestamp_ids(text)
 
         # retrieve offsets
         if output_offsets:
@@ -736,57 +526,59 @@ def decode(
         return text
 
     def _decode(
-        self,
-        token_ids: Union[int, list[int]],
-        skip_special_tokens: bool = False,
-        normalize: bool = False,
-        basic_normalize: bool = False,
-        remove_diacritics: bool = False,
-        **kwargs,
+        self, *args, normalize: bool = False, basic_normalize: bool = False, remove_diacritics: bool = False, **kwargs
     ) -> str:
-        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
-        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
-
-        # To avoid mixing byte-level and unicode for byte-level BPT
-        # we need to build string separately for added tokens and byte-level tokens
-        # cf. https://github.com/huggingface/transformers/issues/1133
-        sub_texts = []
-        current_sub_text = []
-        for token in filtered_tokens:
-            if skip_special_tokens and token in self.all_special_ids:
-                continue
-            if token in self.added_tokens_encoder:
-                if current_sub_text:
-                    sub_texts.append(self.convert_tokens_to_string(current_sub_text))
-                    current_sub_text = []
-                sub_texts.append(token)
-            else:
-                current_sub_text.append(token)
-        if current_sub_text:
-            sub_texts.append(self.convert_tokens_to_string(current_sub_text))
-
-        text = "".join(sub_texts)
+        text = super()._decode(*args, **kwargs)
 
         if normalize:
-            clean_text = self.normalize(text)
+            clean_text = self._normalize(text)
             return clean_text
         elif basic_normalize:
-            clean_text = self.basic_normalize(text, remove_diacritics=remove_diacritics)
+            clean_text = self._basic_normalize(text, remove_diacritics=remove_diacritics)
             return clean_text
         else:
             return text
 
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.convert_tokens_to_string with GPT2 -> Whisper
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (string) in a single string."""
-        text = "".join(tokens)
-        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
-        return text
+    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._normalize
+    def _normalize(self, text):
+        warnings.warn(
+            "The private method `_normalize` is deprecated and will be removed in v5 of Transformers."
+            "You can normalize an input string using the Whisper English normalizer using the `normalize` method."
+        )
+        return self.normalize(text)
+
+    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._basic_normalize
+    def _basic_normalize(self, text, remove_diacritics=False):
+        warnings.warn(
+            "The private method `_basic_normalize` is deprecated and will be removed in v5 of Transformers."
+            "You can normalize an input string using the Whisper basic normalizer using the `basic_normalize` method."
+        )
+        return self.basic_normalize(text, remove_diacritics=remove_diacritics)
+
+    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.normalize
+    def normalize(self, text):
+        """
+        Normalize a given string using the `EnglishTextNormalizer` class, which performs commons transformation on
+        english text.
+        """
+        normalizer = EnglishTextNormalizer(self.english_spelling_normalizer)
+        return normalizer(text)
+
+    @staticmethod
+    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.basic_normalize
+    def basic_normalize(text, remove_diacritics=False):
+        """
+        Normalize a given string using the `BasicTextNormalizer` class, which performs commons transformation on
+        multilingual text.
+        """
+        normalizer = BasicTextNormalizer(remove_diacritics=remove_diacritics)
+        return normalizer(text)
 
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
         if not os.path.isdir(save_directory):
             logger.error(f"Vocabulary path ({save_directory}) should be a directory")
             return
+
         vocab_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
         )
@@ -798,20 +590,11 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
         )
 
         with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+            f.write(json.dumps(self._vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
 
-        index = 0
         with open(merge_file, "w", encoding="utf-8") as writer:
             writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!"
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
+            writer.writelines(" ".join(merge_pair) + "\n" for merge_pair in self._merges)
 
         if self.english_spelling_normalizer is not None:
             with open(normalizer_file, "w", encoding="utf-8") as f:
@@ -819,15 +602,123 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
                     json.dumps(self.english_spelling_normalizer, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
                 )
 
-        return vocab_file, merge_file, normalizer_file
+        return (vocab_file, merge_file, normalizer_file)
+
+    def set_prefix_tokens(
+        self, language: Optional[str] = None, task: Optional[str] = None, predict_timestamps: Optional[bool] = None
+    ):
+        """
+        Override the prefix tokens appended to the start of the label sequence. This method can be used standalone to
+        update the prefix tokens as required when fine-tuning. Example:
+
+        ```python
+        >>> # instantiate the tokenizer and set the prefix token to Spanish
+        >>> tokenizer = WhisperTokenizerFast.from_pretrained("openai/whisper-tiny", language="spanish")
+        >>> # now switch the prefix token from Spanish to French
+        >>> tokenizer.set_prefix_tokens(language="french")
+        ```
+
+        Args:
+            language (`str`, *optional*, defaults to `None`):
+                The language of the transcription text.
+            task (`str`, *optional*, defaults to `None`):
+                Task identifier to append at the start of sequence (if any).
+            predict_timestamps (`bool`, *optional*, defaults to `None`):
+                Whether to omit the `<|notimestamps|>` token at the start of the sequence.
+        """
+        self.language = language if language is not None else self.language
+        self.task = task if task is not None else self.task
+        self.predict_timestamps = predict_timestamps if predict_timestamps is not None else self.predict_timestamps
+
+        prefix_token_ids = self.prefix_tokens
+        prefixes = self.convert_ids_to_tokens(prefix_token_ids)
+        eos = self.eos_token
+        eos_token_id = self.eos_token_id
+        prefix_template = " ".join([f"{token}:0" for token in prefixes])
+        self.backend_tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"{prefix_template} $A:0 {eos}:0",
+            pair=f"{prefix_template} $A:0 $B:1 {eos}:1",
+            special_tokens=[
+                (eos, eos_token_id),
+                *zip(prefixes, prefix_token_ids),
+            ],
+        )
+
+    @property
+    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.prefix_tokens
+    def prefix_tokens(self) -> list[int]:
+        bos_token_id = self.convert_tokens_to_ids("<|startoftranscript|>")
+        translate_token_id = self.convert_tokens_to_ids("<|translate|>")
+        transcribe_token_id = self.convert_tokens_to_ids("<|transcribe|>")
+        notimestamps_token_id = self.convert_tokens_to_ids("<|notimestamps|>")
+        langs = tuple(LANGUAGES.keys())
+
+        if self.language is not None:
+            self.language = self.language.lower()
+            if self.language in TO_LANGUAGE_CODE:
+                language_id = TO_LANGUAGE_CODE[self.language]
+            elif self.language in TO_LANGUAGE_CODE.values():
+                language_id = self.language
+            else:
+                is_language_code = len(self.language) == 2
+                raise ValueError(
+                    f"Unsupported language: {self.language}. Language should be one of:"
+                    f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}."
+                )
+
+        if self.task is not None:
+            if self.task not in TASK_IDS:
+                raise ValueError(f"Unsupported task: {self.task}. Task should be in: {TASK_IDS}")
+
+        bos_sequence = [bos_token_id]
+        if self.language is not None:
+            bos_sequence.append(bos_token_id + 1 + langs.index(language_id))
+        if self.task is not None:
+            bos_sequence.append(transcribe_token_id if self.task == "transcribe" else translate_token_id)
+        if not self.predict_timestamps:
+            bos_sequence.append(notimestamps_token_id)
+        return bos_sequence
+
+    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.build_inputs_with_special_tokens
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> list[int]:
+        """Build model inputs from a sequence by appending eos_token_id."""
+        if token_ids_1 is None:
+            return self.prefix_tokens + token_ids_0 + [self.eos_token_id]
+        # We don't expect to process pairs, but leave the pair logic for API consistency
+        return self.prefix_tokens + token_ids_0 + token_ids_1 + [self.eos_token_id]
+
+    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.get_special_tokens_mask
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
 
-    # Copied from transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer.prepare_for_tokenization with GPT2 -> Whisper
-    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
-        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
-        if is_split_into_words or add_prefix_space:
-            text = " " + text
-        return (text, kwargs)
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
 
+        prefix_ones = [1] * len(self.prefix_tokens)
+        suffix_ones = [1]
+        if token_ids_1 is None:
+            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
+        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
+
+    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.get_decoder_prompt_ids
     def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
         self.set_prefix_tokens(task=task, language=language, predict_timestamps=not no_timestamps)
         # prefix tokens are of the form: <|startoftranscript|> <|lang_id|> <|task|> <|notimestamps|>
@@ -847,6 +738,7 @@ def _decode_asr(self, model_outputs, *, return_timestamps, return_language, time
             time_precision=time_precision,
         )
 
+    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.get_prompt_ids
     def get_prompt_ids(self, text: str, return_tensors="np"):
         """Converts prompt text to IDs that can be passed to [`~WhisperForConditionalGeneration.generate`]."""
         batch_encoding = self("<|startofprev|>", " " + text.strip(), add_special_tokens=False)
@@ -861,6 +753,7 @@ def get_prompt_ids(self, text: str, return_tensors="np"):
         batch_encoding.convert_to_tensors(tensor_type=return_tensors)
         return batch_encoding["input_ids"]
 
+    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._strip_prompt
     def _strip_prompt(self, token_ids: list[int], prompt_token_id: int, decoder_start_token_id: int):
         if not isinstance(token_ids, list):
             token_ids = self._convert_to_list(token_ids)
@@ -880,6 +773,7 @@ def _strip_prompt(self, token_ids: list[int], prompt_token_id: int, decoder_star
         return token_ids
 
     @staticmethod
+    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._convert_to_list
     def _convert_to_list(token_ids):
         # convert type to ndarray if necessary
         if hasattr(token_ids, "numpy"):
@@ -890,6 +784,152 @@ def _convert_to_list(token_ids):
         return token_ids
 
 
+def _combine_tokens_into_words(
+    tokenizer,
+    tokens: list[int],
+    language: Optional[str] = None,
+    prepend_punctuations: str = "\"'“¡¿([{-",
+    append_punctuations: str = "\"'.。,,!!??::”)]}、",
+):
+    """
+    Groups tokens by word. Returns a tuple containing a list of strings with the words, and a list of `token_id`
+    sequences with the tokens making up each word.
+    """
+    if language is None:
+        language = tokenizer.language
+    if language is None:
+        language = "english"
+
+    if language in {"chinese", "japanese", "thai", "lao", "myanmar", "cantonese"}:
+        # These languages don't typically use spaces.
+        words, word_tokens, token_indices = _split_tokens_on_unicode(tokenizer, tokens)
+    else:
+        words, word_tokens, token_indices = _split_tokens_on_spaces(tokenizer, tokens)
+
+    _merge_punctuations(words, word_tokens, token_indices, prepend_punctuations, append_punctuations)
+    return words, word_tokens, token_indices
+
+
+def _find_longest_common_sequence(sequences, token_timestamp_sequences=None):
+    # It would be much harder to do O(n) because of fault tolerance.
+    # We actually have a really good property which is that the total sequence
+    # MUST be those subsequences in order.
+    # If token_timestamp_sequences is provided, will split those sequences in
+    # exactly the same way.
+
+    left_sequence = sequences[0]
+    left_length = len(left_sequence)
+    total_sequence = []
+
+    if token_timestamp_sequences:
+        left_token_timestamp_sequence = token_timestamp_sequences[0]
+        total_token_timestamp_sequence = []
+
+    for seq_idx, right_sequence in enumerate(sequences[1:]):
+        # index = 0
+        max_ = 0.0
+        max_indices = (left_length, left_length, 0, 0)
+        # Here we're sliding matches
+        # [a, b, c, d]
+        #          [c, d, f]
+        # =        [c] == [d]
+        #
+        # [a, b, c, d]
+        #       [c, d, f]
+        # =     [c, d] == [c, d]
+        #
+        #
+        # [a, b, c, d]
+        #    [c, d, f]
+        #
+        # =  [b, c, d] == [c, d, f]
+        #
+        # [a, b, c, d]
+        # [c, d, f]
+        #
+        # [a, b, c] == [c, d, f]
+        #
+        # [a, b, c, d]
+        # [d, f]
+        #
+        # [a, b] == [d, f]
+        #
+        # [a, b, c, d]
+        # [f]
+        #
+        # [a] == [f]
+        right_length = len(right_sequence)
+        for i in range(1, left_length + right_length):
+            # epsilon to favor long perfect matches
+            eps = i / 10000.0
+
+            # Slightly convoluted because we don't want out of bound indices
+            # This will be necessary for a small conflict resolution optimization
+            # later
+            left_start = max(0, left_length - i)
+            left_stop = min(left_length, left_length + right_length - i)
+            left = np.array(left_sequence[left_start:left_stop])
+
+            right_start = max(0, i - left_length)
+            right_stop = min(right_length, i)
+            right = np.array(right_sequence[right_start:right_stop])
+
+            # We can only match subsequences of the same size.
+            if len(left) != len(right):
+                raise RuntimeError(
+                    "There is a bug within whisper `decode_asr` function, please report it. Dropping to prevent bad inference."
+                )
+
+            if token_timestamp_sequences:
+                # Get length of longest subsequence of tokens that match
+                # and have timestamps that are in order
+                matches = sum(
+                    1
+                    for idx, elem in enumerate(left)
+                    if (
+                        elem == right[idx]
+                        and left_token_timestamp_sequence[left_start + idx]
+                        <= token_timestamp_sequences[seq_idx + 1][right_start + idx]
+                    )
+                )
+
+            else:
+                matches = np.sum(left == right)
+
+            matching = matches / i + eps
+            if matches > 1 and matching > max_:
+                max_ = matching
+                max_indices = (left_start, left_stop, right_start, right_stop)
+
+        (left_start, left_stop, right_start, right_stop) = max_indices
+
+        # This is a small conflict optimization since those sequences overlap
+        # in audio.
+        # We're going to give more confidence to the left sequence
+        # for the left of the overlap,
+        # and to the right of the sequence, for the right of the overlap
+        left_mid = (left_stop + left_start) // 2
+        right_mid = (right_stop + right_start) // 2
+        total_sequence.extend(left_sequence[:left_mid])
+        left_sequence = right_sequence[right_mid:]
+        left_length = len(left_sequence)
+
+        if token_timestamp_sequences:
+            total_token_timestamp_sequence.extend(left_token_timestamp_sequence[:left_mid])
+            left_token_timestamp_sequence = token_timestamp_sequences[seq_idx + 1][right_mid:]
+
+    total_sequence.extend(left_sequence)
+
+    if token_timestamp_sequences is None:
+        return total_sequence
+
+    if len(token_timestamp_sequences) > 0:
+        total_token_timestamp_sequence.extend(left_token_timestamp_sequence)
+        return total_sequence, total_token_timestamp_sequence
+    else:
+        return total_sequence, []
+
+
 def _decode_asr(tokenizer, model_outputs, *, return_timestamps, return_language, time_precision, segment_size=1500):
     """
     Internal method meant to only be used by asr pipeline. Handles all the little quirks specific to whisper to handle
diff --git a/src/transformers/models/whisper/tokenization_whisper_fast.py b/src/transformers/models/whisper/tokenization_whisper_fast.py
deleted file mode 100644
index 904f099243f9..000000000000
--- a/src/transformers/models/whisper/tokenization_whisper_fast.py
+++ /dev/null
@@ -1,617 +0,0 @@
-# coding=utf-8
-# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for Whisper."""
-
-import json
-import os
-import re
-from functools import lru_cache
-from typing import Optional
-
-import numpy as np
-from tokenizers import AddedToken, processors
-
-from ...tokenization_utils_base import BatchEncoding
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import logging
-from .english_normalizer import BasicTextNormalizer, EnglishTextNormalizer
-from .tokenization_whisper import LANGUAGES, TASK_IDS, TO_LANGUAGE_CODE, WhisperTokenizer, _decode_asr
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {
-    "vocab_file": "vocab.json",
-    "tokenizer_file": "tokenizer.json",
-    "merges_file": "merges.txt",
-    "normalizer_file": "normalizer.json",
-}
-
-
-class WhisperTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" Whisper tokenizer (backed by HuggingFace's *tokenizers* library).
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`, *optional*):
-            Path to the vocabulary file.
-        merges_file (`str`, *optional*):
-            Path to the merges file.
-        normalizer_file (`str`, *optional*):
-            Path to the normalizer_file file.
-        tokenizer_file (`str`, *optional*):
-            Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
-            contains everything needed to load the tokenizer.
-        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The beginning of sequence token. The `decoder_start_token_id` is used to set the first token as
-            `"<|startoftranscript|>"` when generating.
-        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-            The end of sequence token.
-        add_prefix_space (`bool`, *optional*, defaults to `False`):
-            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
-            other word. (Whisper tokenizer detect beginning of words by the preceding space).
-        language (`str`, *optional*):
-            The language of the transcription text. The corresponding language id token is appended to the start of the
-            sequence for multilingual speech recognition and speech translation tasks, e.g. for Spanish the token
-            `"<|es|>"` is appended to the start of sequence. This should be used for multilingual fine-tuning only.
-        task (`str`, *optional*):
-            Task identifier to append at the start of sequence (if any). This should be used for mulitlingual
-            fine-tuning, with `"transcribe"` for speech recognition and `"translate"` for speech translation.
-        predict_timestamps (`bool`, *optional*, defaults to `False`):
-            Whether to omit the `<|notimestamps|>` token at the start of the sequence.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = WhisperTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        merges_file=None,
-        normalizer_file=None,
-        tokenizer_file=None,
-        unk_token="<|endoftext|>",
-        bos_token="<|endoftext|>",
-        eos_token="<|endoftext|>",
-        add_prefix_space=False,
-        language=None,
-        task=None,
-        predict_timestamps=False,
-        **kwargs,
-    ):
-        bos_token = (
-            AddedToken(bos_token, lstrip=False, rstrip=False, normalized=False, special=True)
-            if isinstance(bos_token, str)
-            else bos_token
-        )
-        eos_token = (
-            AddedToken(eos_token, lstrip=False, rstrip=False, normalized=False, special=True)
-            if isinstance(eos_token, str)
-            else eos_token
-        )
-        unk_token = (
-            AddedToken(unk_token, lstrip=False, rstrip=False, normalized=False, special=True)
-            if isinstance(unk_token, str)
-            else unk_token
-        )
-
-        super().__init__(
-            vocab_file,
-            merges_file,
-            tokenizer_file=tokenizer_file,
-            unk_token=unk_token,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            add_prefix_space=add_prefix_space,
-            **kwargs,
-        )
-
-        self.add_bos_token = kwargs.pop("add_bos_token", False)
-
-        if normalizer_file is not None:
-            with open(normalizer_file, encoding="utf-8") as vocab_handle:
-                self.english_spelling_normalizer = json.load(vocab_handle)
-        else:
-            self.english_spelling_normalizer = None
-
-        self.timestamp_pat = re.compile(r"<\|(\d+\.\d+)\|>")
-
-        self.language = language
-        self.task = task
-        self.predict_timestamps = predict_timestamps
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast._batch_encode_plus
-    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-        assert self.add_prefix_space or not is_split_into_words, (
-            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
-            "to use it with pretokenized inputs."
-        )
-
-        return super()._batch_encode_plus(*args, **kwargs)
-
-    # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast._encode_plus
-    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        is_split_into_words = kwargs.get("is_split_into_words", False)
-
-        assert self.add_prefix_space or not is_split_into_words, (
-            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
-            "to use it with pretokenized inputs."
-        )
-
-        return super()._encode_plus(*args, **kwargs)
-
-    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._decode_with_timestamps
-    def _decode_with_timestamps(
-        self, token_ids, skip_special_tokens=False, time_precision=0.02, segment_size=1500
-    ) -> str:
-        """
-        Timestamp tokens are above the special tokens' id range and are ignored by `decode()`. This method decodes
-        given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
-        """
-        timestamp_begin = self.all_special_ids[-1] + 1
-        outputs = [[]]
-
-        cur_max_timestamp = 0.0
-        prev_segments_len = 0.0
-        penultimate_timestamp = 0.0
-
-        for i, token in enumerate(token_ids):
-            if token >= timestamp_begin:
-                timestamp = float((token - timestamp_begin) * time_precision)
-
-                if timestamp < cur_max_timestamp:
-                    # next segment has started
-                    last_was_single_ending = i >= 2 and not (
-                        token_ids[i - 1] >= timestamp_begin and token_ids[i - 2] >= timestamp_begin
-                    )
-                    if last_was_single_ending:
-                        prev_segments_len += time_precision * segment_size
-                    else:
-                        cur_max_timestamp = penultimate_timestamp
-                        prev_segments_len += penultimate_timestamp
-                        outputs = outputs[:-2]
-
-                penultimate_timestamp = cur_max_timestamp
-                cur_max_timestamp = timestamp
-
-                outputs.append(f"<|{(timestamp + prev_segments_len):.2f}|>")
-                outputs.append([])
-            else:
-                outputs[-1].append(token)
-        outputs = [
-            s if isinstance(s, str) else self.decode(s, skip_special_tokens=skip_special_tokens) for s in outputs
-        ]
-        return "".join(outputs)
-
-    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._compute_offsets
-    def _compute_offsets(self, token_ids, time_precision=0.02, segment_size=1500):
-        """
-        Compute offsets for a given tokenized input
-
-        Args:
-            token_ids (`Union[int, list[int], np.ndarray, torch.Tensor]`):
-                List of tokenized input ids. Can be obtained using the `__call__` method.
-            time_precision (`float`, *optional*, defaults to 0.02):
-                The time ratio to convert from token to time.
-            segment_size (`int`, *optional*, defaults to 1500):
-                The number of features in the input mel spectrogram.
-        """
-        offsets = []
-        # ensure torch tensor of token ids is placed on cpu
-        if "torch" in str(type(token_ids)) and (hasattr(token_ids, "cpu") and callable(token_ids.cpu)):
-            token_ids = token_ids.cpu()
-        token_ids = np.array(token_ids)
-        if token_ids.shape[0] > 1 and len(token_ids.shape) > 1:
-            raise ValueError("Can only process a single input at a time")
-        timestamp_begin = self.all_special_ids[-1] + 1
-        timestamp_tokens = token_ids >= timestamp_begin
-
-        consecutive = np.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] + 1
-        if consecutive.shape[0] == 0 and timestamp_tokens.sum() <= 1:
-            # either there are no timestamps or there are no consecutive ones
-            return []
-        elif np.where(timestamp_tokens)[0][-1] + 1 not in consecutive:
-            # we add the final timestamp if it is not already in the list
-            consecutive = np.append(consecutive, np.where(timestamp_tokens)[0][-1] + 1)
-
-        last_slice = np.where(timestamp_tokens)[0][0]
-        cur_max_timestamp = 0
-        prev_segments_len = 0
-        for current_slice in consecutive:
-            sliced_tokens = token_ids[last_slice:current_slice]
-            if len(sliced_tokens) > 1:
-                start_timestamp_position = sliced_tokens[0].item() - timestamp_begin
-                end_timestamp_position = sliced_tokens[-1].item() - timestamp_begin
-
-                if start_timestamp_position < cur_max_timestamp:
-                    # next segment has started
-                    is_single_ending = last_slice >= 2 and not (
-                        token_ids[last_slice - 2] >= timestamp_begin and token_ids[last_slice - 1] >= timestamp_begin
-                    )
-                    if is_single_ending:
-                        prev_segments_len += segment_size
-                    else:
-                        prev_segments_len += cur_max_timestamp
-
-                cur_max_timestamp = end_timestamp_position
-
-                # strip timestamp tokens from the text output
-                sliced_tokens = self._preprocess_token_ids(sliced_tokens)
-                text = self._decode(sliced_tokens)
-                text = self._filter_timestamp_ids(text)
-                offsets.append(
-                    {
-                        "text": text,
-                        "timestamp": (
-                            start_timestamp_position * time_precision + prev_segments_len * time_precision,
-                            end_timestamp_position * time_precision + prev_segments_len * time_precision,
-                        ),
-                    }
-                )
-            last_slice = current_slice
-
-        return offsets
-
-    @lru_cache
-    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.timestamp_ids
-    def timestamp_ids(self, time_precision=0.02):
-        """
-        Compute the timestamp token ids for a given precision and save to least-recently used (LRU) cache.
-
-        Args:
-            time_precision (`float`, *optional*, defaults to 0.02):
-                The time ratio to convert from token to time.
-        """
-        return self.convert_tokens_to_ids([("<|%.2f|>" % (i * time_precision)) for i in range(1500 + 1)])
-
-    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._preprocess_token_ids
-    def _preprocess_token_ids(self, token_ids, skip_special_tokens: bool = False):
-        """
-        Pre-process the token ids for decoding by removing the prompt tokens ids and timestamp token ids.
-
-        Args:
-            token_ids (`Union[int, list[int], np.ndarray, torch.Tensor]`):
-                List of tokenized input ids. Typically, obtained using the `__call__` method of the tokenizer.
-            skip_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not to remove special tokens from the token ids. If `True`, the prompt token ids will be
-                removed.
-        """
-        if skip_special_tokens:
-            prompt_token_id = self.convert_tokens_to_ids("<|startofprev|>")
-            decoder_start_token_id = self.convert_tokens_to_ids("<|startoftranscript|>")
-            token_ids = self._strip_prompt(token_ids, prompt_token_id, decoder_start_token_id)
-
-        return token_ids
-
-    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._filter_timestamp_ids
-    def _filter_timestamp_ids(self, token_ids):
-        return re.sub(self.timestamp_pat, "", token_ids)
-
-    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.decode
-    def decode(
-        self,
-        token_ids,
-        skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: Optional[bool] = None,
-        output_offsets: bool = False,
-        time_precision: float = 0.02,
-        decode_with_timestamps: bool = False,
-        normalize: bool = False,
-        basic_normalize: bool = False,
-        remove_diacritics: bool = False,
-        **kwargs,
-    ) -> str:
-        """
-        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
-        tokens and clean up tokenization spaces.
-
-        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
-
-        Args:
-            token_ids (`Union[int, list[int], np.ndarray, torch.Tensor]`):
-                List of tokenized input ids. Can be obtained using the `__call__` method.
-            skip_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not to remove special tokens in the decoding. Will remove the previous tokens (pre-prompt)
-                if present.
-            clean_up_tokenization_spaces (`bool`, *optional*):
-                Whether or not to clean up the tokenization spaces. If `None`, will default to
-                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
-            output_offsets (`bool`, *optional*, defaults to `False`):
-                Whether or not to output the offsets of the tokens. This should only be set if the model predicted
-                timestamps. If there are previous tokens (pre-prompt) to decode, they will only appear in the decoded
-                text if they contain timestamp tokens.
-            time_precision (`float`, *optional*, defaults to 0.02):
-                The time ratio to convert from token to time.
-            decode_with_timestamps (`bool`, *optional*, defaults to `False`):
-                Whether or not to decode with timestamps included in the raw text.
-            normalize (`bool`, *optional*, defaults to `False`):
-                Whether or not to apply the English text normalizer to the decoded text. Only applicable when the
-                target text is in English. Otherwise, the basic text normalizer should be applied.
-            basic_normalize (`bool`, *optional*, defaults to `False`):
-                Whether or not to apply the Basic text normalizer to the decoded text. Applicable to multilingual
-                target text.
-            remove_diacritics (`bool`, *optional*, defaults to `False`):
-                Whether or not to remove diacritics when applying the Basic text normalizer. Removing diacritics may
-                destroy information in the decoded text, hence it should be used with caution.
-            kwargs (additional keyword arguments, *optional*):
-                Will be passed to the underlying model specific decode method.
-        Returns:
-            `str`: The decoded sentence.
-        """
-        filtered_ids = self._preprocess_token_ids(
-            token_ids,
-            skip_special_tokens=skip_special_tokens,
-        )
-
-        text = super().decode(
-            filtered_ids,
-            skip_special_tokens=skip_special_tokens,
-            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-            normalize=normalize,
-            basic_normalize=basic_normalize,
-            remove_diacritics=remove_diacritics,
-            **kwargs,
-        )
-        if decode_with_timestamps:
-            # legacy method to decode timestamps when not included in the tokenizer vocabulary
-            text = self._decode_with_timestamps(
-                filtered_ids, time_precision=time_precision, skip_special_tokens=skip_special_tokens
-            )
-        else:
-            text = self._filter_timestamp_ids(text)
-
-        # retrieve offsets
-        if output_offsets:
-            offsets = self._compute_offsets(token_ids, time_precision=time_precision)
-            return {"text": text, "offsets": offsets}
-        return text
-
-    def _decode(
-        self, *args, normalize: bool = False, basic_normalize: bool = False, remove_diacritics: bool = False, **kwargs
-    ) -> str:
-        text = super()._decode(*args, **kwargs)
-
-        if normalize:
-            clean_text = self.normalize(text)
-            return clean_text
-        elif basic_normalize:
-            clean_text = self.basic_normalize(text, remove_diacritics=remove_diacritics)
-            return clean_text
-        else:
-            return text
-
-    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.normalize
-    def normalize(self, text):
-        """
-        Normalize a given string using the `EnglishTextNormalizer` class, which performs commons transformation on
-        english text.
-        """
-        normalizer = EnglishTextNormalizer(self.english_spelling_normalizer)
-        return normalizer(text)
-
-    @staticmethod
-    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.basic_normalize
-    def basic_normalize(text, remove_diacritics=False):
-        """
-        Normalize a given string using the `BasicTextNormalizer` class, which performs commons transformation on
-        multilingual text.
-        """
-        normalizer = BasicTextNormalizer(remove_diacritics=remove_diacritics)
-        return normalizer(text)
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-
-        normalizer_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["normalizer_file"]
-        )
-
-        if self.english_spelling_normalizer is not None:
-            with open(normalizer_file, "w", encoding="utf-8") as f:
-                f.write(
-                    json.dumps(self.english_spelling_normalizer, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
-                )
-
-        return tuple(files) + (normalizer_file,)
-
-    def set_prefix_tokens(
-        self, language: Optional[str] = None, task: Optional[str] = None, predict_timestamps: Optional[bool] = None
-    ):
-        """
-        Override the prefix tokens appended to the start of the label sequence. This method can be used standalone to
-        update the prefix tokens as required when fine-tuning. Example:
-
-        ```python
-        >>> # instantiate the tokenizer and set the prefix token to Spanish
-        >>> tokenizer = WhisperTokenizerFast.from_pretrained("openai/whisper-tiny", language="spanish")
-        >>> # now switch the prefix token from Spanish to French
-        >>> tokenizer.set_prefix_tokens(language="french")
-        ```
-
-        Args:
-            language (`str`, *optional*, defaults to `None`):
-                The language of the transcription text.
-            task (`str`, *optional*, defaults to `None`):
-                Task identifier to append at the start of sequence (if any).
-            predict_timestamps (`bool`, *optional*, defaults to `None`):
-                Whether to omit the `<|notimestamps|>` token at the start of the sequence.
-        """
-        self.language = language if language is not None else self.language
-        self.task = task if task is not None else self.task
-        self.predict_timestamps = predict_timestamps if predict_timestamps is not None else self.predict_timestamps
-
-        prefix_token_ids = self.prefix_tokens
-        prefixes = self.convert_ids_to_tokens(prefix_token_ids)
-        eos = self.eos_token
-        eos_token_id = self.eos_token_id
-        prefix_template = " ".join([f"{token}:0" for token in prefixes])
-        self.backend_tokenizer.post_processor = processors.TemplateProcessing(
-            single=f"{prefix_template} $A:0 {eos}:0",
-            pair=f"{prefix_template} $A:0 $B:1 {eos}:1",
-            special_tokens=[
-                (eos, eos_token_id),
-                *zip(prefixes, prefix_token_ids),
-            ],
-        )
-
-    @property
-    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.prefix_tokens
-    def prefix_tokens(self) -> list[int]:
-        bos_token_id = self.convert_tokens_to_ids("<|startoftranscript|>")
-        translate_token_id = self.convert_tokens_to_ids("<|translate|>")
-        transcribe_token_id = self.convert_tokens_to_ids("<|transcribe|>")
-        notimestamps_token_id = self.convert_tokens_to_ids("<|notimestamps|>")
-        langs = tuple(LANGUAGES.keys())
-
-        if self.language is not None:
-            self.language = self.language.lower()
-            if self.language in TO_LANGUAGE_CODE:
-                language_id = TO_LANGUAGE_CODE[self.language]
-            elif self.language in TO_LANGUAGE_CODE.values():
-                language_id = self.language
-            else:
-                is_language_code = len(self.language) == 2
-                raise ValueError(
-                    f"Unsupported language: {self.language}. Language should be one of:"
-                    f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}."
-                )
-
-        if self.task is not None:
-            if self.task not in TASK_IDS:
-                raise ValueError(f"Unsupported task: {self.task}. Task should be in: {TASK_IDS}")
-
-        bos_sequence = [bos_token_id]
-        if self.language is not None:
-            bos_sequence.append(bos_token_id + 1 + langs.index(language_id))
-        if self.task is not None:
-            bos_sequence.append(transcribe_token_id if self.task == "transcribe" else translate_token_id)
-        if not self.predict_timestamps:
-            bos_sequence.append(notimestamps_token_id)
-        return bos_sequence
-
-    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.build_inputs_with_special_tokens
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> list[int]:
-        """Build model inputs from a sequence by appending eos_token_id."""
-        if token_ids_1 is None:
-            return self.prefix_tokens + token_ids_0 + [self.eos_token_id]
-        # We don't expect to process pairs, but leave the pair logic for API consistency
-        return self.prefix_tokens + token_ids_0 + token_ids_1 + [self.eos_token_id]
-
-    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.get_special_tokens_mask
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        prefix_ones = [1] * len(self.prefix_tokens)
-        suffix_ones = [1]
-        if token_ids_1 is None:
-            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
-        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
-
-    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.get_decoder_prompt_ids
-    def get_decoder_prompt_ids(self, task=None, language=None, no_timestamps=True):
-        self.set_prefix_tokens(task=task, language=language, predict_timestamps=not no_timestamps)
-        # prefix tokens are of the form: <|startoftranscript|> <|lang_id|> <|task|> <|notimestamps|>
-        # we don't want to force the bos token at position 1, as this is the starting token
-        # when we generate, so we slice the prefix tokens to: <|lang_id|> <|task|> <|notimestamps|>
-        # to get the forced tokens
-        forced_tokens = self.prefix_tokens[1:]
-        forced_decoder_ids = [(rank + 1, token) for rank, token in enumerate(forced_tokens)]
-        return forced_decoder_ids
-
-    def _decode_asr(self, model_outputs, *, return_timestamps, return_language, time_precision):
-        return _decode_asr(
-            self,
-            model_outputs,
-            return_timestamps=return_timestamps,
-            return_language=return_language,
-            time_precision=time_precision,
-        )
-
-    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer.get_prompt_ids
-    def get_prompt_ids(self, text: str, return_tensors="np"):
-        """Converts prompt text to IDs that can be passed to [`~WhisperForConditionalGeneration.generate`]."""
-        batch_encoding = self("<|startofprev|>", " " + text.strip(), add_special_tokens=False)
-
-        # Check for special tokens
-        prompt_text_ids = batch_encoding["input_ids"][1:]
-        special_token_id = next((x for x in prompt_text_ids if x >= self.all_special_ids[0]), None)
-        if special_token_id is not None:
-            token = self.convert_ids_to_tokens(special_token_id)
-            raise ValueError(f"Encountered text in the prompt corresponding to disallowed special token: {token}.")
-
-        batch_encoding.convert_to_tensors(tensor_type=return_tensors)
-        return batch_encoding["input_ids"]
-
-    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._strip_prompt
-    def _strip_prompt(self, token_ids: list[int], prompt_token_id: int, decoder_start_token_id: int):
-        if not isinstance(token_ids, list):
-            token_ids = self._convert_to_list(token_ids)
-
-        # handle case of empty token_ids for decoding with timestamps.
-        # at this point token_ids is a list, so it is safe to use if not check.
-        if not token_ids:
-            return token_ids
-
-        has_prompt = token_ids[0] == prompt_token_id
-        if has_prompt:
-            if decoder_start_token_id in token_ids:
-                return token_ids[token_ids.index(decoder_start_token_id) :]
-            else:
-                return []
-
-        return token_ids
-
-    @staticmethod
-    # Copied from transformers.models.whisper.tokenization_whisper.WhisperTokenizer._convert_to_list
-    def _convert_to_list(token_ids):
-        # convert type to ndarray if necessary
-        if hasattr(token_ids, "numpy"):
-            token_ids = token_ids.cpu().numpy()
-        # now the token ids are either a numpy array, or a list of lists
-        if isinstance(token_ids, np.ndarray):
-            token_ids = token_ids.tolist()
-        return token_ids
-
-
-__all__ = ["WhisperTokenizerFast"]
diff --git a/src/transformers/models/xglm/__init__.py b/src/transformers/models/xglm/__init__.py
index 363babae7e6c..33e620386f46 100644
--- a/src/transformers/models/xglm/__init__.py
+++ b/src/transformers/models/xglm/__init__.py
@@ -21,7 +21,6 @@
     from .configuration_xglm import *
     from .modeling_xglm import *
     from .tokenization_xglm import *
-    from .tokenization_xglm_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/xglm/tokenization_xglm.py b/src/transformers/models/xglm/tokenization_xglm.py
index 9e0a8706683f..e25ec6c16009 100644
--- a/src/transformers/models/xglm/tokenization_xglm.py
+++ b/src/transformers/models/xglm/tokenization_xglm.py
@@ -12,291 +12,123 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization classes for ."""
+"""Tokenization classes for XGLM."""
 
-import os
-from shutil import copyfile
-from typing import Any, Optional
+from typing import Optional
 
-import sentencepiece as spm
+from tokenizers import Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
+from tokenizers.models import Unigram
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
-from ...utils.import_utils import requires
 
 
 logger = logging.get_logger(__name__)
 
-SPIECE_UNDERLINE = "▁"
+VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}
 
-VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
 
-
-@requires(backends=("sentencepiece",))
-class XGLMTokenizer(PreTrainedTokenizer):
+class XGLMTokenizer(TokenizersBackend):
     """
-    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
-    [SentencePiece](https://github.com/google/sentencepiece).
+    Construct a XGLM tokenizer (backed by HuggingFace's tokenizers library). Based on BPE.
 
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
+        tokenizer_file (`str`, *optional*):
+            Path to a tokenizers JSON file containing the serialization of a tokenizer.
         bos_token (`str`, *optional*, defaults to `""`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
         eos_token (`str`, *optional*, defaults to `""`):
             The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
         sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
+            The separator token, which is used when building a sequence from multiple sequences.
         cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+            The classifier token which is used when doing sequence classification.
         unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
+            The unknown token.
         pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        sp_model_kwargs (`dict`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
-            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
-            to set:
-
-            - `enable_sampling`: Enable subword regularization.
-            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
-
-              - `nbest_size = {0,1}`: No sampling is performed.
-              - `nbest_size > 1`: samples from the nbest_size results.
-              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
-                using forward-filtering-and-backward-sampling algorithm.
-
-            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
-              BPE-dropout.
-
-    Attributes:
-        sp_model (`SentencePieceProcessor`):
-            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
+            The token used for padding.
+        vocab (`dict`, *optional*):
+            Custom vocabulary dictionary. If not provided, a minimal vocabulary is created.
+        merges (`list[tuple[str, str]]`, *optional*):
+            Custom merge rules for BPE. If not provided, merges are generated from the vocabulary.
+        add_prefix_space (`bool`, *optional*, defaults to `True`):
+            Whether to add a prefix space before encoding.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
 
     def __init__(
         self,
-        vocab_file,
-        bos_token="",
-        eos_token="",
-        sep_token="",
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        bos_token: str = "",
+        eos_token: str = "",
+        sep_token: str = "",
+        cls_token: str = "",
+        unk_token: str = "",
+        pad_token: str = "",
+        vocab: Optional[dict] = None,
+        merges: Optional[list[tuple[str, str]]] = None,
+        add_prefix_space: bool = True,
         **kwargs,
-    ) -> None:
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-
-        # Compatibility with the original tokenizer
+    ):
         self.num_madeup_words = 7
         madeup_words = [f"" for i in range(self.num_madeup_words)]
-
         kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
         kwargs["additional_special_tokens"] += [
             word for word in madeup_words if word not in kwargs["additional_special_tokens"]
         ]
 
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(str(vocab_file))
-        self.vocab_file = vocab_file
-
-        # Original fairseq vocab and spm vocab must be "aligned":
-        # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
-        # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
-        # fairseq  | ''   | '' | '' | '' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
-        # spm      | '' | ''   | '' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'
-
-        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
-        self.fairseq_offset = 1
-
-        # Mimic fairseq token-to-id alignment for the first 4 token
-        self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3}
-
-        sp_size = len(self.sp_model)
-        madeup_words = {f"": sp_size + i + self.fairseq_offset for i in range(self.num_madeup_words)}
-        self.fairseq_tokens_to_ids.update(madeup_words)
+        self.add_prefix_space = add_prefix_space
+
+        if vocab is not None:
+            self._vocab = vocab
+        else:
+            self._vocab = [
+                (str(bos_token), 0.0),
+                (str(pad_token), 0.0),
+                (str(eos_token), 0.0),
+                (str(unk_token), 0.0),
+            ]
+
+        self._tokenizer = Tokenizer(Unigram(vocab=self._vocab, unk_id=3, byte_fallback=False))
+
+        self._tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.Replace(Regex(r"[\n\r\t]"), " "),
+                normalizers.NFKC(),
+                normalizers.Replace(Regex(r" {2,}"), " "),
+            ]
+        )
+        prepend_scheme = "always" if add_prefix_space else "never"
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)
+        self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)
 
-        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+        tokenizer_object = self._tokenizer
 
         super().__init__(
+            tokenizer_object=tokenizer_object,
             bos_token=bos_token,
             eos_token=eos_token,
-            unk_token=unk_token,
             sep_token=sep_token,
             cls_token=cls_token,
+            unk_token=unk_token,
             pad_token=pad_token,
-            sp_model_kwargs=self.sp_model_kwargs,
+            add_prefix_space=add_prefix_space,
             **kwargs,
         )
 
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__ = d
-
-        # for backward compatibility
-        if not hasattr(self, "sp_model_kwargs"):
-            self.sp_model_kwargs = {}
-
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An XLM-RoBERTa sequence has the following format:
-
-        - single sequence: ` X `
-        - pair of sequences: ` A  B `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-
-        if token_ids_1 is None:
-            return [self.sep_token_id] + token_ids_0
-        sep = [self.sep_token_id]
-        return sep + token_ids_0 + sep + sep + token_ids_1
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0))
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1))
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
-        not make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-
-        """
-
-        sep = [self.sep_token_id]
-
-        if token_ids_1 is None:
-            return len(sep + token_ids_0) * [0]
-        return len(sep + token_ids_0 + sep + sep + token_ids_1) * [0]
-
-    @property
-    def vocab_size(self):
-        return len(self.sp_model) + self.fairseq_offset + self.num_madeup_words
-
-    def get_vocab(self):
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def _tokenize(self, text: str) -> list[str]:
-        return self.sp_model.encode(text, out_type=str)
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        if token in self.fairseq_tokens_to_ids:
-            return self.fairseq_tokens_to_ids[token]
-        spm_id = self.sp_model.PieceToId(token)
-
-        # Need to return unknown token if the SP model returned 0
-        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        if index in self.fairseq_ids_to_tokens:
-            return self.fairseq_ids_to_tokens[index]
-        return self.sp_model.IdToPiece(index - self.fairseq_offset)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (strings for sub-words) in a single string."""
-        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
-        return out_string
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"{self.eos_token} $A {self.eos_token}",
+            pair=f"{self.eos_token} $A {self.eos_token} {self.eos_token} $B {self.eos_token}",
+            special_tokens=[
+                (self.bos_token, self.bos_token_id),
+                (self.eos_token, self.eos_token_id),
+            ],
         )
 
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (out_vocab_file,)
-
 
 __all__ = ["XGLMTokenizer"]
diff --git a/src/transformers/models/xglm/tokenization_xglm_fast.py b/src/transformers/models/xglm/tokenization_xglm_fast.py
deleted file mode 100644
index a9c8b3aac257..000000000000
--- a/src/transformers/models/xglm/tokenization_xglm_fast.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# coding=utf-8
-# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for XGLM."""
-
-import os
-from shutil import copyfile
-from typing import Optional
-
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import is_sentencepiece_available, logging
-
-
-if is_sentencepiece_available():
-    from .tokenization_xglm import XGLMTokenizer
-else:
-    XGLMTokenizer = None
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
-
-
-class XGLMTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" XGLM tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from [`RobertaTokenizer`]
-    and [`XLNetTokenizer`]. Based on
-    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        bos_token (`str`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        additional_special_tokens (`list[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`):
-            Additional special tokens used by the tokenizer.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = XGLMTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        bos_token="",
-        eos_token="",
-        sep_token="",
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        **kwargs,
-    ):
-        # Compatibility with the original tokenizer
-        self.num_madeup_words = 7
-        madeup_words = [f"" for i in range(self.num_madeup_words)]
-
-        kwargs["additional_special_tokens"] = kwargs.get("additional_special_tokens", []) or []
-        kwargs["additional_special_tokens"] += [
-            word for word in madeup_words if word not in kwargs["additional_special_tokens"]
-        ]
-
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            **kwargs,
-        )
-
-        self.vocab_file = vocab_file
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An XLM-RoBERTa sequence has the following format:
-
-        - single sequence: ` X `
-        - pair of sequences: ` A  B `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-
-        if token_ids_1 is None:
-            return [self.sep_token_id] + token_ids_0
-        sep = [self.sep_token_id]
-        return sep + token_ids_0 + sep + sep + token_ids_1
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
-        not make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-
-        """
-
-        sep = [self.sep_token_id]
-
-        if token_ids_1 is None:
-            return len(sep + token_ids_0) * [0]
-        return len(sep + token_ids_0 + sep + sep + token_ids_1) * [0]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not self.can_save_slow_tokenizer:
-            raise ValueError(
-                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
-                "tokenizer."
-            )
-
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
-
-
-__all__ = ["XGLMTokenizerFast"]
diff --git a/src/transformers/models/xlm/tokenization_xlm.py b/src/transformers/models/xlm/tokenization_xlm.py
index 8c4471a38436..9616995fd0dc 100644
--- a/src/transformers/models/xlm/tokenization_xlm.py
+++ b/src/transformers/models/xlm/tokenization_xlm.py
@@ -21,7 +21,7 @@
 import unicodedata
 from typing import Optional
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_python import PreTrainedTokenizer
 from ...utils import logging
 
 
diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
index 149a09f5ed61..f06156f5556a 100644
--- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
@@ -12,290 +12,118 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License
-"""Tokenization classes for XLM-RoBERTa model."""
+"""Tokenization classes for XLM-RoBERTa model (Tokenizers backend)."""
 
-import os
-from shutil import copyfile
-from typing import Any, Optional
+from typing import Optional
 
-import sentencepiece as spm
+from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers, processors
+from tokenizers.models import Unigram
 
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_utils_tokenizers import TokenizersBackend
 from ...utils import logging
-from ...utils.import_utils import requires
 
 
 logger = logging.get_logger(__name__)
 
-SPIECE_UNDERLINE = "▁"
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
 
-VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
 
+class XLMRobertaTokenizer(TokenizersBackend):
+    r"""
+    Construct an XLM-RoBERTa tokenizer (backed by HuggingFace's tokenizers library). Based on SentencePiece.
 
-@requires(backends=("sentencepiece",))
-class XLMRobertaTokenizer(PreTrainedTokenizer):
-    """
-    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
-    [SentencePiece](https://github.com/google/sentencepiece).
-
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should refer to
     this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        bos_token (`str`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        mask_token (`str`, *optional*, defaults to `""`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        sp_model_kwargs (`dict`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
-            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
-            to set:
-
-            - `enable_sampling`: Enable subword regularization.
-            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
-
-              - `nbest_size = {0,1}`: No sampling is performed.
-              - `nbest_size > 1`: samples from the nbest_size results.
-              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
-                using forward-filtering-and-backward-sampling algorithm.
-
-            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
-              BPE-dropout.
-
-    Attributes:
-        sp_model (`SentencePieceProcessor`):
-            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
+        vocab_file (`str`, optional): Path to the vocabulary file.
+        merges_file (`str`, optional): Path to the merges file.
+        tokenizer_file (`str`, optional): Path to a tokenizers JSON file containing the serialization of a tokenizer.
+        bos_token (`str`, optional, defaults to `""`): The beginning of sequence token.
+        eos_token (`str`, optional, defaults to `""`): The end of sequence token.
+        sep_token (`str`, optional, defaults to `""`): The separator token.
+        cls_token (`str`, optional, defaults to `""`): The classifier token.
+        unk_token (`str`, optional, defaults to `""`): The unknown token.
+        pad_token (`str`, optional, defaults to `""`): The padding token.
+        mask_token (`str`, optional, defaults to `""`): The mask token.
+        add_prefix_space (`bool`, optional, defaults to `True`): Whether to add an initial space.
+        vocab (`dict`, optional): Custom vocabulary dictionary.
+        merges (`list`, optional): Custom merges list.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
     model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = None
 
     def __init__(
         self,
-        vocab_file,
-        bos_token="",
-        eos_token="",
-        sep_token="",
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        mask_token="",
-        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        bos_token: str = "",
+        eos_token: str = "",
+        sep_token: str = "",
+        cls_token: str = "",
+        unk_token: str = "",
+        pad_token: str = "",
+        mask_token: str = "",
+        add_prefix_space: bool = True,
+        vocab: Optional[dict] = None,
+        vocab_file: Optional[str] = None,
         **kwargs,
-    ) -> None:
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
-
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(str(vocab_file))
-        self.vocab_file = vocab_file
-
-        # Original fairseq vocab and spm vocab must be "aligned":
-        # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
-        # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
-        # fairseq  | ''   | '' | '' | '' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
-        # spm      | '' | ''   | '' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'
-
-        # Mimic fairseq token-to-id alignment for the first 4 token
-        self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3}
+    ):
+        self.add_prefix_space = add_prefix_space
+
+        if vocab is not None:
+            self._vocab = vocab
+        else:
+            self._vocab = [
+                (str(bos_token), 0.0),
+                (str(pad_token), 0.0),
+                (str(eos_token), 0.0),
+                (str(unk_token), 0.0),
+                (str(mask_token), 0.0),
+            ]
+
+        self._tokenizer = Tokenizer(Unigram(vocab=self._vocab, unk_id=3, byte_fallback=False))
+
+        self._tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.Strip(left=False, right=True),
+                normalizers.Replace(" {2,}", "▁"),
+            ]
+        )
 
-        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
-        self.fairseq_offset = 1
+        prepend_scheme = "always" if add_prefix_space else "never"
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.WhitespaceSplit(),
+                pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=prepend_scheme),
+            ]
+        )
+        self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)
 
-        self.fairseq_tokens_to_ids[""] = len(self.sp_model) + self.fairseq_offset
-        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+        tokenizer_object = self._tokenizer
 
         super().__init__(
+            tokenizer_object=tokenizer_object,
             bos_token=bos_token,
             eos_token=eos_token,
-            unk_token=unk_token,
             sep_token=sep_token,
             cls_token=cls_token,
+            unk_token=unk_token,
             pad_token=pad_token,
             mask_token=mask_token,
-            sp_model_kwargs=self.sp_model_kwargs,
+            add_prefix_space=add_prefix_space,
             **kwargs,
         )
 
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__ = d
-
-        # for backward compatibility
-        if not hasattr(self, "sp_model_kwargs"):
-            self.sp_model_kwargs = {}
-
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An XLM-RoBERTa sequence has the following format:
-
-        - single sequence: ` X `
-        - pair of sequences: ` A  B `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is None:
-            return [1] + ([0] * len(token_ids_0)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
-        not make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-
-        """
-
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-    @property
-    def vocab_size(self):
-        return len(self.sp_model) + self.fairseq_offset + 1  # Add the  token
-
-    def get_vocab(self):
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def _tokenize(self, text: str) -> list[str]:
-        # TODO check if the t5/llama PR also applies here
-        return self.sp_model.encode(text, out_type=str)
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        if token in self.fairseq_tokens_to_ids:
-            return self.fairseq_tokens_to_ids[token]
-        spm_id = self.sp_model.PieceToId(token)
-
-        # Need to return unknown token if the SP model returned 0
-        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        if index in self.fairseq_ids_to_tokens:
-            return self.fairseq_ids_to_tokens[index]
-        return self.sp_model.IdToPiece(index - self.fairseq_offset)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (strings for sub-words) in a single string."""
-        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
-        return out_string
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=["$A", ""],
+            pair=["$A", "", "$B", ""],
+            special_tokens=[
+                ("", self.eos_token_id),
+            ],
         )
 
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (out_vocab_file,)
+        self.vocab_file = vocab_file
 
 
 __all__ = ["XLMRobertaTokenizer"]
diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
deleted file mode 100644
index bcdea2325fc1..000000000000
--- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
+++ /dev/null
@@ -1,194 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License
-"""Tokenization classes for XLM-RoBERTa model."""
-
-import os
-from shutil import copyfile
-from typing import Optional
-
-from ...tokenization_utils import AddedToken
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import is_sentencepiece_available, logging
-
-
-if is_sentencepiece_available():
-    from .tokenization_xlm_roberta import XLMRobertaTokenizer
-else:
-    XLMRobertaTokenizer = None
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
-
-
-class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" XLM-RoBERTa tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
-    [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
-    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            Path to the vocabulary file.
-        bos_token (`str`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        mask_token (`str`, *optional*, defaults to `""`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        additional_special_tokens (`list[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`):
-            Additional special tokens used by the tokenizer.
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = XLMRobertaTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        bos_token="",
-        eos_token="",
-        sep_token="",
-        cls_token="",
-        unk_token="",
-        pad_token="",
-        mask_token="",
-        **kwargs,
-    ):
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
-        super().__init__(
-            vocab_file,
-            tokenizer_file=tokenizer_file,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            sep_token=sep_token,
-            cls_token=cls_token,
-            unk_token=unk_token,
-            pad_token=pad_token,
-            mask_token=mask_token,
-            **kwargs,
-        )
-
-        self.vocab_file = vocab_file
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An XLM-RoBERTa sequence has the following format:
-
-        - single sequence: ` X `
-        - pair of sequences: ` A  B `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
-        not make use of token type ids, therefore a list of zeros is returned.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of zeros.
-
-        """
-
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not self.can_save_slow_tokenizer:
-            raise ValueError(
-                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
-                "tokenizer."
-            )
-
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
-
-
-__all__ = ["XLMRobertaTokenizerFast"]
diff --git a/src/transformers/models/xlnet/__init__.py b/src/transformers/models/xlnet/__init__.py
index 73fe8d46985c..5734b98957c6 100644
--- a/src/transformers/models/xlnet/__init__.py
+++ b/src/transformers/models/xlnet/__init__.py
@@ -21,7 +21,6 @@
     from .configuration_xlnet import *
     from .modeling_xlnet import *
     from .tokenization_xlnet import *
-    from .tokenization_xlnet_fast import *
 else:
     import sys
 
diff --git a/src/transformers/models/xlnet/tokenization_xlnet.py b/src/transformers/models/xlnet/tokenization_xlnet.py
index 9186db33d788..7b8ffdbb24ea 100644
--- a/src/transformers/models/xlnet/tokenization_xlnet.py
+++ b/src/transformers/models/xlnet/tokenization_xlnet.py
@@ -14,22 +14,21 @@
 # limitations under the License.
 """Tokenization classes for XLNet model."""
 
-import os
-import unicodedata
-from shutil import copyfile
-from typing import Any, Optional
+from typing import Optional
 
-import sentencepiece as spm
+from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
+from tokenizers.models import Unigram
 
-from ...tokenization_utils import AddedToken, PreTrainedTokenizer
-from ...utils import SPIECE_UNDERLINE, logging
-from ...utils.import_utils import requires
+from ...tokenization_utils_base import _get_prepend_scheme
+from ...tokenization_utils_tokenizers import TokenizersBackend
+from ...utils import logging
 
 
 logger = logging.get_logger(__name__)
 
-VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
 
+SPIECE_UNDERLINE = "▁"
 
 # Segments (not really needed)
 SEG_ID_A = 0
@@ -39,18 +38,19 @@
 SEG_ID_PAD = 4
 
 
-@requires(backends=("sentencepiece",))
-class XLNetTokenizer(PreTrainedTokenizer):
+class XLNetTokenizer(TokenizersBackend):
     """
-    Construct an XLNet tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
+    Construct a XLNet tokenizer (backed by HuggingFace's *tokenizers* library). Based on
+    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
 
-    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
-    this superclass for more information regarding those methods.
+    This tokenizer inherits from [`TokenizersBackend`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (`str`):
-            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
-            contains the vocabulary necessary to instantiate a tokenizer.
+        vocab (`list of tuples`, *optional*):
+            List of (token, score) tuples for Unigram model. If not provided, an empty list is used.
+        unk_id (`int`, *optional*, defaults to 0):
+            The ID of the unknown token in the vocabulary.
         do_lower_case (`bool`, *optional*, defaults to `False`):
             Whether to lowercase the input when tokenizing.
         remove_space (`bool`, *optional*, defaults to `True`):
@@ -92,27 +92,8 @@ class XLNetTokenizer(PreTrainedTokenizer):
         mask_token (`str`, *optional*, defaults to `""`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        additional_special_tokens (`list[str]`, *optional*, defaults to `['', '']`):
+        additional_special_tokens (`list[str]`, *optional*, defaults to `["", ""]`):
             Additional special tokens used by the tokenizer.
-        sp_model_kwargs (`dict`, *optional*):
-            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
-            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
-            to set:
-
-            - `enable_sampling`: Enable subword regularization.
-            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
-
-              - `nbest_size = {0,1}`: No sampling is performed.
-              - `nbest_size > 1`: samples from the nbest_size results.
-              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
-                using forward-filtering-and-backward-sampling algorithm.
-
-            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
-              BPE-dropout.
-
-    Attributes:
-        sp_model (`SentencePieceProcessor`):
-            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -120,7 +101,8 @@ class XLNetTokenizer(PreTrainedTokenizer):
 
     def __init__(
         self,
-        vocab_file,
+        vocab: Optional[list] = None,
+        unk_id: int = 0,
         do_lower_case=False,
         remove_space=True,
         keep_accents=False,
@@ -131,24 +113,60 @@ def __init__(
         pad_token="",
         cls_token="",
         mask_token="",
-        additional_special_tokens=["", ""],
-        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        additional_special_tokens=None,
         **kwargs,
-    ) -> None:
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
+    ):
+        if additional_special_tokens is None:
+            additional_special_tokens = ["", ""]
 
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        if vocab is not None:
+            self._vocab = vocab
+        else:
+            self._vocab = [(str(unk_token), 0.0)]
+
+        self._tokenizer = Tokenizer(
+            Unigram(
+                self._vocab,
+                unk_id=unk_id,
+                byte_fallback=False,
+            )
+        )
 
+        list_normalizers = [
+            normalizers.Replace("``", '"'),
+            normalizers.Replace("''", '"'),
+        ]
+        #  if not keep_accents:
+        list_normalizers.append(normalizers.NFKD())
+        list_normalizers.append(normalizers.StripAccents())
+        if do_lower_case:
+            list_normalizers.append(normalizers.Lowercase())
+
+        list_normalizers.append(normalizers.Replace(Regex(" {2,}"), " "))
+        self._tokenizer.normalizer = normalizers.Sequence(list_normalizers)
+
+        add_prefix_space = True
+        prepend_scheme = _get_prepend_scheme(add_prefix_space, self)
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.WhitespaceSplit(),
+                pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=prepend_scheme),
+            ]
+        )
+
+        self._tokenizer.decoder = decoders.Metaspace(replacement="▁", prepend_scheme=prepend_scheme)
+        self._pad_token_type_id = 3
         self.do_lower_case = do_lower_case
         self.remove_space = remove_space
         self.keep_accents = keep_accents
-        self.vocab_file = vocab_file
 
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(vocab_file)
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+        tokenizer_object = self._tokenizer
 
         super().__init__(
+            tokenizer_object=tokenizer_object,
+            unk_id=unk_id,
             do_lower_case=do_lower_case,
             remove_space=remove_space,
             keep_accents=keep_accents,
@@ -160,228 +178,17 @@ def __init__(
             cls_token=cls_token,
             mask_token=mask_token,
             additional_special_tokens=additional_special_tokens,
-            sp_model_kwargs=self.sp_model_kwargs,
             **kwargs,
         )
 
-        self._pad_token_type_id = 3
-
-    @property
-    def vocab_size(self):
-        return len(self.sp_model)
-
-    def get_vocab(self):
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["sp_model"] = None
-        return state
-
-    def __setstate__(self, d):
-        self.__dict__ = d
-
-        # for backward compatibility
-        if not hasattr(self, "sp_model_kwargs"):
-            self.sp_model_kwargs = {}
-
-        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
-        self.sp_model.Load(self.vocab_file)
-
-    def preprocess_text(self, inputs):
-        if self.remove_space:
-            outputs = " ".join(inputs.strip().split())
-        else:
-            outputs = inputs
-        outputs = outputs.replace("``", '"').replace("''", '"')
-
-        if not self.keep_accents:
-            outputs = unicodedata.normalize("NFKD", outputs)
-            outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
-        if self.do_lower_case:
-            outputs = outputs.lower()
-
-        return outputs
-
-    def _tokenize(self, text: str) -> list[str]:
-        """Tokenize a string."""
-        text = self.preprocess_text(text)
-        pieces = self.sp_model.encode(text, out_type=str)
-        new_pieces = []
-        for piece in pieces:
-            if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit():
-                cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, ""))
-                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
-                    if len(cur_pieces[0]) == 1:
-                        cur_pieces = cur_pieces[1:]
-                    else:
-                        cur_pieces[0] = cur_pieces[0][1:]
-                cur_pieces.append(piece[-1])
-                new_pieces.extend(cur_pieces)
-            else:
-                new_pieces.append(piece)
-
-        return new_pieces
-
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.sp_model.PieceToId(token)
-
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.sp_model.IdToPiece(index)
-
-    def convert_tokens_to_string(self, tokens):
-        """Converts a sequence of tokens (strings for sub-words) in a single string."""
-        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
-        return out_string
-
-    def _decode(
-        self,
-        token_ids: list[int],
-        skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: Optional[bool] = None,
-        spaces_between_special_tokens: bool = True,
-        **kwargs,
-    ) -> str:
-        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
-
-        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
-
-        # To avoid mixing byte-level and unicode for byte-level BPT
-        # we need to build string separately for added tokens and byte-level tokens
-        # cf. https://github.com/huggingface/transformers/issues/1133
-        sub_texts = []
-        current_sub_text = []
-        for token in filtered_tokens:
-            if skip_special_tokens and token in self.all_special_ids:
-                continue
-            if token in self.added_tokens_encoder:
-                if current_sub_text:
-                    sub_texts.append(self.convert_tokens_to_string(current_sub_text))
-                    current_sub_text = []
-                sub_texts.append(token)
-            else:
-                current_sub_text.append(token)
-        if current_sub_text:
-            sub_texts.append(self.convert_tokens_to_string(current_sub_text))
-
-        # Mimic the behavior of the Rust tokenizer:
-        # By default, there are no spaces between special tokens
-        text = "".join(sub_texts)
-
-        clean_up_tokenization_spaces = (
-            clean_up_tokenization_spaces
-            if clean_up_tokenization_spaces is not None
-            else self.clean_up_tokenization_spaces
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=f"$A:0 {str(self.sep_token)}:0 {str(self.cls_token)}:2",
+            pair=f"$A:0 {str(self.sep_token)}:0 $B:1 {str(self.sep_token)}:1 {str(self.cls_token)}:2",
+            special_tokens=[
+                (str(self.sep_token), self.sep_token_id),
+                (str(self.cls_token), self.cls_token_id),
+            ],
         )
-        if clean_up_tokenization_spaces:
-            clean_text = self.clean_up_tokenization(text)
-            return clean_text
-        else:
-            return text
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An XLNet sequence has the following format:
-
-        - single sequence: `X  `
-        - pair of sequences: `A  B  `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return token_ids_0 + sep + cls
-        return token_ids_0 + sep + token_ids_1 + sep + cls
-
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` method.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            return super().get_special_tokens_mask(
-                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
-            )
-
-        if token_ids_1 is not None:
-            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
-        return ([0] * len(token_ids_0)) + [1, 1]
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet
-        sequence pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls_segment_id = [2]
-
-        if token_ids_1 is None:
-            return len(token_ids_0 + sep) * [0] + cls_segment_id
-        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (out_vocab_file,)
 
 
 __all__ = ["XLNetTokenizer"]
diff --git a/src/transformers/models/xlnet/tokenization_xlnet_fast.py b/src/transformers/models/xlnet/tokenization_xlnet_fast.py
deleted file mode 100644
index 56cd2a50e1b2..000000000000
--- a/src/transformers/models/xlnet/tokenization_xlnet_fast.py
+++ /dev/null
@@ -1,230 +0,0 @@
-# coding=utf-8
-# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for XLNet model."""
-
-import os
-from shutil import copyfile
-from typing import Optional
-
-from ...tokenization_utils import AddedToken
-from ...tokenization_utils_fast import PreTrainedTokenizerFast
-from ...utils import is_sentencepiece_available, logging
-
-
-if is_sentencepiece_available():
-    from .tokenization_xlnet import XLNetTokenizer
-else:
-    XLNetTokenizer = None
-
-
-logger = logging.get_logger(__name__)
-
-VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
-
-
-SPIECE_UNDERLINE = "▁"
-
-# Segments (not really needed)
-SEG_ID_A = 0
-SEG_ID_B = 1
-SEG_ID_CLS = 2
-SEG_ID_SEP = 3
-SEG_ID_PAD = 4
-
-
-class XLNetTokenizerFast(PreTrainedTokenizerFast):
-    """
-    Construct a "fast" XLNet tokenizer (backed by HuggingFace's *tokenizers* library). Based on
-    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
-
-    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
-
-    Args:
-        vocab_file (`str`):
-            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
-            contains the vocabulary necessary to instantiate a tokenizer.
-        do_lower_case (`bool`, *optional*, defaults to `True`):
-            Whether to lowercase the input when tokenizing.
-        remove_space (`bool`, *optional*, defaults to `True`):
-            Whether to strip the text when tokenizing (removing excess spaces before and after the string).
-        keep_accents (`bool`, *optional*, defaults to `False`):
-            Whether to keep accents when tokenizing.
-        bos_token (`str`, *optional*, defaults to `""`):
-            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the `cls_token`.
-
-            
-
-        eos_token (`str`, *optional*, defaults to `""`):
-            The end of sequence token.
-
-            
-
-            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
-            The token used is the `sep_token`.
-
-            
-
-        unk_token (`str`, *optional*, defaults to `""`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        sep_token (`str`, *optional*, defaults to `""`):
-            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
-            sequence classification or for a text and a question for question answering. It is also used as the last
-            token of a sequence built with special tokens.
-        pad_token (`str`, *optional*, defaults to `""`):
-            The token used for padding, for example when batching sequences of different lengths.
-        cls_token (`str`, *optional*, defaults to `""`):
-            The classifier token which is used when doing sequence classification (classification of the whole sequence
-            instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (`str`, *optional*, defaults to `""`):
-            The token used for masking values. This is the token used when training this model with masked language
-            modeling. This is the token which the model will try to predict.
-        additional_special_tokens (`list[str]`, *optional*, defaults to `["", ""]`):
-            Additional special tokens used by the tokenizer.
-
-    Attributes:
-        sp_model (`SentencePieceProcessor`):
-            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
-    """
-
-    vocab_files_names = VOCAB_FILES_NAMES
-    padding_side = "left"
-    slow_tokenizer_class = XLNetTokenizer
-
-    def __init__(
-        self,
-        vocab_file=None,
-        tokenizer_file=None,
-        do_lower_case=False,
-        remove_space=True,
-        keep_accents=False,
-        bos_token="",
-        eos_token="",
-        unk_token="",
-        sep_token="",
-        pad_token="",
-        cls_token="",
-        mask_token="",
-        additional_special_tokens=["", ""],
-        **kwargs,
-    ):
-        # Mask token behave like a normal word, i.e. include the space before it
-        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
-
-        super().__init__(
-            vocab_file=vocab_file,
-            tokenizer_file=tokenizer_file,
-            do_lower_case=do_lower_case,
-            remove_space=remove_space,
-            keep_accents=keep_accents,
-            bos_token=bos_token,
-            eos_token=eos_token,
-            unk_token=unk_token,
-            sep_token=sep_token,
-            pad_token=pad_token,
-            cls_token=cls_token,
-            mask_token=mask_token,
-            additional_special_tokens=additional_special_tokens,
-            **kwargs,
-        )
-
-        self._pad_token_type_id = 3
-        self.do_lower_case = do_lower_case
-        self.remove_space = remove_space
-        self.keep_accents = keep_accents
-        self.vocab_file = vocab_file
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An XLNet sequence has the following format:
-
-        - single sequence: `X  `
-        - pair of sequences: `A  B  `
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs to which the special tokens will be added.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return token_ids_0 + sep + cls
-        return token_ids_0 + sep + token_ids_1 + sep + cls
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet
-        sequence pair mask has the following format:
-
-        ```
-        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence |
-        ```
-
-        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of IDs.
-            token_ids_1 (`list[int]`, *optional*):
-                Optional second list of IDs for sequence pairs.
-
-        Returns:
-            `list[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
-        """
-        sep = [self.sep_token_id]
-        cls_segment_id = [2]
-
-        if token_ids_1 is None:
-            return len(token_ids_0 + sep) * [0] + cls_segment_id
-        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
-        if not self.can_save_slow_tokenizer:
-            raise ValueError(
-                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
-                "tokenizer."
-            )
-
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-        out_vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-
-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-
-        return (out_vocab_file,)
-
-
-__all__ = ["XLNetTokenizerFast"]
diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index 4e804b83605d..d13dd9bf891d 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -31,7 +31,7 @@
 from ..models.auto.processing_auto import PROCESSOR_MAPPING, AutoProcessor
 from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
 from ..processing_utils import ProcessorMixin
-from ..tokenization_utils import PreTrainedTokenizer
+from ..tokenization_python import PreTrainedTokenizer
 from ..utils import (
     CONFIG_NAME,
     cached_file,
@@ -127,7 +127,7 @@
 
 if TYPE_CHECKING:
     from ..modeling_utils import PreTrainedModel
-    from ..tokenization_utils_fast import PreTrainedTokenizerFast
+    from ..tokenization_utils_tokenizers import PreTrainedTokenizerFast
 
 
 logger = logging.get_logger(__name__)
diff --git a/src/transformers/pipelines/any_to_any.py b/src/transformers/pipelines/any_to_any.py
index e5febf875d7e..a72c593a7c3f 100644
--- a/src/transformers/pipelines/any_to_any.py
+++ b/src/transformers/pipelines/any_to_any.py
@@ -369,7 +369,7 @@ def preprocess(self, inputs=None, timeout=None, continue_final_message=None, **p
 
             # Handle Mistral tokenizer which does not accept processing kwargs
             chat_template_kwargs = {"add_generation_prompt": not continue_final_message, **processing_kwargs}
-            if self.processor.tokenizer.__class__.__name__ == "MistralCommonTokenizer":
+            if self.processor.tokenizer.__class__.__name__ == "MistralCommonBackend":
                 chat_template_kwargs = {
                     k: v for k, v in chat_template_kwargs.items() if k in ["padding", "truncation", "max_length"]
                 }
diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index 37034ad94f94..f09c529072f8 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -18,7 +18,7 @@
 import numpy as np
 
 from ..generation import GenerationConfig
-from ..tokenization_utils import PreTrainedTokenizer
+from ..tokenization_python import PreTrainedTokenizer
 from ..utils import is_torch_available, is_torchaudio_available, is_torchcodec_available, logging
 from .audio_utils import ffmpeg_read
 from .base import ChunkPipeline
diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index b51873aed8ee..76db996925fe 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -35,7 +35,7 @@
 from ..modelcard import ModelCard
 from ..models.auto import AutoConfig, AutoTokenizer
 from ..processing_utils import ProcessorMixin
-from ..tokenization_utils import PreTrainedTokenizer
+from ..tokenization_python import PreTrainedTokenizer
 from ..utils import (
     ModelOutput,
     PushToHubMixin,
diff --git a/src/transformers/pipelines/deprecated/text2text_generation.py b/src/transformers/pipelines/deprecated/text2text_generation.py
index 54e1b2873041..d1b4c4855f25 100644
--- a/src/transformers/pipelines/deprecated/text2text_generation.py
+++ b/src/transformers/pipelines/deprecated/text2text_generation.py
@@ -3,7 +3,7 @@
 from typing import Any
 
 from ...generation import GenerationConfig
-from ...tokenization_utils import TruncationStrategy
+from ...tokenization_python import TruncationStrategy
 from ...utils import add_end_docstrings, is_torch_available, logging
 from ..base import Pipeline, build_pipeline_init_args
 
diff --git a/src/transformers/pipelines/question_answering.py b/src/transformers/pipelines/question_answering.py
index 51d4203fcd56..146465c28ffe 100644
--- a/src/transformers/pipelines/question_answering.py
+++ b/src/transformers/pipelines/question_answering.py
@@ -8,7 +8,7 @@
 
 from ..data import SquadExample, SquadFeatures, squad_convert_examples_to_features
 from ..modelcard import ModelCard
-from ..tokenization_utils import PreTrainedTokenizer
+from ..tokenization_python import PreTrainedTokenizer
 from ..utils import (
     PaddingStrategy,
     add_end_docstrings,
diff --git a/src/transformers/pipelines/token_classification.py b/src/transformers/pipelines/token_classification.py
index 412c11c875b3..1e7b2bec2c0f 100644
--- a/src/transformers/pipelines/token_classification.py
+++ b/src/transformers/pipelines/token_classification.py
@@ -4,7 +4,7 @@
 
 import numpy as np
 
-from ..models.bert.tokenization_bert import BasicTokenizer
+from ..models.bert.tokenization_bert_legacy import BasicTokenizer
 from ..utils import (
     ExplicitEnum,
     add_end_docstrings,
diff --git a/src/transformers/pipelines/zero_shot_classification.py b/src/transformers/pipelines/zero_shot_classification.py
index 3eca4ea4eb9c..2f9df70e6a14 100644
--- a/src/transformers/pipelines/zero_shot_classification.py
+++ b/src/transformers/pipelines/zero_shot_classification.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-from ..tokenization_utils import TruncationStrategy
+from ..tokenization_python import TruncationStrategy
 from ..utils import add_end_docstrings, logging
 from .base import ArgumentHandler, ChunkPipeline, build_pipeline_init_args
 
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
index ff24b9df710c..ca15054f5e66 100644
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -124,7 +124,7 @@ def keys(self):
 MODALITY_TO_BASE_CLASS_MAPPING = {
     "audio_tokenizer": "DacModel",
     "audio_processor": "FeatureExtractionMixin",
-    "tokenizer": ("PreTrainedTokenizerBase", "MistralCommonTokenizer"),
+    "tokenizer": ("PreTrainedTokenizerBase", "MistralCommonBackend"),
     "feature_extractor": "FeatureExtractionMixin",
     "image_processor": "ImageProcessingMixin",
     "video_processor": "BaseVideoProcessor",
@@ -1412,6 +1412,21 @@ def get_attributes(cls):
                 continue
             if sub_processor_type in MODALITY_TO_AUTOPROCESSOR_MAPPING or "tokenizer" in sub_processor_type:
                 attributes.append(sub_processor_type)
+
+        # Legacy processors may not override `__init__` and instead expose modality
+        # attributes via `_class`. In that case, `args_in_init` only exposes
+        # `*args`/`**kwargs`, so we need to infer the attributes from those class-level
+        # hints to keep backward compatibility (e.g. dynamic processors stored on the Hub).
+        if not attributes:
+            for attribute_name, value in cls.__dict__.items():
+                if value is None or attribute_name == "audio_tokenizer_class" or not attribute_name.endswith("_class"):
+                    continue
+                inferred_attribute = attribute_name[: -len("_class")]
+                if inferred_attribute == "audio_tokenizer":
+                    continue
+                if inferred_attribute in MODALITY_TO_AUTOPROCESSOR_MAPPING or "tokenizer" in inferred_attribute:
+                    attributes.append(inferred_attribute)
+
         return attributes
 
     @classmethod
@@ -1584,7 +1599,14 @@ def apply_chat_template(
                 # It's a template string, render it directly
                 pass
 
-        is_tokenizers_fast = hasattr(self, "tokenizer") and self.tokenizer.__class__.__name__.endswith("Fast")
+        # Check if tokenizer is fast - use backend attribute if available, otherwise fall back to class name
+        is_tokenizers_fast = False
+        if hasattr(self, "tokenizer"):
+            if hasattr(self.tokenizer, "backend"):
+                is_tokenizers_fast = self.tokenizer.backend == "tokenizers"
+            else:
+                # Fallback to class name check
+                is_tokenizers_fast = self.tokenizer.__class__.__name__.endswith("Fast")
 
         if kwargs.get("continue_final_message", False):
             if kwargs.get("add_generation_prompt", False):
@@ -1674,11 +1696,19 @@ def apply_chat_template(
                 batch_images.append(images)
                 batch_videos.append(videos)
 
+        special_tokens_map = {}
+        if hasattr(self, "tokenizer") and hasattr(self.tokenizer, "special_tokens_map"):
+            special_tokens = self.tokenizer.special_tokens_map
+            # Filter out tokens that conflict with template kwargs
+            special_tokens_map = {
+                k: v for k, v in special_tokens.items() if k not in processed_kwargs["template_kwargs"]
+            }
+
         prompt, generation_indices = render_jinja_template(
             conversations=conversations,
             chat_template=chat_template,
             **processed_kwargs["template_kwargs"],  # different flags such as `return_assistant_mask`
-            **self.tokenizer.special_tokens_map,  # tokenizer special tokens are used by some templates
+            **special_tokens_map,  # tokenizer special tokens are used by some templates
         )
 
         if not is_batched:
@@ -1727,10 +1757,14 @@ def apply_chat_template(
 
                             if not (
                                 start_pos >= 0
+                                and start_pos < len(offsets)
                                 and offsets[start_pos][0] <= assistant_start_char < offsets[start_pos][1]
                             ):
                                 # start_token is out of bounds maybe due to truncation.
                                 continue
+                            # Ensure end_pos is also within bounds
+                            if end_pos > len(input_ids[i]):
+                                end_pos = len(input_ids[i])
                             for token_id in range(start_pos, end_pos if end_pos else len(input_ids[i])):
                                 current_mask[token_id] = 1
                         assistant_masks.append(current_mask)
@@ -1779,14 +1813,14 @@ def post_process_image_text_to_text(self, generated_outputs, skip_special_tokens
                 The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                 or `(sequence_length,)`.
             skip_special_tokens (`bool`, *optional*, defaults to `True`):
-                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
+                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `decode` method.
             **kwargs:
-                Additional arguments to be passed to the tokenizer's `batch_decode method`.
+                Additional arguments to be passed to the tokenizer's `decode` method.
 
         Returns:
             `list[str]`: The decoded text.
         """
-        return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=skip_special_tokens, **kwargs)
+        return self.tokenizer.decode(generated_outputs, skip_special_tokens=skip_special_tokens, **kwargs)
 
     def _check_special_mm_tokens(self, text: list[str], text_inputs: "BatchFeature", modalities: list[str]):
         """
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index bdaf22e81478..89bc2d750b28 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -42,7 +42,7 @@
 from functools import cache, wraps
 from io import StringIO
 from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any
 from unittest import mock
 from unittest.mock import patch
 
@@ -51,9 +51,14 @@
 from huggingface_hub import create_repo, delete_repo
 from packaging import version
 
-from transformers import Trainer
 from transformers import logging as transformers_logging
 
+
+if TYPE_CHECKING:
+    from .trainer import Trainer
+else:
+    Trainer = Any  # type: ignore
+
 from .integrations import (
     is_clearml_available,
     is_optuna_available,
diff --git a/src/transformers/tokenization_mistral_common.py b/src/transformers/tokenization_mistral_common.py
index c79c42bb52b9..a98a2e7004dc 100644
--- a/src/transformers/tokenization_mistral_common.py
+++ b/src/transformers/tokenization_mistral_common.py
@@ -30,7 +30,6 @@
     BatchEncoding,
     EncodedInput,
     PreTokenizedInput,
-    PreTrainedTokenizerBase,
     TextInput,
     TruncationStrategy,
 )
@@ -151,7 +150,7 @@ class MistralTokenizerType(str, Enum):
 
 
 @requires(backends=("mistral-common",))
-class MistralCommonTokenizer(PushToHubMixin):
+class MistralCommonBackend(PushToHubMixin):
     """
     Class to wrap `mistral-common` tokenizers.
 
@@ -170,23 +169,23 @@ class MistralCommonTokenizer(PushToHubMixin):
 
     Supports the following methods from the `PreTrainedTokenizerBase` class:
 
-    - [`~MistralCommonTokenizer.get_vocab`]: Returns the vocabulary as a dictionary of token to index.
+    - [`~MistralCommonBackend.get_vocab`]: Returns the vocabulary as a dictionary of token to index.
         This is a lossy conversion for Tekkenizer as some decoding errors are collapsed into the same token.
-    - [`~MistralCommonTokenizer.encode`]: Encode a string to a list of integers.
-    - [`~MistralCommonTokenizer.decode`]: Decode a list of integers to a string.
-    - [`~MistralCommonTokenizer.batch_decode`]: Decode a batch of list of integers to a list of strings.
-    - [`~MistralCommonTokenizer.convert_tokens_to_ids`]: Convert a list of tokens to a list of integers.
-    - [`~MistralCommonTokenizer.convert_ids_to_tokens`]: Convert a list of integers to a list of tokens.
-    - [`~MistralCommonTokenizer.tokenize`]: Tokenize a string.
-    - [`~MistralCommonTokenizer.get_special_tokens_mask`]: Get the special tokens mask for a list of tokens.
-    - [`~MistralCommonTokenizer.prepare_for_model`]: Prepare a list of inputs for the model.
-    - [`~MistralCommonTokenizer.pad`]: Pad a list of inputs to the same length.
-    - [`~MistralCommonTokenizer.truncate_sequences`]: Truncate a list of sequences to the same length.
-    - [`~MistralCommonTokenizer.apply_chat_template`]: Apply a chat template to a list of messages.
-    - [`~MistralCommonTokenizer.__call__`]: Tokenize a string or a list of strings.
-    - [`~MistralCommonTokenizer.from_pretrained`]: Download and cache a pretrained tokenizer from the Hugging Face model hub or local directory.
-    - [`~MistralCommonTokenizer.save_pretrained`]: Save a tokenizer to a directory, so it can be reloaded using the `from_pretrained` class method.
-    - [`~MistralCommonTokenizer.push_to_hub`]: Upload tokenizer to the Hugging Face model hub.
+    - [`~MistralCommonBackend.encode`]: Encode a string to a list of integers.
+    - [`~MistralCommonBackend.decode`]: Decode a list of integers to a string.
+    - [`~MistralCommonBackend.batch_decode`]: Decode a batch of list of integers to a list of strings.
+    - [`~MistralCommonBackend.convert_tokens_to_ids`]: Convert a list of tokens to a list of integers.
+    - [`~MistralCommonBackend.convert_ids_to_tokens`]: Convert a list of integers to a list of tokens.
+    - [`~MistralCommonBackend.tokenize`]: Tokenize a string.
+    - [`~MistralCommonBackend.get_special_tokens_mask`]: Get the special tokens mask for a list of tokens.
+    - [`~MistralCommonBackend.prepare_for_model`]: Prepare a list of inputs for the model.
+    - [`~MistralCommonBackend.pad`]: Pad a list of inputs to the same length.
+    - [`~MistralCommonBackend.truncate_sequences`]: Truncate a list of sequences to the same length.
+    - [`~MistralCommonBackend.apply_chat_template`]: Apply a chat template to a list of messages.
+    - [`~MistralCommonBackend.__call__`]: Tokenize a string or a list of strings.
+    - [`~MistralCommonBackend.from_pretrained`]: Download and cache a pretrained tokenizer from the Hugging Face model hub or local directory.
+    - [`~MistralCommonBackend.save_pretrained`]: Save a tokenizer to a directory, so it can be reloaded using the `from_pretrained` class method.
+    - [`~MistralCommonBackend.push_to_hub`]: Upload tokenizer to the Hugging Face model hub.
 
     Here are the key differences with the `PreTrainedTokenizerBase` class:
 
@@ -214,7 +213,7 @@ def __init__(
         **kwargs,
     ):
         """
-        Constructs a `MistralCommonTokenizer`.
+        Constructs a `MistralCommonBackend`.
 
         - **model_input_names** (`List[str]`) -- A list of inputs expected in the forward pass of the model.
         - **padding_side** (`str`) -- The default value for the side on which the model should have padding applied.
@@ -246,7 +245,7 @@ def __init__(
                 tokenization process.
         """
         if kwargs:
-            raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported to init `MistralCommonTokenizer`.")
+            raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported to init `MistralCommonBackend`.")
 
         self._tokenizer_path = Path(tokenizer_path)
         self.tokenizer: MistralTokenizer = MistralTokenizer.from_file(str(self._tokenizer_path), mode=mode)
@@ -274,6 +273,24 @@ def __init__(
 
         self._cache_get_vocab: dict[str, int] | None = None
 
+    @staticmethod
+    def clean_up_tokenization(text: str) -> str:
+        """
+        Clean up a list of simple English tokenization artifacts like spaces before punctuation.
+        """
+        return (
+            text.replace(" .", ".")
+            .replace(" ?", "?")
+            .replace(" !", "!")
+            .replace(" ,", ",")
+            .replace(" ' ", "'")
+            .replace(" n't", "n't")
+            .replace(" 'm", "'m")
+            .replace(" 's", "'s")
+            .replace(" 've", "'ve")
+            .replace(" 're", "'re")
+        )
+
     @property
     def bos_token_id(self) -> int:
         """
@@ -368,7 +385,7 @@ def __len__(self):
     @add_end_docstrings(
         ENCODE_KWARGS_DOCSTRING,
         """
-            **kwargs: Not supported by `MistralCommonTokenizer.encode`.
+            **kwargs: Not supported by `MistralCommonBackend.encode`.
                 Will raise an error if used.
         """,
         """
@@ -398,12 +415,12 @@ def encode(
             text (`str` or `List[int]`):
                 The first sequence to be encoded. This can be a string or a list of integers (tokenized string ids).
             text_pair (`None`, *optional*):
-                Not supported by `MistralCommonTokenizer.encode`. Kept to match `PreTrainedTokenizerBase.encode` signature.
+                Not supported by `MistralCommonBackend.encode`. Kept to match `PreTrainedTokenizerBase.encode` signature.
         """
         if kwargs:
-            raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.encode`.")
+            raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.encode`.")
         if text_pair:
-            raise ValueError("`MistralCommonTokenizer.encode` does not support `text_pair`.")
+            raise ValueError("`MistralCommonBackend.encode` does not support `text_pair`.")
 
         padding_strategy, truncation_strategy, max_length, _ = self._get_padding_truncation_strategies(
             padding=padding,
@@ -452,14 +469,14 @@ def decode(
                 Whether or not to clean up the tokenization spaces. If `None`, will default to
                 `self.clean_up_tokenization_spaces`.
             kwargs (additional keyword arguments, *optional*):
-                Not supported by `MistralCommonTokenizer.decode`.
+                Not supported by `MistralCommonBackend.decode`.
                 Will raise an error if used.
 
         Returns:
             `str`: The decoded sentence.
         """
         if kwargs:
-            raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.decode`.")
+            raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.decode`.")
 
         clean_up_tokenization_spaces = clean_up_tokenization_spaces or self.cleanup_tokenization_spaces
 
@@ -470,7 +487,7 @@ def decode(
 
         decoded_string = self.tokenizer.decode(token_ids, special_token_policy=special_token_policy)
         if clean_up_tokenization_spaces:
-            decoded_string = PreTrainedTokenizerBase.clean_up_tokenization(decoded_string)
+            decoded_string = self.clean_up_tokenization(decoded_string)
 
         # in the specific case of Voxtral, the added f"lang:xx" (always a two char language code since it follows ISO 639-1 alpha-2 format)
         # is not considered as a special token by mistral-common and is encoded/ decoded as normal text.
@@ -499,7 +516,7 @@ def batch_decode(
                 Whether or not to clean up the tokenization spaces. If `None`, will default to
                 `self.clean_up_tokenization_spaces`.
             kwargs (additional keyword arguments, *optional*):
-                Not supported by `MistralCommonTokenizer.batch_decode`.
+                Not supported by `MistralCommonBackend.batch_decode`.
                 Will raise an error if used.
 
         Returns:
@@ -630,14 +647,14 @@ def tokenize(self, text: TextInput, **kwargs) -> list[str]:
             text (`str`):
                 The sequence to be encoded.
             **kwargs (additional keyword arguments):
-                Not supported by `MistralCommonTokenizer.tokenize`.
+                Not supported by `MistralCommonBackend.tokenize`.
                 Will raise an error if used.
 
         Returns:
             `List[str]`: The list of tokens.
         """
         if kwargs:
-            raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.tokenize`.")
+            raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.tokenize`.")
 
         return self.convert_ids_to_tokens(self._text_to_ids(text, add_special_tokens=False), skip_special_tokens=False)
 
@@ -753,7 +770,7 @@ def get_special_tokens_mask(
             token_ids_0 (`List[int]`):
                 List of ids of the sequence.
             token_ids_1 (`List[int]`, *optional*):
-                Not supported by `MistralCommonTokenizer`. Kept to match the interface of `PreTrainedTokenizerBase`.
+                Not supported by `MistralCommonBackend`. Kept to match the interface of `PreTrainedTokenizerBase`.
             already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
@@ -762,11 +779,11 @@ def get_special_tokens_mask(
         """
         if token_ids_1 is not None:
             raise ValueError(
-                "`token_ids_1` is not supported by `MistralCommonTokenizer` and should be `None`, kept for compatibility."
+                "`token_ids_1` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
             )
         if already_has_special_tokens:
             raise ValueError(
-                "`already_has_special_tokens` is not supported by `MistralCommonTokenizer` and should be `False`."
+                "`already_has_special_tokens` is not supported by `MistralCommonBackend` and should be `False`."
             )
 
         all_special_ids = self._all_special_ids()  # cache the ids
@@ -868,15 +885,15 @@ def prepare_for_model(
             ids (`List[int]`):
                 Tokenized input ids of the first sequence.
             pair_ids (`None`, *optional*):
-                Not supported by `MistralCommonTokenizer`. Kept to match the interface of `PreTrainedTokenizerBase`.
+                Not supported by `MistralCommonBackend`. Kept to match the interface of `PreTrainedTokenizerBase`.
         """
         if pair_ids is not None:
             raise ValueError(
-                "`pair_ids` is not supported by `MistralCommonTokenizer` and should be `None`, kept for compatibility."
+                "`pair_ids` is not supported by `MistralCommonBackend` and should be `None`, kept for compatibility."
             )
         if kwargs:
             raise ValueError(
-                f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.prepare_for_model`."
+                f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.prepare_for_model`."
             )
 
         padding_strategy, truncation_strategy, max_length, _ = self._get_padding_truncation_strategies(
@@ -994,7 +1011,7 @@ def _get_padding_truncation_strategies(
                 truncation_strategy = truncation
             if truncation in [TruncationStrategy.ONLY_FIRST, TruncationStrategy.ONLY_SECOND]:
                 raise ValueError(
-                    "Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonTokenizer`."
+                    "Truncation strategy `only_first` and `only_second` are not supported by `MistralCommonBackend`."
                 )
         else:
             truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
@@ -1303,7 +1320,7 @@ def truncate_sequences(
                 Tokenized input ids. Can be obtained from a string by chaining the `tokenize` and
                 `convert_tokens_to_ids` methods.
             pair_ids (`None`, *optional*):
-                Not supported by `MistralCommonTokenizer`. Kept to match the signature of `PreTrainedTokenizerBase.truncate_sequences`.
+                Not supported by `MistralCommonBackend`. Kept to match the signature of `PreTrainedTokenizerBase.truncate_sequences`.
             num_tokens_to_remove (`int`, *optional*, defaults to 0):
                 Number of tokens to remove using the truncation strategy.
             truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `'longest_first'`):
@@ -1323,10 +1340,10 @@ def truncate_sequences(
         """
         if kwargs:
             raise ValueError(
-                f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.truncate_sequences`."
+                f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.truncate_sequences`."
             )
         if pair_ids:
-            raise ValueError("`pair_ids` is not supported by `MistralCommonTokenizer.truncate_sequences`.")
+            raise ValueError("`pair_ids` is not supported by `MistralCommonBackend.truncate_sequences`.")
 
         if num_tokens_to_remove <= 0:
             return (ids, None, [])
@@ -1389,7 +1406,7 @@ def apply_chat_template(
                 [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
                 for more information.
             add_generation_prompt (`bool`, *optional*):
-                This argument is a no-op for `MistralCommonTokenizer`. However it cannot be used at the same time as `continue_final_message` to keep the API consistent and
+                This argument is a no-op for `MistralCommonBackend`. However it cannot be used at the same time as `continue_final_message` to keep the API consistent and
                 if any conversation ends with an assistant message, it will raise an error. In such case, use `continue_final_message` instead.
             continue_final_message (bool, *optional*):
                 If this is set, the chat will be formatted so that the final
@@ -1421,7 +1438,7 @@ def apply_chat_template(
                 Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`.
                 If at least one conversation contains an image, its pixel values will be returned in the `pixel_values` key.
             kwargs (additional keyword arguments, *optional*):
-                Not supported by `MistralCommonTokenizer.apply_chat_template`.
+                Not supported by `MistralCommonBackend.apply_chat_template`.
                 Will raise an error if used.
 
         Returns:
@@ -1430,7 +1447,7 @@ def apply_chat_template(
         """
         if kwargs:
             raise ValueError(
-                f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.apply_chat_template`."
+                f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.apply_chat_template`."
             )
         if not isinstance(truncation, bool):
             raise TypeError("`truncation` must be a boolean for `apply_chat_template` method.")
@@ -1568,7 +1585,7 @@ def _maybe_adapt_message(message: dict[str, Any]) -> None:
 
         else:
             logger.warning(
-                "`MistralCommonTokenizer.apply_chat_template(..., tokenize=False)` is unsafe and may lead to unexpected behavior."
+                "`MistralCommonBackend.apply_chat_template(..., tokenize=False)` is unsafe and may lead to unexpected behavior."
                 " Please consider using `tokenize=True` instead and don't encode the output manually."
             )
             return outputs
@@ -1604,18 +1621,18 @@ def __call__(
                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of int
                 (encoded strings).
             text_pair (`None`, *optional*):
-                Not supported by `MistralCommonTokenizer`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
+                Not supported by `MistralCommonBackend`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
             text_target (`None`, *optional*):
-                Not supported by `MistralCommonTokenizer`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
+                Not supported by `MistralCommonBackend`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
             text_pair_target (`None`, *optional*):
-                Not supported by `MistralCommonTokenizer`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
+                Not supported by `MistralCommonBackend`. Kept to match the signature of `PreTrainedTokenizerBase.__call__`.
         """
         if kwargs:
-            raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.__call__`.")
+            raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.__call__`.")
 
         if text_pair or text_target or text_pair_target:
             raise ValueError(
-                "`text_pair`, `text_target` and `text_pair_target` are not supported by `MistralCommonTokenizer`."
+                "`text_pair`, `text_target` and `text_pair_target` are not supported by `MistralCommonBackend`."
             )
 
         def _is_valid_text_input(t):
@@ -1709,7 +1726,7 @@ def from_pretrained(
         **kwargs,
     ):
         r"""
-        Instantiate a `MistralCommonTokenizer` from a predefined
+        Instantiate a `MistralCommonBackend` from a predefined
         tokenizer.
 
         Args:
@@ -1718,7 +1735,7 @@ def from_pretrained(
 
                 - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                 - A path to a *directory* containing the tokenizer config, for instance saved
-                  using the [`MistralCommonTokenizer.tokenization_mistral_common.save_pretrained`] method, e.g.,
+                  using the [`MistralCommonBackend.tokenization_mistral_common.save_pretrained`] method, e.g.,
                   `./my_model_directory/`.
             mode (`ValidationMode`, *optional*, defaults to `ValidationMode.test`):
                 Validation mode for the `MistralTokenizer` tokenizer.
@@ -1755,17 +1772,17 @@ def from_pretrained(
                 Whether or not the model should cleanup the spaces that were added when splitting the input text during the
                 tokenization process.
             kwargs (additional keyword arguments, *optional*):
-                Not supported by `MistralCommonTokenizer.from_pretrained`.
+                Not supported by `MistralCommonBackend.from_pretrained`.
                 Will raise an error if used.
         """
         if init_inputs:
-            raise ValueError("`init_inputs` are not supported by `MistralCommonTokenizer.from_pretrained`.")
+            raise ValueError("`init_inputs` are not supported by `MistralCommonBackend.from_pretrained`.")
 
         # Handle kwargs and AutoTokenizer/AutoProcessor case
         if kwargs and not set(kwargs.keys()).issubset(
             {"trust_remote_code", "_from_pipeline", "_commit_hash", "dtype", "_from_auto"}
         ):
-            raise ValueError(f"Some kwargs in {kwargs} are not supported by `MistralCommonTokenizer.from_pretrained`.")
+            raise ValueError(f"Some kwargs in {kwargs} are not supported by `MistralCommonBackend.from_pretrained`.")
 
         if not os.path.isdir(pretrained_model_name_or_path):
             tokenizer_path = download_tokenizer_from_hf_hub(
@@ -1832,7 +1849,7 @@ def save_pretrained(
 
 
         This method make sure the full tokenizer can then be re-loaded using the
-        [`~MistralCommonTokenizer.tokenization_mistral_common.from_pretrained`] class method.
+        [`~MistralCommonBackend.tokenization_mistral_common.from_pretrained`] class method.
 
         Args:
             save_directory (`str` or `os.PathLike`): The path to a directory where the tokenizer will be saved.
@@ -1847,7 +1864,7 @@ def save_pretrained(
             repo_id (`str`, *optional*): The name of the repository to which push to the Hub.
             private (`bool`, *optional*): Whether the model repository is private or not.
             kwargs (`Dict[str, Any]`, *optional*):
-                Not supported by `MistralCommonTokenizer.save_pretrained`.
+                Not supported by `MistralCommonBackend.save_pretrained`.
                 Will raise an error if used.
 
         Returns:
@@ -1855,7 +1872,7 @@ def save_pretrained(
         """
         if kwargs:
             raise ValueError(
-                f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.save_pretrained`."
+                f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.save_pretrained`."
             )
 
         save_directory = Path(save_directory)
diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_python.py
similarity index 55%
rename from src/transformers/tokenization_utils.py
rename to src/transformers/tokenization_python.py
index 3241b76fc843..d320cb5e9382 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_python.py
@@ -13,29 +13,22 @@
 # limitations under the License.
 """
 Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see
-tokenization_utils_fast.py
+tokenization_utils_tokenizers.py
 """
 
 import bisect
-import itertools
-import re
 import unicodedata
 from collections import OrderedDict
 from typing import Any, overload
 
 from .tokenization_utils_base import (
-    ENCODE_KWARGS_DOCSTRING,
-    ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
     INIT_TOKENIZER_DOCSTRING,
     AddedToken,
     BatchEncoding,
     EncodedInput,
-    EncodedInputPair,
     PreTokenizedInput,
-    PreTokenizedInputPair,
     PreTrainedTokenizerBase,
     TextInput,
-    TextInputPair,
     TruncationStrategy,
 )
 from .utils import PaddingStrategy, TensorType, add_end_docstrings, logging
@@ -404,7 +397,7 @@ def _insert_one_token_to_ordered_list(token_list: list[str], new_token: str):
 
 
 @add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
-class PreTrainedTokenizer(PreTrainedTokenizerBase):
+class PythonBackend(PreTrainedTokenizerBase):
     """
     Base class for all slow tokenizers.
 
@@ -430,29 +423,38 @@ def __init__(self, **kwargs):
         self._added_tokens_decoder.update(kwargs.pop("added_tokens_decoder", {}))
         self._added_tokens_encoder: dict[str, int] = {k.content: v for v, k in self._added_tokens_decoder.items()}
 
-        # 4 init the parent class
+        # 4. Token type ID configuration for dynamic mask building
+        # These can be overridden by subclasses to avoid overriding create_token_type_ids_from_sequences
+        self.token_type_ids_pattern = kwargs.pop("token_type_ids_pattern", "bert_style")  # "all_zeros" or "bert_style"
+        self.token_type_ids_include_special_tokens = kwargs.pop("token_type_ids_include_special_tokens", True)
+
+        # 5. Special tokens mask configuration
+        # Patterns: "none", "cls_sep", "eos", "bos", "bos_eos", "cls_double_sep", "prefix_suffix"
+        self.special_tokens_pattern = kwargs.pop("special_tokens_pattern", "cls_sep")
+
+        # 6. Set backend to "custom" if not already set (for direct PreTrainedTokenizer subclasses)
+        if "backend" not in kwargs:
+            kwargs["backend"] = "custom"
+
+        # 7. init the parent class
         super().__init__(**kwargs)
 
+        if self._added_tokens_decoder:
+            self._update_total_vocab_size()
+
         # 4. If some of the special tokens are not part of the vocab, we add them, at the end.
-        # the order of addition is the same as self.SPECIAL_TOKENS_ATTRIBUTES following `tokenizers`
+        # V5: the order of addition follows self.SPECIAL_TOKENS_ATTRIBUTES, then extra special tokens
+        # Note: _add_tokens will automatically skip tokens that are already in the base vocab
         self._add_tokens(
-            [token for token in self.all_special_tokens_extended if token not in self._added_tokens_encoder],
+            [token for token in self.all_special_tokens if token not in self._added_tokens_encoder],
             special_tokens=True,
         )
-
-        self._decode_use_source_tokenizer = False
+        self._update_total_vocab_size()
 
     @property
     def is_fast(self) -> bool:
         return False
 
-    @property
-    def vocab_size(self) -> int:
-        """
-        `int`: Size of the base vocabulary (without the added tokens).
-        """
-        raise NotImplementedError
-
     @property
     def added_tokens_encoder(self) -> dict[str, int]:
         """
@@ -576,7 +578,7 @@ def _add_tokens(self, new_tokens: list[str] | list[AddedToken], special_tokens:
                 token_index = current_vocab[token.content]
 
             if token.special and str(token) not in self.all_special_tokens:
-                self._special_tokens_map["additional_special_tokens"].append(token)
+                self._extra_special_tokens.append(token)
             # the setter automatically updates the reverse map
             self._added_tokens_decoder[token_index] = token
             self._added_tokens_encoder[token.content] = token_index
@@ -622,81 +624,56 @@ def tokenize(self, text: TextInput, **kwargs) -> list[str]:
         """
         Converts a string into a sequence of tokens, using the tokenizer.
 
-        Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
-        (BPE/SentencePieces/WordPieces). Takes care of added tokens.
-
         Args:
-            text (`str`):
-                The sequence to be encoded.
-            **kwargs (additional keyword arguments):
-                Passed along to the model-specific `prepare_for_tokenization` preprocessing method.
+            text: The sequence to be encoded.
+            **kwargs: Passed along to the model-specific `prepare_for_tokenization` preprocessing method.
 
         Returns:
-            `list[str]`: The list of tokens.
+            The list of tokens.
         """
         split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens)
-
         text, kwargs = self.prepare_for_tokenization(text, **kwargs)
 
-        if kwargs:
-            logger.warning(f"Keyword arguments {kwargs} not recognized.")
-
-        if hasattr(self, "do_lower_case") and self.do_lower_case:
-            # convert non-special tokens to lowercase. Might be super slow as well?
-            escaped_special_toks = [re.escape(s_tok) for s_tok in (self.all_special_tokens)]
-            escaped_special_toks += [
-                re.escape(s_tok.content)
-                for s_tok in (self._added_tokens_decoder.values())
-                if not s_tok.special and s_tok.normalized
-            ]
-            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
-            text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
-
         if split_special_tokens:
-            no_split_token = []
-            tokens = [text]
-        else:
-            no_split_token = self._added_tokens_encoder.keys()  # don't split on any of the added tokens
-            # "This is something  else"
-            tokens = self.tokens_trie.split(text)
+            # Don't split on any tokens - just tokenize directly
+            return self._tokenize(text)
+
+        # Split on added tokens
+        tokens = self.tokens_trie.split(text)
+        no_split_token = self._added_tokens_encoder.keys()
 
-        # ["This is something", "", "  else"]
+        # Handle added token properties (lstrip, rstrip, single_word)
         for i, token in enumerate(tokens):
             if token in no_split_token:
-                tok_extended = self._added_tokens_decoder.get(self._added_tokens_encoder[token], None)
+                tok_extended = self._added_tokens_decoder.get(self._added_tokens_encoder[token])
                 left = tokens[i - 1] if i > 0 else None
                 right = tokens[i + 1] if i < len(tokens) - 1 else None
+
                 if isinstance(tok_extended, AddedToken):
                     if tok_extended.rstrip and right:
-                        # A bit counter-intuitive but we strip the left of the string
-                        # since tok_extended.rstrip means the special token is eating all white spaces on its right
                         tokens[i + 1] = right.lstrip()
-                    # Strip white spaces on the left
                     if tok_extended.lstrip and left:
-                        tokens[i - 1] = left.rstrip()  # Opposite here
-                    if tok_extended.single_word and left and left[-1] != " ":
-                        tokens[i - 1] += token
-                        tokens[i] = ""
-                    elif tok_extended.single_word and right and right[0] != " ":
-                        tokens[i + 1] = token + tokens[i + 1]
-                        tokens[i] = ""
-                else:
-                    raise ValueError(
-                        f"{tok_extended} cannot be tokenized because it was not properly added"
-                        f" to the tokenizer. This means that it is not an `AddedToken` but a {type(tok_extended)}"
-                    )
-        # ["This is something", "", "else"]
-        tokenized_text = []
+                        tokens[i - 1] = left.rstrip()
+                    if tok_extended.single_word:
+                        if left and left[-1] != " ":
+                            tokens[i - 1] += token
+                            tokens[i] = ""
+                        elif right and right[0] != " ":
+                            tokens[i + 1] = token + tokens[i + 1]
+                            tokens[i] = ""
+
+        # Tokenize non-added tokens
+        result = []
+        all_special_tokens_set = set(self.all_special_tokens)
         for token in tokens:
-            # Need to skip eventual empty (fully stripped) tokens
             if not token:
                 continue
-            if token in no_split_token:
-                tokenized_text.append(token)
+            if token in no_split_token or token in all_special_tokens_set:
+                result.append(token)
             else:
-                tokenized_text.extend(self._tokenize(token))
-        # ["This", " is", " something", "", "else"]
-        return tokenized_text
+                result.extend(self._tokenize(token))
+
+        return result
 
     def _tokenize(self, text, **kwargs):
         """
@@ -707,34 +684,9 @@ def _tokenize(self, text, **kwargs):
         """
         raise NotImplementedError
 
-    def convert_tokens_to_ids(self, tokens: str | list[str]) -> int | list[int]:
-        """
-        Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
-        vocabulary.
-
-        Args:
-            tokens (`str` or `list[str]`): One or several token(s) to convert to token id(s).
-
-        Returns:
-            `int` or `list[int]`: The token id or list of token ids.
-        """
-        if tokens is None:
-            return None
-
-        if isinstance(tokens, str):
-            return self._convert_token_to_id_with_added_voc(tokens)
-
-        ids = []
-        for token in tokens:
-            ids.append(self._convert_token_to_id_with_added_voc(token))
-        return ids
-
     def _convert_token_to_id_with_added_voc(self, token):
-        if token is None:
-            return None
-
-        if token in self._added_tokens_encoder:
-            return self._added_tokens_encoder[token]
+        if token in self.added_tokens_encoder:
+            return self.added_tokens_encoder[token]
         return self._convert_token_to_id(token)
 
     def _convert_token_to_id(self, token):
@@ -757,45 +709,104 @@ def _encode_plus(
         return_attention_mask: bool | None = None,
         return_overflowing_tokens: bool = False,
         return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
         return_length: bool = False,
         verbose: bool = True,
         **kwargs,
     ) -> BatchEncoding:
+        # Detect batched inputs (list of sequences)
+        is_batched = isinstance(text, (list, tuple)) and (
+            (not text and not is_split_into_words)
+            or (text and is_split_into_words and isinstance(text[0], (list, tuple)))
+            or (text and not is_split_into_words and isinstance(text[0], (str, list, tuple)))
+        )
+
+        if is_batched:
+            if text_pair is not None:
+                if not isinstance(text_pair, (list, tuple)) or len(text_pair) != len(text):
+                    raise ValueError("If `text` is a batch, `text_pair` must also be a batch of the same length.")
+            pairs = text_pair if text_pair is not None else [None] * len(text)
+
+            batch_outputs = {}
+            for current_text, current_pair in zip(text, pairs):
+                # Handle tuples/lists as sequence pairs like ("text1", "text2")
+                # For is_split_into_words=True: only unpack if it's a tuple of exactly 2 sequences (pair)
+                # Otherwise, treat the list as a single pretokenized sequence
+                if (
+                    isinstance(current_text, (list, tuple))
+                    and current_text
+                    and not isinstance(current_text[0], int)
+                    and current_pair is None
+                ):
+                    # Check if this looks like a pair: tuple/list of length 2 where elements are strings or lists/tuples
+                    is_pair = (
+                        len(current_text) == 2
+                        and (isinstance(current_text[0], str) or isinstance(current_text[0], (list, tuple)))
+                        and (isinstance(current_text[1], str) or isinstance(current_text[1], (list, tuple)))
+                    )
+                    if is_pair:
+                        current_text, current_pair = current_text
+                    elif len(current_text) == 1:
+                        current_text = current_text[0]
+                    elif not is_split_into_words:
+                        # Only raise error for non-pretokenized input
+                        raise ValueError(f"Expected a pair of sequences, got {len(current_text)} sequences.")
+
+                current_output = self._encode_plus(
+                    text=current_text,
+                    text_pair=current_pair,
+                    add_special_tokens=add_special_tokens,
+                    padding_strategy=PaddingStrategy.DO_NOT_PAD,  # we pad in batch afterward
+                    truncation_strategy=truncation_strategy,
+                    max_length=max_length,
+                    stride=stride,
+                    is_split_into_words=is_split_into_words,
+                    pad_to_multiple_of=None,  # we pad in batch afterward
+                    padding_side=None,  # we pad in batch afterward
+                    return_tensors=None,  # We convert the whole batch to tensors at the end
+                    return_token_type_ids=return_token_type_ids,
+                    return_attention_mask=False,  # we pad in batch afterward
+                    return_overflowing_tokens=return_overflowing_tokens,
+                    return_special_tokens_mask=return_special_tokens_mask,
+                    return_length=return_length,
+                    verbose=verbose,
+                    **kwargs,
+                )
+                for key, value in current_output.items():
+                    batch_outputs.setdefault(key, []).append(value)
+
+            # Remove overflow-related keys before tensor conversion if return_tensors is set
+            # Slow tokenizers don't support returning these as tensors
+            if return_tensors and return_overflowing_tokens:
+                batch_outputs.pop("overflowing_tokens", None)
+                batch_outputs.pop("num_truncated_tokens", None)
+
+            batch_outputs = self.pad(
+                batch_outputs,
+                padding=padding_strategy.value,
+                max_length=max_length,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_attention_mask=return_attention_mask,
+            )
+
+            return BatchEncoding(batch_outputs, tensor_type=return_tensors)
+
+        # Single sequence handling
         def get_input_ids(text):
             if isinstance(text, str):
-                tokens = self.tokenize(text, **kwargs)
-                return self.convert_tokens_to_ids(tokens)
-            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
-                if is_split_into_words:
-                    tokens = list(
-                        itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
-                    )
-                    return self.convert_tokens_to_ids(tokens)
-                else:
+                # Normal case: tokenize string
+                return self.convert_tokens_to_ids(self.tokenize(text, **kwargs))
+            if isinstance(text, (list, tuple)) and text:
+                if isinstance(text[0], int):
+                    return text
+                # Pre-tokenized strings
+                if isinstance(text[0], str):
+                    if is_split_into_words:
+                        return self.convert_tokens_to_ids(
+                            [tok for word in text for tok in self.tokenize(word, **kwargs)]
+                        )
                     return self.convert_tokens_to_ids(text)
-            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
-                return text
-            else:
-                if is_split_into_words:
-                    raise ValueError(
-                        f"Input {text} is not valid. Should be a string or a list/tuple of strings when"
-                        " `is_split_into_words=True`."
-                    )
-                else:
-                    raise ValueError(
-                        f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of"
-                        " integers."
-                    )
-
-        if return_offsets_mapping:
-            raise NotImplementedError(
-                "return_offset_mapping is not available when using Python tokenizers. "
-                "To use this feature, change your tokenizer to one deriving from "
-                "transformers.PreTrainedTokenizerFast. "
-                "More information on available tokenizers at "
-                "https://github.com/huggingface/transformers/pull/2674"
-            )
+            raise ValueError(f"Input must be a string, list of strings, or list of ints, got: {type(text)}")
 
         first_ids = get_input_ids(text)
         second_ids = get_input_ids(text_pair) if text_pair is not None else None
@@ -820,165 +831,6 @@ def get_input_ids(text):
             verbose=verbose,
         )
 
-    def _batch_encode_plus(
-        self,
-        batch_text_or_text_pairs: list[TextInput]
-        | list[TextInputPair]
-        | list[PreTokenizedInput]
-        | list[PreTokenizedInputPair]
-        | list[EncodedInput]
-        | list[EncodedInputPair],
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: int | None = None,
-        stride: int = 0,
-        is_split_into_words: bool = False,
-        pad_to_multiple_of: int | None = None,
-        padding_side: str | None = None,
-        return_tensors: str | TensorType | None = None,
-        return_token_type_ids: bool | None = None,
-        return_attention_mask: bool | None = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        split_special_tokens: bool = False,
-        **kwargs,
-    ) -> BatchEncoding:
-        def get_input_ids(text):
-            if isinstance(text, str):
-                tokens = self.tokenize(text, **kwargs)
-                return self.convert_tokens_to_ids(tokens)
-            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
-                if is_split_into_words:
-                    tokens = list(
-                        itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
-                    )
-                    return self.convert_tokens_to_ids(tokens)
-                else:
-                    return self.convert_tokens_to_ids(text)
-            elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
-                return text
-            else:
-                raise ValueError(
-                    "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
-                )
-
-        if return_offsets_mapping:
-            raise NotImplementedError(
-                "return_offset_mapping is not available when using Python tokenizers. "
-                "To use this feature, change your tokenizer to one deriving from "
-                "transformers.PreTrainedTokenizerFast."
-            )
-
-        input_ids = []
-        for ids_or_pair_ids in batch_text_or_text_pairs:
-            if (
-                not isinstance(ids_or_pair_ids, (list, tuple))
-                or is_split_into_words
-                and not isinstance(ids_or_pair_ids[0], (list, tuple))
-            ):
-                ids, pair_ids = ids_or_pair_ids, None
-            else:
-                ids, pair_ids = ids_or_pair_ids
-
-            first_ids = get_input_ids(ids)
-            second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
-            input_ids.append((first_ids, second_ids))
-
-        batch_outputs = self._batch_prepare_for_model(
-            input_ids,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_attention_mask=return_attention_mask,
-            return_token_type_ids=return_token_type_ids,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_length=return_length,
-            return_tensors=return_tensors,
-            verbose=verbose,
-            split_special_tokens=split_special_tokens,
-        )
-
-        return BatchEncoding(batch_outputs)
-
-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    def _batch_prepare_for_model(
-        self,
-        batch_ids_pairs: list[PreTokenizedInputPair | tuple[list[int], None]],
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: int | None = None,
-        stride: int = 0,
-        pad_to_multiple_of: int | None = None,
-        padding_side: str | None = None,
-        return_tensors: str | None = None,
-        return_token_type_ids: bool | None = None,
-        return_attention_mask: bool | None = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        split_special_tokens: bool = False,
-    ) -> BatchEncoding:
-        """
-        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
-        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
-        manages a moving window (with user defined stride) for overflowing tokens
-
-        Args:
-            batch_ids_pairs: list of tokenized input ids or input ids pairs
-        """
-
-        batch_outputs = {}
-        for first_ids, second_ids in batch_ids_pairs:
-            outputs = self.prepare_for_model(
-                first_ids,
-                second_ids,
-                add_special_tokens=add_special_tokens,
-                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
-                truncation=truncation_strategy.value,
-                max_length=max_length,
-                stride=stride,
-                pad_to_multiple_of=None,  # we pad in batch afterward
-                padding_side=None,  # we pad in batch afterward
-                return_attention_mask=False,  # we pad in batch afterward
-                return_token_type_ids=return_token_type_ids,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_length=return_length,
-                return_tensors=None,  # We convert the whole batch to tensors at the end
-                prepend_batch_axis=False,
-                verbose=verbose,
-                split_special_tokens=split_special_tokens,
-            )
-
-            for key, value in outputs.items():
-                if key not in batch_outputs:
-                    batch_outputs[key] = []
-                batch_outputs[key].append(value)
-
-        batch_outputs = self.pad(
-            batch_outputs,
-            padding=padding_strategy.value,
-            max_length=max_length,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_attention_mask=return_attention_mask,
-        )
-
-        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
-
-        return batch_outputs
-
     def prepare_for_tokenization(
         self, text: str, is_split_into_words: bool = False, **kwargs
     ) -> tuple[str, dict[str, Any]]:
@@ -1003,6 +855,79 @@ def prepare_for_tokenization(
         """
         return (text, kwargs)
 
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: list[int] | None = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequences by adding special tokens.
+
+        This method dynamically builds inputs based on the tokenizer's `special_tokens_pattern`:
+        - `"none"`: No special tokens
+        - `"cls_sep"`: [CLS] seq0 [SEP] or [CLS] seq0 [SEP] seq1 [SEP]
+        - `"eos"`: seq0 [EOS] or seq0 [EOS] seq1 [EOS]
+        - `"bos"`: [BOS] seq0 or [BOS] seq0 [BOS] seq1
+        - `"bos_eos"`: [BOS] seq0 [EOS] or [BOS] seq0 [EOS] seq1 [EOS]
+        - `"cls_double_sep"`: [CLS] seq0 [SEP] or [CLS] seq0 [SEP] [SEP] seq1 [SEP]
+        - `"prefix_suffix"`: ` seq0 [seq1] ` (custom prefix/suffix stored on the tokenizer)
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of input IDs with the appropriate special tokens.
+        """
+        if self.special_tokens_pattern == "cls_sep":
+            # [CLS] seq0 [SEP] or [CLS] seq0 [SEP] seq1 [SEP]
+            if token_ids_1 is None:
+                return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + token_ids_1 + [self.sep_token_id]
+
+        elif self.special_tokens_pattern == "eos":
+            # seq0 [EOS] or seq0 [EOS] seq1 [EOS]
+            if token_ids_1 is None:
+                return token_ids_0 + [self.eos_token_id]
+            return token_ids_0 + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
+
+        elif self.special_tokens_pattern == "bos":
+            # [BOS] seq0 or [BOS] seq0 [BOS] seq1
+            if token_ids_1 is None:
+                return [self.bos_token_id] + token_ids_0
+            return [self.bos_token_id] + token_ids_0 + [self.bos_token_id] + token_ids_1
+
+        elif self.special_tokens_pattern == "bos_eos":
+            # [BOS] seq0 [EOS] or [BOS] seq0 [EOS] seq1 [EOS]
+            if token_ids_1 is None:
+                return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
+            return [self.bos_token_id] + token_ids_0 + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
+
+        elif self.special_tokens_pattern == "cls_double_sep":
+            # [CLS] seq0 [SEP] or [CLS] seq0 [SEP] [SEP] seq1 [SEP]
+            if token_ids_1 is None:
+                return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+            return (
+                [self.cls_token_id]
+                + token_ids_0
+                + [self.sep_token_id, self.sep_token_id]
+                + token_ids_1
+                + [self.sep_token_id]
+            )
+
+        elif self.special_tokens_pattern == "prefix_suffix":
+            prefix_tokens = getattr(self, "prefix_tokens", [])
+            suffix_tokens = getattr(self, "suffix_tokens", [])
+            if token_ids_1 is None:
+                return prefix_tokens + token_ids_0 + suffix_tokens
+            return prefix_tokens + token_ids_0 + token_ids_1 + suffix_tokens
+
+        else:  # "none" or any other value
+            # No special tokens
+            if token_ids_1 is None:
+                return token_ids_0
+            return token_ids_0 + token_ids_1
+
     def get_special_tokens_mask(
         self, token_ids_0: list, token_ids_1: list | None = None, already_has_special_tokens: bool = False
     ) -> list[int]:
@@ -1010,6 +935,15 @@ def get_special_tokens_mask(
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
         special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
 
+        This method dynamically builds the special tokens mask based on the tokenizer's `special_tokens_pattern`:
+        - `"none"`: No special tokens (default, returns all 0s)
+        - `"cls_sep"`: [CLS] seq0 [SEP] or [CLS] seq0 [SEP] seq1 [SEP]
+        - `"eos"`: seq0 [EOS] or seq0 [EOS] seq1 [EOS]
+        - `"bos"`: [BOS] seq0 or [BOS] seq0 [BOS] seq1
+        - `"bos_eos"`: [BOS] seq0 [EOS] or [BOS] seq0 [EOS] seq1 [EOS]
+        - `"cls_double_sep"`: [CLS] seq0 [SEP] or [CLS] seq0 [SEP] [SEP] seq1 [SEP]
+        - `"prefix_suffix"`: ` seq0 [seq1] `
+
         Args:
             token_ids_0 (`list[int]`):
                 List of ids of the first sequence.
@@ -1031,7 +965,48 @@ def get_special_tokens_mask(
             return super().get_special_tokens_mask(
                 token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
             )
-        return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
+
+        if self.special_tokens_pattern == "cls_sep":
+            # [CLS] seq0 [SEP] or [CLS] seq0 [SEP] seq1 [SEP]
+            if token_ids_1 is None:
+                return [1] + ([0] * len(token_ids_0)) + [1]
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+
+        elif self.special_tokens_pattern == "eos":
+            # seq0 [EOS] or seq0 [EOS] seq1 [EOS]
+            if token_ids_1 is None:
+                return ([0] * len(token_ids_0)) + [1]
+            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+
+        elif self.special_tokens_pattern == "bos":
+            # [BOS] seq0 or [BOS] seq0 [BOS] seq1
+            if token_ids_1 is None:
+                return [1] + ([0] * len(token_ids_0))
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
+
+        elif self.special_tokens_pattern == "bos_eos":
+            # [BOS] seq0 [EOS] or [BOS] seq0 [EOS] seq1 [EOS]
+            if token_ids_1 is None:
+                return [1] + ([0] * len(token_ids_0)) + [1]
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+
+        elif self.special_tokens_pattern == "cls_double_sep":
+            # [CLS] seq0 [SEP] or [CLS] seq0 [SEP] [SEP] seq1 [SEP]
+            if token_ids_1 is None:
+                return [1] + ([0] * len(token_ids_0)) + [1]
+            return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+        elif self.special_tokens_pattern == "prefix_suffix":
+            prefix_len = len(getattr(self, "prefix_tokens", []))
+            suffix_len = len(getattr(self, "suffix_tokens", []))
+            mask = [1] * prefix_len + ([0] * len(token_ids_0))
+            if token_ids_1 is not None:
+                mask += [0] * len(token_ids_1)
+            mask += [1] * suffix_len
+            return mask
+
+        else:
+            return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
 
     @overload
     def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: ...
@@ -1054,19 +1029,22 @@ def convert_ids_to_tokens(self, ids: int | list[int], skip_special_tokens: bool
             `str` or `list[str]`: The decoded token(s).
         """
         if isinstance(ids, int):
-            if ids in self._added_tokens_decoder:
-                return self._added_tokens_decoder[ids].content
-            else:
-                return self._convert_id_to_token(ids)
+            return (
+                self._added_tokens_decoder[ids].content
+                if ids in self._added_tokens_decoder
+                else self._convert_id_to_token(ids)
+            )
+
         tokens = []
         for index in ids:
             index = int(index)
             if skip_special_tokens and index in self.all_special_ids:
                 continue
-            if index in self._added_tokens_decoder:
-                tokens.append(self._added_tokens_decoder[index].content)
-            else:
-                tokens.append(self._convert_id_to_token(index))
+            tokens.append(
+                self._added_tokens_decoder[index].content
+                if index in self._added_tokens_decoder
+                else self._convert_id_to_token(index)
+            )
         return tokens
 
     def _convert_id_to_token(self, index: int) -> str:
@@ -1080,52 +1058,343 @@ def _decode(
         token_ids: int | list[int],
         skip_special_tokens: bool = False,
         clean_up_tokenization_spaces: bool | None = None,
-        spaces_between_special_tokens: bool = True,
         **kwargs,
     ) -> str:
-        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
-
+        """Decode token ids to string."""
         filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
-        # If given is a single id, prevents splitting the string in upcoming loop
         if isinstance(filtered_tokens, str):
             filtered_tokens = [filtered_tokens]
 
-        legacy_added_tokens = set(self._added_tokens_encoder.keys()) - set(self.all_special_tokens) | {
-            token for token in self.additional_special_tokens if self.convert_tokens_to_ids(token) >= self.vocab_size
-        }
-        # To avoid mixing byte-level and unicode for byte-level BPT
-        # we need to build string separately for added tokens and byte-level tokens
-        # cf. https://github.com/huggingface/transformers/issues/1133
-        sub_texts = []
-        current_sub_text = []
-        # TODO @ArthurZ in version 5, special tokens should be handled in convert_tokens_to_string, while _convert_tokens_to_string
-        for token in filtered_tokens:
-            if skip_special_tokens and token in self.all_special_tokens:
-                continue
-            if token in legacy_added_tokens:
-                if current_sub_text:
-                    string = self.convert_tokens_to_string(current_sub_text)
-                    if len(string) > 0:
-                        sub_texts.append(string)
-                    current_sub_text = []
-                sub_texts.append(token)
-            else:
-                current_sub_text.append(token)
-        if current_sub_text:
-            sub_texts.append(self.convert_tokens_to_string(current_sub_text))
-
-        if spaces_between_special_tokens:
-            text = " ".join(sub_texts)
-        else:
-            text = "".join(sub_texts)
+        text = self.convert_tokens_to_string(filtered_tokens)
 
+        # Apply tokenizer-specific cleanup if available and requested
         clean_up_tokenization_spaces = (
             clean_up_tokenization_spaces
             if clean_up_tokenization_spaces is not None
             else self.clean_up_tokenization_spaces
         )
         if clean_up_tokenization_spaces:
-            clean_text = self.clean_up_tokenization(text)
-            return clean_text
+            # Call custom cleanup method if it exists (e.g., for CLVP's [SPACE] token replacement)
+            if hasattr(self, "clean_up_tokenization") and callable(self.clean_up_tokenization):
+                text = self.clean_up_tokenization(text)
+            else:
+                # Otherwise apply standard cleanup
+                text = (
+                    text.replace(" .", ".")
+                    .replace(" ?", "?")
+                    .replace(" !", "!")
+                    .replace(" ,", ",")
+                    .replace(" ' ", "'")
+                    .replace(" n't", "n't")
+                    .replace(" 'm", "'m")
+                    .replace(" 's", "'s")
+                    .replace(" 've", "'ve")
+                    .replace(" 're", "'re")
+                )
+
+        return text
+
+    def prepare_for_model(
+        self,
+        ids: list[int],
+        pair_ids: list[int] | None = None,
+        add_special_tokens: bool = True,
+        padding: bool | str | PaddingStrategy = False,
+        truncation: bool | str | TruncationStrategy = False,
+        max_length: int | None = None,
+        stride: int = 0,
+        pad_to_multiple_of: int | None = None,
+        padding_side: str | None = None,
+        return_tensors: str | TensorType | None = None,
+        return_token_type_ids: bool | None = None,
+        return_attention_mask: bool | None = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        prepend_batch_axis: bool = False,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Prepares a sequence of input ids so it can be used by the model. Adds special tokens, truncates, and pads.
+
+        Args:
+            ids: Tokenized input ids of the first sequence.
+            pair_ids: Tokenized input ids of the second sequence (optional).
+        """
+        # Get padding/truncation strategies
+        padding_strategy, truncation_strategy, max_length, _ = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        # Validation
+        if (
+            return_overflowing_tokens
+            and truncation_strategy == TruncationStrategy.LONGEST_FIRST
+            and pair_ids is not None
+        ):
+            raise ValueError(
+                "Not possible to return overflowing tokens for pair of sequences with the "
+                "`longest_first`. Please select another truncation strategy than `longest_first`, "
+                "for instance `only_second` or `only_first`."
+            )
+
+        # Defaults
+        if return_token_type_ids is None:
+            return_token_type_ids = "token_type_ids" in self.model_input_names
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+
+        # Truncation
+        pair = pair_ids is not None
+        num_special = self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0
+        total_len = len(ids) + len(pair_ids or []) + num_special
+
+        overflowing_tokens = []
+        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
+            ids, pair_ids, overflowing_tokens = self.truncate_sequences(
+                ids,
+                pair_ids=pair_ids,
+                num_tokens_to_remove=total_len - max_length,
+                truncation_strategy=truncation_strategy,
+                stride=stride,
+            )
+
+        # Add special tokens
+        if add_special_tokens:
+            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
+            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
+        else:
+            sequence = ids + (pair_ids if pair_ids else [])
+            token_type_ids = [0] * len(sequence)
+
+        # Build output
+        encoded_inputs = {"input_ids": sequence}
+        if return_token_type_ids:
+            encoded_inputs["token_type_ids"] = token_type_ids
+        if return_special_tokens_mask:
+            encoded_inputs["special_tokens_mask"] = (
+                self.get_special_tokens_mask(ids, pair_ids) if add_special_tokens else [0] * len(sequence)
+            )
+        if return_overflowing_tokens and not return_tensors and overflowing_tokens:
+            encoded_inputs["overflowing_tokens"] = overflowing_tokens
+            encoded_inputs["num_truncated_tokens"] = total_len - max_length if max_length else 0
+
+        # Check sequence length and warn if needed
+        self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)
+
+        # Pad
+        if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
+            encoded_inputs = self.pad(
+                encoded_inputs,
+                max_length=max_length,
+                padding=padding_strategy.value,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_attention_mask=return_attention_mask,
+            )
+
+        if return_length:
+            encoded_inputs["length"] = len(encoded_inputs["input_ids"])
+
+        return BatchEncoding(encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis)
+
+    def truncate_sequences(
+        self,
+        ids: list[int],
+        pair_ids: list[int] | None = None,
+        num_tokens_to_remove: int = 0,
+        truncation_strategy: str | TruncationStrategy = "longest_first",
+        stride: int = 0,
+    ) -> tuple[list[int], list[int], list[int]]:
+        """Truncates sequences according to the specified strategy."""
+        if num_tokens_to_remove <= 0:
+            return ids, pair_ids, []
+
+        if not isinstance(truncation_strategy, TruncationStrategy):
+            truncation_strategy = TruncationStrategy(truncation_strategy)
+
+        overflowing_tokens = []
+
+        # ONLY_FIRST or LONGEST_FIRST with single sequence
+        if truncation_strategy == TruncationStrategy.ONLY_FIRST or (
+            truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None
+        ):
+            window_len = min(len(ids), stride + num_tokens_to_remove)
+            if self.truncation_side == "left":
+                overflowing_tokens = ids[:window_len]
+                ids = ids[num_tokens_to_remove:]
+            else:
+                overflowing_tokens = ids[-window_len:]
+                ids = ids[:-num_tokens_to_remove]
+
+        # LONGEST_FIRST with pair
+        elif truncation_strategy == TruncationStrategy.LONGEST_FIRST:
+            logger.warning(
+                "Be aware, overflowing tokens are not returned for the setting you have chosen,"
+                f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' "
+                "truncation strategy. So the returned list will always be empty even if some "
+                "tokens have been removed."
+            )
+            len_ids, len_pair = len(ids), len(pair_ids) if pair_ids else 0
+            first_remove = min(abs(len_pair - len_ids), num_tokens_to_remove)
+            second_remove = num_tokens_to_remove - first_remove
+
+            if len_ids > len_pair:
+                ids_to_move = first_remove + second_remove // 2
+                pair_ids_to_move = second_remove - second_remove // 2
+            else:
+                ids_to_move = second_remove // 2
+                pair_ids_to_move = first_remove + second_remove - (second_remove // 2)
+
+            if self.truncation_side == "right":
+                ids = ids[:-ids_to_move] if ids_to_move > 0 else ids
+                pair_ids = pair_ids[:-pair_ids_to_move] if pair_ids and pair_ids_to_move > 0 else pair_ids
+            else:
+                ids = ids[ids_to_move:]
+                pair_ids = pair_ids[pair_ids_to_move:] if pair_ids else None
+
+        # ONLY_SECOND
+        elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids:
+            window_len = min(len(pair_ids), stride + num_tokens_to_remove)
+            if self.truncation_side == "right":
+                overflowing_tokens = pair_ids[-window_len:]
+                pair_ids = pair_ids[:-num_tokens_to_remove]
+            else:
+                overflowing_tokens = pair_ids[:window_len]
+                pair_ids = pair_ids[num_tokens_to_remove:]
+
+        return ids, pair_ids, overflowing_tokens
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: list[int] | None = None
+    ) -> list[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task.
+
+        This method dynamically builds the token type IDs based on the tokenizer's configuration attributes:
+        - `token_type_ids_pattern`: Pattern to use ("all_zeros" or "bert_style")
+        - `token_type_ids_include_special_tokens`: Whether to account for special tokens in length calculation
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: Token type IDs according to the configured pattern.
+
+        Examples:
+            ```python
+            # All zeros pattern (default, used by RoBERTa, BART, etc.)
+            tokenizer.token_type_ids_pattern = "all_zeros"
+            # Returns: [0, 0, 0, ...] for both sequences
+
+            # BERT-style pattern (first sequence gets 0s, second gets 1s)
+            tokenizer.token_type_ids_pattern = "bert_style"
+            # Returns: [0, 0, 0, ..., 1, 1, 1, ...] for sequence pairs
+            ```
+        """
+        # Calculate lengths - account for special tokens if configured
+        if self.token_type_ids_include_special_tokens:
+            # Build the full sequence to get accurate length
+            if token_ids_1 is None:
+                sequence = self.build_inputs_with_special_tokens(token_ids_0)
+                seq0_len = len(sequence)
+                seq1_len = 0
+            else:
+                full_sequence = self.build_inputs_with_special_tokens(token_ids_0, token_ids_1)
+                # Approximate split - this works for most tokenizers
+                # For more complex cases, subclasses should still override
+                seq0_with_special = self.build_inputs_with_special_tokens(token_ids_0)
+                seq0_len = len(seq0_with_special)
+                seq1_len = len(full_sequence) - seq0_len
+        else:
+            # Use raw token lengths
+            seq0_len = len(token_ids_0)
+            seq1_len = len(token_ids_1) if token_ids_1 is not None else 0
+
+        # Build token type IDs based on pattern
+        if self.special_tokens_pattern == "prefix_suffix":
+            total_len = len(getattr(self, "prefix_tokens", [])) + len(token_ids_0)
+            if token_ids_1 is not None:
+                total_len += len(token_ids_1)
+            total_len += len(getattr(self, "suffix_tokens", []))
+            return [0] * total_len
+
+        if self.token_type_ids_pattern == "bert_style" and token_ids_1 is not None:
+            # BERT-style: first sequence gets 0s, second sequence gets 1s
+            return [0] * seq0_len + [1] * seq1_len
         else:
-            return text
+            # All zeros pattern (default): everything gets 0s
+            return [0] * (seq0_len + seq1_len)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: str | None = None) -> tuple[str, ...]:
+        """
+        Default implementation for common vocabulary saving patterns.
+        Saves self.encoder/self.vocab as JSON, optionally with self.bpe_ranks as merges.
+        Returns empty tuple if no vocabulary exists.
+
+        Override this method if your tokenizer needs custom saving logic (e.g., SentencePiece models,
+        multiple vocabulary files, or special file formats).
+
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+            filename_prefix (`str`, *optional*):
+                An optional prefix to add to the named of the saved files.
+
+        Returns:
+            `tuple[str, ...]`: Paths to the files saved, or empty tuple if no files saved.
+        """
+        import json
+        import os
+
+        vocab_attr = getattr(self, "encoder", None) or getattr(self, "vocab", None)
+        if vocab_attr is None:
+            return ()
+
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return ()
+
+        vocab_files_names = getattr(self, "vocab_files_names", {})
+        prefix = f"{filename_prefix}-" if filename_prefix else ""
+
+        # Save vocabulary
+        vocab_file = os.path.join(save_directory, prefix + vocab_files_names.get("vocab_file", "vocab.json"))
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(vocab_attr, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        # Save BPE merges if present
+        bpe_ranks = getattr(self, "bpe_ranks", None)
+        if bpe_ranks is None:
+            return (vocab_file,)
+
+        merge_file = os.path.join(save_directory, prefix + vocab_files_names.get("merges_file", "merges.txt"))
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            if getattr(self, "add_bpe_version_header", False):
+                writer.write("#version: 0.2\n")
+
+            index = 0
+            for bpe_tokens, token_index in sorted(bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(" ".join(bpe_tokens) + "\n")
+                index += 1
+
+        return (vocab_file, merge_file)
+
+
+# Backward compatibility alias
+PreTrainedTokenizer = PythonBackend
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 23d502b8ca8c..007e73258f47 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1,3 +1,4 @@
+# base
 # coding=utf-8
 # Copyright 2020 The HuggingFace Inc. team.
 #
@@ -18,19 +19,16 @@
 of output with special method for the Fast tokenizers)
 """
 
-from __future__ import annotations
-
 import copy
 import json
 import os
 import re
 import warnings
-from collections import UserDict
+from collections import OrderedDict, UserDict
 from collections.abc import Callable, Mapping, Sequence, Sized
-from contextlib import contextmanager
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Literal, NamedTuple, Optional, Union, overload
+from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
 
 import numpy as np
 from huggingface_hub import create_repo, list_repo_files
@@ -62,7 +60,6 @@
     requires_backends,
     to_py_obj,
 )
-from .utils.chat_parsing_utils import recursive_parse
 from .utils.chat_template_utils import render_jinja_template
 from .utils.import_utils import PROTOBUF_IMPORT_ERROR
 
@@ -250,14 +247,6 @@ def n_sequences(self) -> Optional[int]:
         """
         return self._n_sequences
 
-    @property
-    def is_fast(self) -> bool:
-        """
-        `bool`: Indicate whether this [`BatchEncoding`] was generated from the result of a [`PreTrainedTokenizerFast`]
-        or not.
-        """
-        return self._encodings is not None
-
     def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
         """
         If the key is a string, returns the value of the dict associated to `key` ('input_ids', 'attention_mask',
@@ -300,6 +289,13 @@ def __setstate__(self, state):
     # Extended properties and methods only available for fast (Rust-based) tokenizers
     # provided by HuggingFace tokenizers library.
 
+    @property
+    def is_fast(self) -> bool:
+        """
+        TOOD: ita i will rm this `bool`: Whether or not this BatchEncoding was created by a fast tokenizer.
+        """
+        return self._encodings is not None
+
     @property
     def encodings(self) -> Optional[list[EncodingFast]]:
         """
@@ -350,30 +346,6 @@ def sequence_ids(self, batch_index: int = 0) -> list[Optional[int]]:
             )
         return self._encodings[batch_index].sequence_ids
 
-    def words(self, batch_index: int = 0) -> list[Optional[int]]:
-        """
-        Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
-
-        Args:
-            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
-
-        Returns:
-            `list[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the
-            tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word
-            (several tokens will be mapped to the same word index if they are parts of that word).
-        """
-        if not self._encodings:
-            raise ValueError(
-                "words() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`"
-                " class)."
-            )
-        warnings.warn(
-            "`BatchEncoding.words()` property is deprecated and should be replaced with the identical, "
-            "but more self-explanatory `BatchEncoding.word_ids()` property.",
-            FutureWarning,
-        )
-        return self.word_ids(batch_index)
-
     def word_ids(self, batch_index: int = 0) -> list[Optional[int]]:
         """
         Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
@@ -784,7 +756,7 @@ def as_tensor(value, dtype=None):
 
         return self
 
-    def to(self, device: Union[str, torch.device], *, non_blocking: bool = False) -> BatchEncoding:
+    def to(self, device: Union[str, "torch.device"], *, non_blocking: bool = False) -> "BatchEncoding":
         """
         Send all values to device by calling `v.to(device, non_blocking=non_blocking)` (PyTorch only).
 
@@ -810,13 +782,154 @@ def to(self, device: Union[str, torch.device], *, non_blocking: bool = False) ->
         return self
 
 
-class SpecialTokensMixin:
-    """
-    A mixin derived by [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] to handle specific behaviors related to
-    special tokens. In particular, this class hold the attributes which can be used to directly access these special
-    tokens in a model-independent manner and allow to set and update the special tokens.
+ENCODE_KWARGS_DOCSTRING = r"""
+            add_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to add special tokens when encoding the sequences. This will use the underlying
+                `PretrainedTokenizerBase.build_inputs_with_special_tokens` function, which defines which tokens are
+                automatically added to the input ids. This is useful if you want to add `bos` or `eos` tokens
+                automatically.
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Activates and controls padding. Accepts the following values:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence is provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
+                Activates and controls truncation. Accepts the following values:
+
+                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
+                  to the maximum acceptable input length for the model if that argument is not provided. This will
+                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
+                  sequences (or a batch of pairs) is provided.
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+                  greater than the model maximum admissible input size).
+            max_length (`int`, *optional*):
+                Controls the maximum length to use by one of the truncation/padding parameters.
+
+                If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
+                is required by one of the truncation/padding parameters. If the model has no specific maximum input
+                length (like XLNet) truncation/padding to a maximum length will be deactivated.
+            stride (`int`, *optional*, defaults to 0):
+                If set to a number along with `max_length`, the overflowing tokens returned when
+                `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
+                returned to provide some overlap between truncated and overflowing sequences. The value of this
+                argument defines the number of overlapping tokens.
+            is_split_into_words (`bool`, *optional*, defaults to `False`):
+                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
+                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
+                which it will tokenize. This is useful for NER or token classification.
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value. Requires `padding` to be activated.
+                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta).
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+"""
+
+ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
+            return_token_type_ids (`bool`, *optional*):
+                Whether to return token type IDs. If left to the default, will return the token type IDs according to
+                the specific tokenizer's default, defined by the `return_outputs` attribute.
+
+                [What are token type IDs?](../glossary#token-type-ids)
+            return_attention_mask (`bool`, *optional*):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific tokenizer's default, defined by the `return_outputs` attribute.
+
+                [What are attention masks?](../glossary#attention-mask)
+            return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
+                of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead
+                of returning overflowing tokens.
+            return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
+                Whether or not to return special tokens mask information.
+            return_offsets_mapping (`bool`, *optional*, defaults to `False`):
+                Whether or not to return `(char_start, char_end)` for each token.
+
+                This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using
+                Python's tokenizer, this method will raise `NotImplementedError`.
+            return_length  (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the lengths of the encoded inputs.
+            verbose (`bool`, *optional*, defaults to `True`):
+                Whether or not to print more information and warnings.
+            **kwargs: passed to the `self.tokenize()` method
+
+        Return:
+            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model.
+
+              [What are input IDs?](../glossary#input-ids)
+
+            - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True` or
+              if *"token_type_ids"* is in `self.model_input_names`).
+
+              [What are token type IDs?](../glossary#token-type-ids)
+
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
+
+              [What are attention masks?](../glossary#attention-mask)
+
+            - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
+              `return_overflowing_tokens=True`).
+            - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
+              `return_overflowing_tokens=True`).
+            - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
+              regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
+            - **length** -- The length of the inputs (when `return_length=True`)
+"""
+
+
+INIT_TOKENIZER_DOCSTRING = r"""
+    Class attributes (overridden by derived classes)
+
+        - **vocab_files_names** (`dict[str, str]`) -- A dictionary with, as keys, the `__init__` keyword name of each
+          vocabulary file required by the model, and as associated values, the filename for saving the associated file
+          (string).
+        - **pretrained_vocab_files_map** (`dict[str, dict[str, str]]`) -- A dictionary of dictionaries, with the
+          high-level keys being the `__init__` keyword name of each vocabulary file required by the model, the
+          low-level being the `short-cut-names` of the pretrained models with, as associated values, the `url` to the
+          associated pretrained vocabulary file.
+        - **model_input_names** (`list[str]`) -- A list of inputs expected in the forward pass of the model.
+        - **padding_side** (`str`) -- The default value for the side on which the model should have padding applied.
+          Should be `'right'` or `'left'`.
+        - **truncation_side** (`str`) -- The default value for the side on which the model should have truncation
+          applied. Should be `'right'` or `'left'`.
 
     Args:
+        model_max_length (`int`, *optional*):
+            The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is
+            loaded with [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`], this will be set to the
+            value stored for the associated model in `max_model_input_sizes` (see above). If no value is provided, will
+            default to VERY_LARGE_INTEGER (`int(1e30)`).
+        padding_side (`str`, *optional*):
+            The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+            Default value is picked from the class attribute of the same name.
+        truncation_side (`str`, *optional*):
+            The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
+            Default value is picked from the class attribute of the same name.
+        chat_template (`str`, *optional*):
+            A Jinja template string that will be used to format lists of chat messages. See
+            https://huggingface.co/docs/transformers/chat_templating for a full description.
+        model_input_names (`list[string]`, *optional*):
+            The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
+            `"attention_mask"`). Default value is picked from the class attribute of the same name.
         bos_token (`str` or `tokenizers.AddedToken`, *optional*):
             A special token representing the beginning of a sentence.
         eos_token (`str` or `tokenizers.AddedToken`, *optional*):
@@ -832,12 +945,38 @@ class SpecialTokensMixin:
             A special token representing the class of the input (used by BERT for instance).
         mask_token (`str` or `tokenizers.AddedToken`, *optional*):
             A special token representing a masked token (used by masked-language modeling pretraining objectives, like
-            BERT).
-        additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
-            A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
-            skipped when decoding if `skip_special_tokens` is set to `True`.
+            BERT). Will be associated to `self.mask_token` and `self.mask_token_id`.
+        extra_special_tokens (list of `str` or `tokenizers.AddedToken`, *optional*):
+            A list of extra model-specific special tokens. Add them here to ensure they are skipped when decoding with
+            `skip_special_tokens` is set to True. If they are not part of the vocabulary, they will be added at the end
+            of the vocabulary.
+        split_special_tokens (`bool`, *optional*, defaults to `False`):
+            Whether or not the special tokens should be split during the tokenization process. Passing will affect the
+            internal state of the tokenizer. The default behavior is to not split special tokens. This means that if
+            `` is the `bos_token`, then `tokenizer.tokenize("") = ['`]. Otherwise, if
+            `split_special_tokens=True`, then `tokenizer.tokenize("")` will be give `['<','s', '>']`.
+"""
+
+
+@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
+class PreTrainedTokenizerBase(PushToHubMixin):
+    """
+    Base class for all tokenizer backends.
     """
 
+    vocab_files_names: dict[str, str] = {}
+    pretrained_vocab_files_map: dict[str, dict[str, str]] = {}
+    _auto_class: Optional[str] = None
+
+    # first name has to correspond to main model input name
+    # to make sure `tokenizer.pad(...)` works correctly
+    model_input_names: list[str] = ["input_ids", "token_type_ids", "attention_mask"]
+    padding_side: str = "right"
+    truncation_side: str = "right"
+    slow_tokenizer_class = None
+
+    # Special tokens support (moved from SpecialTokensMixin)
+    # V5: Clean separation of named special tokens from extra special tokens
     SPECIAL_TOKENS_ATTRIBUTES = [
         "bos_token",
         "eos_token",
@@ -846,82 +985,161 @@ class SpecialTokensMixin:
         "pad_token",
         "cls_token",
         "mask_token",
-        "additional_special_tokens",
     ]
 
-    def __init__(self, verbose=False, **kwargs):
-        self._pad_token_type_id = 0
-        self.verbose = verbose
-        self._special_tokens_map = dict.fromkeys(self.SPECIAL_TOKENS_ATTRIBUTES)
-        self._special_tokens_map["additional_special_tokens"] = []  # for BC where it defaults to empty list
+    def __init__(self, **kwargs):
+        self.init_inputs = ()
+        for key in kwargs:
+            if hasattr(self, key) and callable(getattr(self, key)):
+                raise AttributeError(f"{key} conflicts with the method {key} in {self.__class__.__name__}")
 
-        # We directly set the hidden value to allow initialization with special tokens
-        # which are not yet in the vocabulary. Necessary for serialization/de-serialization
-        # TODO clean this up at some point (probably by switching to fast tokenizers)
+        self.init_kwargs = copy.deepcopy(kwargs)
+        self.name_or_path = kwargs.pop("name_or_path", "")
+        self._processor_class = kwargs.pop("processor_class", None)
+        # Store additional_special_tokens in init_kwargs before conversion for backward compatibility
+        additional_special_tokens_value = kwargs.pop("additional_special_tokens", None)
+        if "additional_special_tokens" not in self.init_kwargs:
+            self.init_kwargs["additional_special_tokens"] = additional_special_tokens_value
+        kwargs.setdefault("extra_special_tokens", additional_special_tokens_value)
 
-        for key, value in kwargs.items():
-            if value is None:
-                continue
+        self._pad_token_type_id = 0
+        self.verbose = kwargs.pop("verbose", False)
+
+        # V5: Separate storage for named special tokens and extra special tokens
+        self._special_tokens_map = dict.fromkeys(self.SPECIAL_TOKENS_ATTRIBUTES)
+        self._extra_special_tokens = []  # List of extra model-specific special tokens
+
+        # V5: track both explicit and auto-detected model-specific tokens
+        explicit_model_specific_tokens = kwargs.pop("model_specific_special_tokens", None)
+        if explicit_model_specific_tokens is None:
+            explicit_model_specific_tokens = {}
+        elif not isinstance(explicit_model_specific_tokens, dict):
+            raise TypeError("model_specific_special_tokens must be a dictionary of token name to token value")
+        auto_model_specific_tokens = {}
+
+        # Directly set hidden values to allow init with tokens not yet in vocab
+        for key in list(kwargs.keys()):
             if key in self.SPECIAL_TOKENS_ATTRIBUTES:
-                if key == "additional_special_tokens":
-                    assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
-                    assert all(isinstance(t, (str, AddedToken)) for t in value), (
-                        "One of the tokens is not a string or an AddedToken"
-                    )
-                    setattr(self, key, value)
-                elif isinstance(value, (str, AddedToken)):
-                    setattr(self, key, value)
+                value = kwargs.pop(key)
+                if value is None:
+                    continue
+                if isinstance(value, (str, AddedToken)):
+                    self._special_tokens_map[key] = value
                 else:
                     raise TypeError(f"Special token {key} has to be either str or AddedToken but got: {type(value)}")
+            elif key == "extra_special_tokens":
+                # V5: Support extra_special_tokens in __init__
+                value = kwargs.pop(key)
+                if value is None:
+                    continue
+                # If dict: treat as model specific named special tokens (attributes)
+                if isinstance(value, dict):
+                    self._set_model_specific_special_tokens(special_tokens=value)
+                else:
+                    if not isinstance(value, (list, tuple)) or not all(
+                        isinstance(t, (str, AddedToken)) for t in value
+                    ):
+                        raise TypeError(
+                            "extra_special_tokens must be a list/tuple of str or AddedToken, or a dict mapping names to tokens"
+                        )
+                    self._extra_special_tokens = list(value)
+            elif (
+                key.endswith("_token")
+                and key not in self.SPECIAL_TOKENS_ATTRIBUTES
+                and isinstance(kwargs[key], (str, AddedToken))
+            ):
+                value = kwargs.pop(key)
+                if value is None:
+                    continue
+                auto_model_specific_tokens[key] = value
 
-    def sanitize_special_tokens(self) -> int:
-        """
-        The `sanitize_special_tokens` is now deprecated kept for backward compatibility and will be removed in
-        transformers v5.
-        """
-        logger.warning_once("The `sanitize_special_tokens` will be removed in transformers v5.")
-        return self.add_tokens(self.all_special_tokens_extended, special_tokens=True)
+        # For backward compatibility we fallback to set model_max_length from max_len if provided
+        model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
+        self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER
 
-    def add_special_tokens(
-        self,
-        special_tokens_dict: dict[str, Union[str, AddedToken, Sequence[Union[str, AddedToken]]]],
-        replace_additional_special_tokens=True,
-    ) -> int:
-        """
-        Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If
-        special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
-        current vocabulary).
+        self.padding_side = kwargs.pop("padding_side", self.padding_side)
+        if self.padding_side not in ["right", "left"]:
+            raise ValueError(
+                f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
+            )
 
-        When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of the
-        model so that its embedding matrix matches the tokenizer.
+        self.truncation_side = kwargs.pop("truncation_side", self.truncation_side)
+        if self.truncation_side not in ["right", "left"]:
+            raise ValueError(
+                f"Truncation side should be selected between 'right' and 'left', current value: {self.truncation_side}"
+            )
 
-        In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.
+        self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
 
-        Using `add_special_tokens` will ensure your special tokens can be used in several ways:
+        # By default, clean up tokenization spaces for both fast and slow tokenizers
+        self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", False)
 
-        - Special tokens can be skipped when decoding using `skip_special_tokens = True`.
-        - Special tokens are carefully handled by the tokenizer (they are never split), similar to `AddedTokens`.
-        - You can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This
-          makes it easy to develop model-agnostic training and fine-tuning scripts.
+        # By default, do not split special tokens for both fast and slow tokenizers
+        self.split_special_tokens = kwargs.pop("split_special_tokens", False)
 
-        When possible, special tokens are already registered for provided pretrained models (for instance
-        [`BertTokenizer`] `cls_token` is already registered to be `'[CLS]'` and XLM's one is also registered to be
+        self._in_target_context_manager = False
+
+        self.chat_template = kwargs.pop("chat_template", None)
+        if isinstance(self.chat_template, (list, tuple)):
+            # Chat templates are stored as lists of dicts with fixed key names,
+            # we reconstruct that into a single dict while loading them.
+            self.chat_template = {template["name"]: template["template"] for template in self.chat_template}
+
+        model_specific_tokens = {**auto_model_specific_tokens, **explicit_model_specific_tokens}
+        if model_specific_tokens:
+            self._set_model_specific_special_tokens(special_tokens=model_specific_tokens)
+
+        self.deprecation_warnings = {}
+
+        # Backend information (V5: tracking which backend and files were used)
+        self.backend = kwargs.pop("backend", None)
+        self.files_loaded = kwargs.pop("files_loaded", [])
+
+    def _set_processor_class(self, processor_class: str):
+        """Sets processor class so it can be serialized in `tokenizer_config.json`."""
+        self._processor_class = processor_class
+
+    # ---- Special tokens API (moved from SpecialTokensMixin) ----
+    def add_special_tokens(
+        self,
+        special_tokens_dict: dict[str, Union[str, AddedToken, Sequence[Union[str, AddedToken]]]],
+        replace_extra_special_tokens=True,
+    ) -> int:
+        """
+        Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If
+        special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
+        current vocabulary).
+
+        When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of the
+        model so that its embedding matrix matches the tokenizer.
+
+        In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.
+
+        Using `add_special_tokens` will ensure your special tokens can be used in several ways:
+
+        - Special tokens can be skipped when decoding using `skip_special_tokens = True`.
+        - Special tokens are carefully handled by the tokenizer (they are never split), similar to `AddedTokens`.
+        - You can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This
+          makes it easy to develop model-agnostic training and fine-tuning scripts.
+
+        When possible, special tokens are already registered for provided pretrained models (for instance
+        [`BertTokenizer`] `cls_token` is already registered to be `'[CLS]'` and XLM's one is also registered to be
         `''`).
 
         Args:
             special_tokens_dict (dictionary *str* to *str*, `tokenizers.AddedToken`, or `Sequence[Union[str, AddedToken]]`):
                 Keys should be in the list of predefined special attributes: [`bos_token`, `eos_token`, `unk_token`,
-                `sep_token`, `pad_token`, `cls_token`, `mask_token`, `additional_special_tokens`].
+                `sep_token`, `pad_token`, `cls_token`, `mask_token`, `extra_special_tokens`].
 
                 Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
                 assign the index of the `unk_token` to them).
-            replace_additional_special_tokens (`bool`, *optional*, defaults to `True`):
-                If `True`, the existing list of additional special tokens will be replaced by the list provided in
-                `special_tokens_dict`. Otherwise, `self._special_tokens_map["additional_special_tokens"]` is just extended. In the former
+            replace_extra_special_tokens (`bool`, *optional*, defaults to `True`):
+                If `True`, the existing list of extra special tokens will be replaced by the list provided in
+                `special_tokens_dict`. Otherwise, `extra_special_tokens` will be extended. In the former
                 case, the tokens will NOT be removed from the tokenizer's full vocabulary - they are only being flagged
                 as non-special tokens. Remember, this only affects which tokens are skipped during decoding, not the
                 `added_tokens_encoder` and `added_tokens_decoder`. This means that the previous
-                `additional_special_tokens` are still added tokens, and will not be split by the model.
+                `extra_special_tokens` are still added tokens, and will not be split by the model.
 
         Returns:
             `int`: Number of tokens added to the vocabulary.
@@ -945,60 +1163,56 @@ def add_special_tokens(
         if not special_tokens_dict:
             return 0
 
-        added_tokens = []
+        # V5: Allowed keys are SPECIAL_TOKENS_ATTRIBUTES + "extra_special_tokens"
+        # Backward compatibility: convert "additional_special_tokens" to "extra_special_tokens"
+        special_tokens_dict = dict(special_tokens_dict)
+        if "additional_special_tokens" in special_tokens_dict and "extra_special_tokens" not in special_tokens_dict:
+            special_tokens_dict["extra_special_tokens"] = special_tokens_dict.pop("additional_special_tokens")
+
+        allowed_keys = set(self.SPECIAL_TOKENS_ATTRIBUTES) | {"extra_special_tokens"}
+        tokens_to_add = []
         for key, value in special_tokens_dict.items():
-            assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token"
+            if key not in allowed_keys:
+                raise ValueError(f"Key {key} is not a valid special token. Valid keys are: {allowed_keys}")
 
             if self.verbose:
                 logger.info(f"Assigning {value} to the {key} key of the tokenizer")
 
-            if key == "additional_special_tokens":
-                assert isinstance(value, (list, tuple)) and all(isinstance(t, (str, AddedToken)) for t in value), (
-                    f"Tokens {value} for key {key} should all be str or AddedToken instances"
-                )
-
-                to_add = []
-                for token in value:
-                    if isinstance(token, str):
-                        # for legacy purpose we default to stripping. `test_add_tokens_tokenizer` depends on this
-                        token = AddedToken(token, rstrip=False, lstrip=False, normalized=False, special=True)
-                    if not replace_additional_special_tokens and str(token) in self.additional_special_tokens:
-                        continue
-                    to_add.append(token)
-                if replace_additional_special_tokens and len(to_add) > 0:
-                    setattr(self, key, list(to_add))
+            if key == "extra_special_tokens":
+                if not isinstance(value, (list, tuple)) or not all(isinstance(t, (str, AddedToken)) for t in value):
+                    raise ValueError(f"Tokens {value} for key {key} should all be str or AddedToken instances")
+                new_tokens = [
+                    (
+                        AddedToken(t, rstrip=False, lstrip=False, normalized=False, special=True)
+                        if isinstance(t, str)
+                        else t
+                    )
+                    for t in value
+                    if replace_extra_special_tokens or str(t) not in self.extra_special_tokens
+                ]
+                if replace_extra_special_tokens and new_tokens:
+                    self._extra_special_tokens = list(new_tokens)
                 else:
-                    self._special_tokens_map["additional_special_tokens"].extend(to_add)
-                added_tokens += to_add
-
+                    self._extra_special_tokens.extend(new_tokens)
+                tokens_to_add.extend(new_tokens)
             else:
                 if not isinstance(value, (str, AddedToken)):
                     raise ValueError(f"Token {value} for key {key} should be a str or an AddedToken instance")
-                if isinstance(value, (str)):
-                    # for legacy purpose we default to stripping. `False` depends on this
+                if isinstance(value, str):
                     value = AddedToken(value, rstrip=False, lstrip=False, normalized=False, special=True)
-                if isinstance(value, AddedToken):
-                    setattr(self, key, value)
-                if value not in added_tokens:
-                    added_tokens.append(value)
+                setattr(self, key, value)
+                tokens_to_add.append(value)
 
-        # if we are adding tokens that were not part of the vocab, we ought to add them
-        added_tokens = self.add_tokens(added_tokens, special_tokens=True)
-        return added_tokens
+        return self.add_tokens(tokens_to_add, special_tokens=True)
 
     def add_tokens(
         self, new_tokens: Union[str, AddedToken, Sequence[Union[str, AddedToken]]], special_tokens: bool = False
     ) -> int:
         """
-        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
-        it with indices starting from length of the current vocabulary and will be isolated before the tokenization
-        algorithm is applied. Added tokens and tokens from the vocabulary of the tokenization algorithm are therefore
-        not treated in the same way.
+        #TODO remove this from here! PreTrainedTOkeniuzerBase should be agnostic of AddedToken.
 
-        Note, when adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix
-        of the model so that its embedding matrix matches the tokenizer.
-
-        In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.
+        Add a list of new tokens. If the new tokens are not in the vocabulary, they are added to the end. Added tokens and
+        tokens from the vocabulary of the tokenization algorithm are therefore not treated in the same way.
 
         Args:
             new_tokens (`str`, `tokenizers.AddedToken` or a sequence of *str* or `tokenizers.AddedToken`):
@@ -1007,9 +1221,7 @@ def add_tokens(
                 whether this token should strip all potential whitespaces on the left side, whether this token should
                 strip all potential whitespaces on the right side, etc.
             special_tokens (`bool`, *optional*, defaults to `False`):
-                Can be used to specify if the token is a special token. This mostly change the normalization behavior
-                (special tokens like CLS or [MASK] are usually not lower-cased for instance).
-
+                Specifies if the token is special. This mostly changes the normalization behavior
                 See details for `tokenizers.AddedToken` in HuggingFace tokenizers library.
 
         Returns:
@@ -1032,7 +1244,6 @@ def add_tokens(
 
         if not isinstance(new_tokens, (list, tuple)):
             new_tokens = [new_tokens]
-
         return self._add_tokens(new_tokens, special_tokens=special_tokens)
 
     def _add_tokens(self, new_tokens: Union[list[str], list[AddedToken]], special_tokens: bool = False) -> int:
@@ -1040,9 +1251,6 @@ def _add_tokens(self, new_tokens: Union[list[str], list[AddedToken]], special_to
 
     @property
     def pad_token_type_id(self) -> int:
-        """
-        `int`: Id of the padding token type in the vocabulary.
-        """
         return self._pad_token_type_id
 
     def __setattr__(self, key, value):
@@ -1051,21 +1259,36 @@ def __setattr__(self, key, value):
         if key_is_special_id:
             key_without_id = key[:-3] if not key.endswith("_ids") else key[:-4]
 
-        if self.__dict__.get("_special_tokens_map", None) is not None and any(
-            name in self.__dict__["_special_tokens_map"] for name in [key, key_without_id]
+        # Check if this is a named special token
+        if (
+            self.__dict__.get("_special_tokens_map", None) is not None
+            and key_without_id in self.SPECIAL_TOKENS_ATTRIBUTES
         ):
             if key_is_special_id:
                 if value is not None:
-                    value = (
-                        self.convert_ids_to_tokens(value)
-                        if key != "additional_special_tokens"
-                        else [self.convert_ids_to_tokens(val) for val in value]
-                    )
+                    value = self.convert_ids_to_tokens(value)
                 key = key_without_id
 
-            if key != "additional_special_tokens" and not isinstance(value, (str, AddedToken)) and value is not None:
+            if not isinstance(value, (str, AddedToken)) and value is not None:
                 raise ValueError(f"Cannot set a non-string value as the {key}")
             self._special_tokens_map[key] = value
+        # Check if this is extra_special_tokens or extra_special_tokens_ids
+        elif self.__dict__.get("_extra_special_tokens", None) is not None and key_without_id == "extra_special_tokens":
+            if key_is_special_id:
+                if value is not None:
+                    value = [self.convert_ids_to_tokens(val) for val in value]
+                key = key_without_id
+
+            if key == "extra_special_tokens":
+                if value is None:
+                    self._extra_special_tokens = []
+                elif isinstance(value, dict):
+                    # Dict is treated as model-specific special tokens (such as multimodal tokens)
+                    self._set_model_specific_special_tokens(special_tokens=value)
+                elif isinstance(value, (list, tuple)):
+                    self._extra_special_tokens = list(value)
+                else:
+                    raise ValueError(f"extra_special_tokens must be a list, tuple, or dict, got {type(value)}")
         else:
             super().__setattr__(key, value)
 
@@ -1075,86 +1298,119 @@ def __getattr__(self, key):
         if key_is_special_id:
             key_without_id = key[:-3] if not key.endswith("_ids") else key[:-4]
 
-        if self.__dict__.get("_special_tokens_map", None) is not None and any(
-            name in self.__dict__["_special_tokens_map"] for name in [key, key_without_id]
+        # Check if this is a named special token
+        if (
+            self.__dict__.get("_special_tokens_map", None) is not None
+            and key_without_id in self.SPECIAL_TOKENS_ATTRIBUTES
         ):
             _special_tokens_map = self.__dict__["_special_tokens_map"]
             if not key_is_special_id:
-                if _special_tokens_map[key] is None:
+                if _special_tokens_map[key_without_id] is None:
                     if self.verbose:
                         logger.error(f"Using {key}, but it is not set yet.")
                     return None
-                value = _special_tokens_map[key]
-                return str(value) if key != "additional_special_tokens" else [str(tok) for tok in value]
+                value = _special_tokens_map[key_without_id]
+                return str(value)
             else:
                 attr_as_tokens = getattr(self, key_without_id)
                 return self.convert_tokens_to_ids(attr_as_tokens) if attr_as_tokens is not None else None
 
+        # Check if this is extra_special_tokens or extra_special_tokens_ids
+        elif key_without_id == "extra_special_tokens":
+            if self.__dict__.get("_extra_special_tokens", None) is not None:
+                if not key_is_special_id:
+                    return [str(tok) for tok in self.__dict__["_extra_special_tokens"]]
+                else:
+                    # extra_special_tokens_ids
+                    tokens = self.__dict__["_extra_special_tokens"]
+                    return self.convert_tokens_to_ids([str(tok) for tok in tokens]) if tokens else []
+
         if key not in self.__dict__:
             raise AttributeError(f"{self.__class__.__name__} has no attribute {key}")
         else:
             return super().__getattr__(key)
 
-    @property
-    def special_tokens_map(self) -> dict[str, Union[str, list[str]]]:
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
         """
-        `dict[str, Union[str, list[str]]]`: A dictionary mapping special token class attributes (`cls_token`,
-        `unk_token`, etc.) to their values (`''`, `''`, etc.).
+        Retrieve sequence ids from a token list that has no special tokens added.
 
-        Convert potential tokens of `tokenizers.AddedToken` type to string.
-        """
-        set_attr = {}
-        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
-            attr_value = getattr(self, attr)
-            if attr_value:
-                set_attr[attr] = attr_value
-        return set_attr
+        For fast tokenizers, data collators call this with `already_has_special_tokens=True` to build a mask over an
+        already-formatted sequence. In that case, we compute the mask by checking membership in `all_special_ids`.
 
-    @property
-    def special_tokens_map_extended(self) -> dict[str, Union[str, AddedToken, list[Union[str, AddedToken]]]]:
-        """
-        `dict[str, Union[str, tokenizers.AddedToken, list[Union[str, tokenizers.AddedToken]]]]`: A dictionary mapping
-        special token class attributes (`cls_token`, `unk_token`, etc.) to their values (`''`, `''`, etc.).
+        Args:
+            token_ids_0: List of IDs for the (possibly already formatted) sequence.
+            token_ids_1: Unused when `already_has_special_tokens=True`. Must be None in that case.
+            already_has_special_tokens: Whether the sequence is already formatted with special tokens.
 
-        Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how
-        special tokens are tokenized.
+        Returns:
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
-        set_attr = {}
-        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
-            attr_value = self._special_tokens_map[attr]
-            if attr_value:
-                set_attr[attr] = attr_value
-        return set_attr
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of ids is already formatted "
+                    "with special tokens for the model."
+                )
+            special_ids = set(self.all_special_ids)
+            return [1 if int(tid) in special_ids else 0 for tid in token_ids_0]
+
+        # Default base implementation for non-formatted sequences is not provided here.
+        # Concrete tokenizer classes should override this for their specific formatting rules.
+        raise NotImplementedError(
+            f"{self.__class__.__name__} does not implement get_special_tokens_mask for non-formatted sequences"
+        )
 
     @property
-    def all_special_tokens_extended(self) -> list[Union[str, AddedToken]]:
+    def special_tokens_map(self) -> dict[str, str]:
         """
-        `list[Union[str, tokenizers.AddedToken]]`: All the special tokens (`''`, `''`, etc.), the order has
-        nothing to do with the index of each tokens. If you want to know the correct indices, check
-        `self.added_tokens_encoder`. We can't create an order anymore as the keys are `AddedTokens` and not `Strings`.
+        `dict[str, str]`: A flat dictionary mapping named special token attributes to their string values.
+
+        Only includes the standard named special tokens (bos_token, eos_token, etc.), not extra_special_tokens.
+        This provides a clean, flat structure without mixed types.
+
+        Returns:
+            A dictionary with keys like 'bos_token', 'eos_token', etc., and string values.
 
-        Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how
-        special tokens are tokenized.
+        **V5 Change**: This now returns only named tokens. Use `extra_special_tokens` for the additional tokens.
         """
-        all_tokens = []
-        seen = set()
-        for value in self.special_tokens_map_extended.values():
-            if isinstance(value, (list, tuple)):
-                tokens_to_add = [token for token in value if str(token) not in seen]
-            else:
-                tokens_to_add = [value] if str(value) not in seen else []
-            seen.update(map(str, tokens_to_add))
-            all_tokens.extend(tokens_to_add)
-        return all_tokens
+        return {
+            attr: str(self._special_tokens_map[attr])
+            for attr in self.SPECIAL_TOKENS_ATTRIBUTES
+            if self._special_tokens_map.get(attr) is not None
+        }
+
+    # Note: extra_special_tokens and extra_special_tokens_ids are handled by __getattr__ and __setattr__
+    # We don't define them as @property to keep the implementation simpler
 
     @property
     def all_special_tokens(self) -> list[str]:
         """
-        `list[str]`: A list of the unique special tokens (`''`, `''`, ..., etc.).
+        `list[str]`: A list of all unique special tokens (named + extra) as strings.
 
-        Convert tokens of `tokenizers.AddedToken` type to string.
+        Includes both named special tokens (bos_token, eos_token, etc.) and extra special tokens.
+        Converts tokens of `tokenizers.AddedToken` type to string.
         """
-        all_toks = [str(s) for s in self.all_special_tokens_extended]
+        seen = set()
+        all_toks = []
+
+        # Add named special tokens
+        for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
+            value = self._special_tokens_map.get(attr)
+            if value is not None:
+                token_str = str(value)
+                if token_str not in seen:
+                    all_toks.append(token_str)
+                    seen.add(token_str)
+
+        # Add extra special tokens
+        for token in self._extra_special_tokens:
+            token_str = str(token)
+            if token_str not in seen:
+                all_toks.append(token_str)
+                seen.add(token_str)
+
         return all_toks
 
     @property
@@ -1162,16 +1418,17 @@ def all_special_ids(self) -> list[int]:
         """
         `list[int]`: List the ids of the special tokens(`''`, `''`, etc.) mapped to class attributes.
         """
-        all_toks = self.all_special_tokens
-        all_ids = self.convert_tokens_to_ids(all_toks)
-        return all_ids
+        return self.convert_tokens_to_ids(self.all_special_tokens)
 
-    def _set_model_specific_special_tokens(self, special_tokens: list[str]):
+    def _set_model_specific_special_tokens(self, special_tokens: dict[str, Union[str, AddedToken]]):
         """
-        Adds new special tokens to the "SPECIAL_TOKENS_ATTRIBUTES" list which will be part
-        of "self.special_tokens" and saved as a special token in tokenizer's config.
-        This allows us to dynamically add new model-type specific tokens after initializing the tokenizer.
-        For example: if the model tokenizers is multimodal, we can support special image or audio tokens.
+        Adds new model-specific special tokens (e.g., for multimodal models).
+
+        These tokens are added to the named special tokens map and will be saved in tokenizer config.
+        For example: if the model tokenizer is multimodal, we can support special image or audio tokens.
+
+        Args:
+            special_tokens: Dictionary of {token_name: token_value}
         """
         self.SPECIAL_TOKENS_ATTRIBUTES = self.SPECIAL_TOKENS_ATTRIBUTES + list(special_tokens.keys())
         for key, value in special_tokens.items():
@@ -1180,745 +1437,96 @@ def _set_model_specific_special_tokens(self, special_tokens: list[str]):
             else:
                 raise TypeError(f"Special token {key} has to be either str or AddedToken but got: {type(value)}")
 
+    @property
+    def added_tokens_decoder(self) -> dict[int, AddedToken]:
+        raise NotImplementedError()
 
-ENCODE_KWARGS_DOCSTRING = r"""
-            add_special_tokens (`bool`, *optional*, defaults to `True`):
-                Whether or not to add special tokens when encoding the sequences. This will use the underlying
-                `PretrainedTokenizerBase.build_inputs_with_special_tokens` function, which defines which tokens are
-                automatically added to the input ids. This is useful if you want to add `bos` or `eos` tokens
-                automatically.
-            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
-                Activates and controls padding. Accepts the following values:
-
-                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
-                  sequence is provided).
-                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-                  acceptable input length for the model if that argument is not provided.
-                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
-                  lengths).
-            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
-                Activates and controls truncation. Accepts the following values:
-
-                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
-                  to the maximum acceptable input length for the model if that argument is not provided. This will
-                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
-                  sequences (or a batch of pairs) is provided.
-                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
-                  greater than the model maximum admissible input size).
-            max_length (`int`, *optional*):
-                Controls the maximum length to use by one of the truncation/padding parameters.
-
-                If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
-                is required by one of the truncation/padding parameters. If the model has no specific maximum input
-                length (like XLNet) truncation/padding to a maximum length will be deactivated.
-            stride (`int`, *optional*, defaults to 0):
-                If set to a number along with `max_length`, the overflowing tokens returned when
-                `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
-                returned to provide some overlap between truncated and overflowing sequences. The value of this
-                argument defines the number of overlapping tokens.
-            is_split_into_words (`bool`, *optional*, defaults to `False`):
-                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
-                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
-                which it will tokenize. This is useful for NER or token classification.
-            pad_to_multiple_of (`int`, *optional*):
-                If set will pad the sequence to a multiple of the provided value. Requires `padding` to be activated.
-                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
-                `>= 7.5` (Volta).
-            padding_side (`str`, *optional*):
-                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
-                Default value is picked from the class attribute of the same name.
-            return_tensors (`str` or [`~utils.TensorType`], *optional*):
-                If set, will return tensors instead of list of python integers. Acceptable values are:
-
-                - `'pt'`: Return PyTorch `torch.Tensor` objects.
-                - `'np'`: Return Numpy `np.ndarray` objects.
-"""
+    def __repr__(self) -> str:
+        added_tokens_decoder_rep = "\n\t".join([f"{k}: {v.__repr__()}," for k, v in self.added_tokens_decoder.items()])
+        return (
+            f"{self.__class__.__name__}(name_or_path='{self.name_or_path}',"
+            f" vocab_size={self.vocab_size}, model_max_length={self.model_max_length},"
+            f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}',"
+            f" special_tokens={self.special_tokens_map},"
+            " added_tokens_decoder={\n\t" + added_tokens_decoder_rep + "\n}\n)"
+        )
 
-ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
-            return_token_type_ids (`bool`, *optional*):
-                Whether to return token type IDs. If left to the default, will return the token type IDs according to
-                the specific tokenizer's default, defined by the `return_outputs` attribute.
+    def __len__(self) -> int:
+        raise NotImplementedError()
 
-                [What are token type IDs?](../glossary#token-type-ids)
-            return_attention_mask (`bool`, *optional*):
-                Whether to return the attention mask. If left to the default, will return the attention mask according
-                to the specific tokenizer's default, defined by the `return_outputs` attribute.
+    @property
+    def vocab_size(self) -> int:
+        """
+        `int`: Size of the base vocabulary (without the added tokens).
+        """
+        raise NotImplementedError()
 
-                [What are attention masks?](../glossary#attention-mask)
-            return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
-                of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead
-                of returning overflowing tokens.
-            return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
-                Whether or not to return special tokens mask information.
-            return_offsets_mapping (`bool`, *optional*, defaults to `False`):
-                Whether or not to return `(char_start, char_end)` for each token.
+    def get_vocab(self) -> dict[str, int]:
+        """
+        Returns the vocabulary as a dictionary of token to index.
 
-                This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using
-                Python's tokenizer, this method will raise `NotImplementedError`.
-            return_length  (`bool`, *optional*, defaults to `False`):
-                Whether or not to return the lengths of the encoded inputs.
-            verbose (`bool`, *optional*, defaults to `True`):
-                Whether or not to print more information and warnings.
-            **kwargs: passed to the `self.tokenize()` method
+        `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the
+        vocab.
 
-        Return:
-            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+        Returns:
+            `dict[str, int]`: The vocabulary.
+        """
+        raise NotImplementedError()
 
-            - **input_ids** -- List of token ids to be fed to a model.
+    def convert_tokens_to_ids(self, tokens: Union[str, list[str]]) -> Union[int, list[int]]:
+        """
+        Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
+        vocabulary.
 
-              [What are input IDs?](../glossary#input-ids)
+        Args:
+            tokens (`str` or `list[str]`): One or several token(s) to convert to token id(s).
 
-            - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True` or
-              if *"token_type_ids"* is in `self.model_input_names`).
+        Returns:
+            `int` or `list[int]`: The token id or list of token ids.
+        """
+        if isinstance(tokens, str):
+            return self._convert_token_to_id_with_added_voc(tokens)
 
-              [What are token type IDs?](../glossary#token-type-ids)
+        return [self._convert_token_to_id_with_added_voc(token) for token in tokens]
 
-            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
-              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
+    def convert_ids_to_tokens(
+        self, ids: Union[int, list[int]], skip_special_tokens: bool = False
+    ) -> Union[str, list[str]]:
+        """
+        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
+        added tokens.
 
-              [What are attention masks?](../glossary#attention-mask)
+        Args:
+            ids (`int` or `list[int]`):
+                The token id (or token ids) to convert to tokens.
+            skip_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not to remove special tokens in the decoding.
 
-            - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
-              `return_overflowing_tokens=True`).
-            - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
-              `return_overflowing_tokens=True`).
-            - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
-              regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
-            - **length** -- The length of the inputs (when `return_length=True`)
-"""
+        Returns:
+            `str` or `list[str]`: The decoded token(s).
+        """
+        raise NotImplementedError()
 
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: Union[str, os.PathLike],
+        *init_inputs,
+        cache_dir: Optional[Union[str, os.PathLike]] = None,
+        force_download: bool = False,
+        local_files_only: bool = False,
+        token: Optional[Union[str, bool]] = None,
+        revision: str = "main",
+        trust_remote_code=False,
+        **kwargs,
+    ):
+        r"""
+        Instantiate a [`~tokenization_utils_base.PreTrainedTokenizerBase`] (or a derived class) from a predefined
+        tokenizer.
 
-INIT_TOKENIZER_DOCSTRING = r"""
-    Class attributes (overridden by derived classes)
-
-        - **vocab_files_names** (`dict[str, str]`) -- A dictionary with, as keys, the `__init__` keyword name of each
-          vocabulary file required by the model, and as associated values, the filename for saving the associated file
-          (string).
-        - **pretrained_vocab_files_map** (`dict[str, dict[str, str]]`) -- A dictionary of dictionaries, with the
-          high-level keys being the `__init__` keyword name of each vocabulary file required by the model, the
-          low-level being the `short-cut-names` of the pretrained models with, as associated values, the `url` to the
-          associated pretrained vocabulary file.
-        - **model_input_names** (`list[str]`) -- A list of inputs expected in the forward pass of the model.
-        - **padding_side** (`str`) -- The default value for the side on which the model should have padding applied.
-          Should be `'right'` or `'left'`.
-        - **truncation_side** (`str`) -- The default value for the side on which the model should have truncation
-          applied. Should be `'right'` or `'left'`.
-
-    Args:
-        model_max_length (`int`, *optional*):
-            The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is
-            loaded with [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`], this will be set to the
-            value stored for the associated model in `max_model_input_sizes` (see above). If no value is provided, will
-            default to VERY_LARGE_INTEGER (`int(1e30)`).
-        padding_side (`str`, *optional*):
-            The side on which the model should have padding applied. Should be selected between ['right', 'left'].
-            Default value is picked from the class attribute of the same name.
-        truncation_side (`str`, *optional*):
-            The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
-            Default value is picked from the class attribute of the same name.
-        chat_template (`str`, *optional*):
-            A Jinja template string that will be used to format lists of chat messages. See
-            https://huggingface.co/docs/transformers/chat_templating for a full description.
-        model_input_names (`list[string]`, *optional*):
-            The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
-            `"attention_mask"`). Default value is picked from the class attribute of the same name.
-        bos_token (`str` or `tokenizers.AddedToken`, *optional*):
-            A special token representing the beginning of a sentence. Will be associated to `self.bos_token` and
-            `self.bos_token_id`.
-        eos_token (`str` or `tokenizers.AddedToken`, *optional*):
-            A special token representing the end of a sentence. Will be associated to `self.eos_token` and
-            `self.eos_token_id`.
-        unk_token (`str` or `tokenizers.AddedToken`, *optional*):
-            A special token representing an out-of-vocabulary token. Will be associated to `self.unk_token` and
-            `self.unk_token_id`.
-        sep_token (`str` or `tokenizers.AddedToken`, *optional*):
-            A special token separating two different sentences in the same input (used by BERT for instance). Will be
-            associated to `self.sep_token` and `self.sep_token_id`.
-        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
-            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
-            attention mechanisms or loss computation. Will be associated to `self.pad_token` and `self.pad_token_id`.
-        cls_token (`str` or `tokenizers.AddedToken`, *optional*):
-            A special token representing the class of the input (used by BERT for instance). Will be associated to
-            `self.cls_token` and `self.cls_token_id`.
-        mask_token (`str` or `tokenizers.AddedToken`, *optional*):
-            A special token representing a masked token (used by masked-language modeling pretraining objectives, like
-            BERT). Will be associated to `self.mask_token` and `self.mask_token_id`.
-        additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
-            A tuple or a list of additional special tokens. Add them here to ensure they are skipped when decoding with
-            `skip_special_tokens` is set to True. If they are not part of the vocabulary, they will be added at the end
-            of the vocabulary.
-        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should cleanup the spaces that were added when splitting the input text during the
-            tokenization process.
-        split_special_tokens (`bool`, *optional*, defaults to `False`):
-            Whether or not the special tokens should be split during the tokenization process. Passing will affect the
-            internal state of the tokenizer. The default behavior is to not split special tokens. This means that if
-            `` is the `bos_token`, then `tokenizer.tokenize("") = ['`]. Otherwise, if
-            `split_special_tokens=True`, then `tokenizer.tokenize("")` will be give `['<','s', '>']`.
-"""
-
-
-@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
-class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
-    """
-    Base class for [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`].
-
-    Handles shared (mostly boiler plate) methods for those two classes.
-    """
-
-    vocab_files_names: dict[str, str] = {}
-    pretrained_vocab_files_map: dict[str, dict[str, str]] = {}
-    _auto_class: Optional[str] = None
-
-    # first name has to correspond to main model input name
-    # to make sure `tokenizer.pad(...)` works correctly
-    model_input_names: list[str] = ["input_ids", "token_type_ids", "attention_mask"]
-    padding_side: str = "right"
-    truncation_side: str = "right"
-    slow_tokenizer_class = None
-
-    def __init__(self, **kwargs):
-        # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
-        self.init_inputs = ()
-        for key in kwargs:
-            if hasattr(self, key) and callable(getattr(self, key)):
-                raise AttributeError(f"{key} conflicts with the method {key} in {self.__class__.__name__}")
-
-        self.init_kwargs = copy.deepcopy(kwargs)
-        self.name_or_path = kwargs.pop("name_or_path", "")
-        self._processor_class = kwargs.pop("processor_class", None)
-
-        # For backward compatibility we fallback to set model_max_length from max_len if provided
-        model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None))
-        self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER
-
-        # Padding and truncation side are right by default and overridden in subclasses. If specified in the kwargs, it
-        # is changed.
-        self.padding_side = kwargs.pop("padding_side", self.padding_side)
-        if self.padding_side not in ["right", "left"]:
-            raise ValueError(
-                f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}"
-            )
-
-        self.truncation_side = kwargs.pop("truncation_side", self.truncation_side)
-        if self.truncation_side not in ["right", "left"]:
-            raise ValueError(
-                f"Truncation side should be selected between 'right' and 'left', current value: {self.truncation_side}"
-            )
-
-        self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
-
-        # By default, cleaning tokenization spaces for both fast and slow tokenizers
-        self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", False)
-
-        # By default, do not split special tokens for both fast and slow tokenizers
-        self.split_special_tokens = kwargs.pop("split_special_tokens", False)
-
-        self.deprecation_warnings = {}  # Use to store when we have already noticed a deprecation warning (avoid overlogging).
-        self._in_target_context_manager = False
-
-        # Stores a Jinja template that formats chat histories into tokenizable strings
-        self.chat_template = kwargs.pop("chat_template", None)
-        if isinstance(self.chat_template, (list, tuple)):
-            # Chat templates are stored as lists of dicts with fixed key names,
-            # we reconstruct that into a single dict while loading them.
-            self.chat_template = {template["name"]: template["template"] for template in self.chat_template}
-
-        self.response_schema = kwargs.pop("response_schema", None)
-
-        super().__init__(**kwargs)
-
-        self.extra_special_tokens = kwargs.pop("extra_special_tokens", {})
-        self._set_model_specific_special_tokens(special_tokens=self.extra_special_tokens)
-
-    @property
-    def max_len_single_sentence(self) -> int:
-        """
-        `int`: The maximum length of a sentence that can be fed to the model.
-        """
-        return self.model_max_length - self.num_special_tokens_to_add(pair=False)
-
-    @property
-    def max_len_sentences_pair(self) -> int:
-        """
-        `int`: The maximum combined length of a pair of sentences that can be fed to the model.
-        """
-        return self.model_max_length - self.num_special_tokens_to_add(pair=True)
-
-    @max_len_single_sentence.setter
-    def max_len_single_sentence(self, value) -> int:
-        # For backward compatibility, allow to try to setup 'max_len_single_sentence'.
-        if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose:
-            if not self.deprecation_warnings.get("max_len_single_sentence", False):
-                logger.warning(
-                    "Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
-                )
-            self.deprecation_warnings["max_len_single_sentence"] = True
-        else:
-            raise ValueError(
-                "Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
-            )
-
-    @max_len_sentences_pair.setter
-    def max_len_sentences_pair(self, value) -> int:
-        # For backward compatibility, allow to try to setup 'max_len_sentences_pair'.
-        if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose:
-            if not self.deprecation_warnings.get("max_len_sentences_pair", False):
-                logger.warning(
-                    "Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up."
-                )
-            self.deprecation_warnings["max_len_sentences_pair"] = True
-        else:
-            raise ValueError("Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up.")
-
-    def _set_processor_class(self, processor_class: str):
-        """Sets processor class as an attribute."""
-        self._processor_class = processor_class
-
-    @property
-    def added_tokens_decoder(self) -> dict[int, AddedToken]:
-        raise NotImplementedError()
-
-    def __repr__(self) -> str:
-        added_tokens_decoder_rep = "\n\t".join([f"{k}: {v.__repr__()}," for k, v in self.added_tokens_decoder.items()])
-        return (
-            f"{self.__class__.__name__}(name_or_path='{self.name_or_path}',"
-            f" vocab_size={self.vocab_size}, model_max_length={self.model_max_length}, is_fast={self.is_fast},"
-            f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}',"
-            f" special_tokens={self.special_tokens_map}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces},"
-            " added_tokens_decoder={\n\t" + added_tokens_decoder_rep + "\n}\n)"
-        )
-
-    def __len__(self) -> int:
-        raise NotImplementedError()
-
-    def get_vocab(self) -> dict[str, int]:
-        """
-        Returns the vocabulary as a dictionary of token to index.
-
-        `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the
-        vocab.
-
-        Returns:
-            `dict[str, int]`: The vocabulary.
-        """
-        raise NotImplementedError()
-
-    # Case: tokenize=False → returns rendered string
-    @overload
-    def apply_chat_template(
-        self,
-        conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
-        tools: Optional[list[Union[dict, Callable]]] = None,
-        documents: Optional[list[dict[str, str]]] = None,
-        chat_template: Optional[str] = None,
-        add_generation_prompt: bool = False,
-        continue_final_message: bool = False,
-        tokenize: Literal[False] = False,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: bool = False,
-        max_length: Optional[int] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_dict: bool = False,
-        return_assistant_tokens_mask: bool = False,
-        tokenizer_kwargs: Optional[dict[str, Any]] = None,
-        **kwargs,
-    ) -> str: ...
-
-    # Case: tokenize=True, return_dict=False, return_tensors=None → returns ids
-    @overload
-    def apply_chat_template(
-        self,
-        conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
-        tools: Optional[list[Union[dict, Callable]]] = None,
-        documents: Optional[list[dict[str, str]]] = None,
-        chat_template: Optional[str] = None,
-        add_generation_prompt: bool = False,
-        continue_final_message: bool = False,
-        tokenize: Literal[True] = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: bool = False,
-        max_length: Optional[int] = None,
-        return_tensors: Literal[None] = None,
-        return_dict: Literal[False] = False,
-        return_assistant_tokens_mask: bool = False,
-        tokenizer_kwargs: Optional[dict[str, Any]] = None,
-        **kwargs,
-    ) -> Union[list[int], list[list[int]]]: ...
-
-    # Case: tokenize=True, return_dict=True → returns BatchEncoding
-    @overload
-    def apply_chat_template(
-        self,
-        conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
-        tools: Optional[list[Union[dict, Callable]]] = None,
-        documents: Optional[list[dict[str, str]]] = None,
-        chat_template: Optional[str] = None,
-        add_generation_prompt: bool = False,
-        continue_final_message: bool = False,
-        tokenize: Literal[True] = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: bool = False,
-        max_length: Optional[int] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_dict: Literal[True] = True,
-        return_assistant_tokens_mask: bool = False,
-        tokenizer_kwargs: Optional[dict[str, Any]] = None,
-        **kwargs,
-    ) -> BatchEncoding: ...
-
-    def apply_chat_template(
-        self,
-        conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
-        tools: Optional[list[Union[dict, Callable]]] = None,
-        documents: Optional[list[dict[str, str]]] = None,
-        chat_template: Optional[str] = None,
-        add_generation_prompt: bool = False,
-        continue_final_message: bool = False,
-        tokenize: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: bool = False,
-        max_length: Optional[int] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_dict: bool = True,
-        return_assistant_tokens_mask: bool = False,
-        tokenizer_kwargs: Optional[dict[str, Any]] = None,
-        **kwargs,
-    ) -> Union[str, list[int], list[str], list[list[int]], BatchEncoding]:
-        """
-        Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token
-        ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to
-        determine the format and control tokens to use when converting.
-
-        Args:
-            conversation (Union[list[dict[str, str]], list[list[dict[str, str]]]]): A list of dicts
-                with "role" and "content" keys, representing the chat history so far.
-            tools (`list[Union[Dict, Callable]]`, *optional*):
-                A list of tools (callable functions) that will be accessible to the model. If the template does not
-                support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema,
-                giving the name, description and argument types for the tool. See our
-                [tool use guide](https://huggingface.co/docs/transformers/en/chat_extras#passing-tools)
-                for more information.
-            documents (`list[dict[str, str]]`, *optional*):
-                A list of dicts representing documents that will be accessible to the model if it is performing RAG
-                (retrieval-augmented generation). If the template does not support RAG, this argument will have no
-                effect. We recommend that each document should be a dict containing "title" and "text" keys.
-            chat_template (`str`, *optional*):
-                A Jinja template to use for this conversion. It is usually not necessary to pass anything to this
-                argument, as the model's template will be used by default.
-            add_generation_prompt (bool, *optional*):
-                If this is set, a prompt with the token(s) that indicate
-                the start of an assistant message will be appended to the formatted output. This is useful when you want to generate a response from the model.
-                Note that this argument will be passed to the chat template, and so it must be supported in the
-                template for this argument to have any effect.
-            continue_final_message (bool, *optional*):
-                If this is set, the chat will be formatted so that the final
-                message in the chat is open-ended, without any EOS tokens. The model will continue this message
-                rather than starting a new one. This allows you to "prefill" part of
-                the model's response for it. Cannot be used at the same time as `add_generation_prompt`.
-            tokenize (`bool`, defaults to `True`):
-                Whether to tokenize the output. If `False`, the output will be a string.
-            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
-                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
-                 index) among:
-
-                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
-                  sequence if provided).
-                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-                  acceptable input length for the model if that argument is not provided.
-                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
-                  lengths).
-            truncation (`bool`, defaults to `False`):
-                Whether to truncate sequences at the maximum length. Has no effect if tokenize is `False`.
-            max_length (`int`, *optional*):
-                Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is `False`. If
-                not specified, the tokenizer's `max_length` attribute will be used as a default.
-            return_tensors (`str` or [`~utils.TensorType`], *optional*):
-                If set, will return tensors of a particular framework. Has no effect if tokenize is `False`. Acceptable
-                values are:
-                - `'pt'`: Return PyTorch `torch.Tensor` objects.
-                - `'np'`: Return NumPy `np.ndarray` objects.
-            return_dict (`bool`, defaults to `False`):
-                Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`.
-            tokenizer_kwargs (`dict[str: Any]`, *optional*): Additional kwargs to pass to the tokenizer.
-            return_assistant_tokens_mask (`bool`, defaults to `False`):
-                Whether to return a mask of the assistant generated tokens. For tokens generated by the assistant,
-                the mask will contain 1. For user and system tokens, the mask will contain 0.
-                This functionality is only available for chat templates that support it via the `{% generation %}` keyword.
-            **kwargs: Additional kwargs to pass to the template renderer. Will be accessible by the chat template.
-
-        Returns:
-            `Union[list[int], Dict]`: A list of token ids representing the tokenized chat so far, including control tokens. This
-            output is ready to pass to the model, either directly or via methods like `generate()`. If `return_dict` is
-            set, will return a dict of tokenizer outputs instead.
-        """
-
-        if not tokenize:
-            return_dict = False  # dicts are only returned by the tokenizer anyway
-
-        if return_assistant_tokens_mask and not (return_dict and tokenize):
-            raise ValueError("`return_assistant_tokens_mask=True` requires `return_dict=True` and `tokenize=True`")
-
-        if tokenizer_kwargs is None:
-            tokenizer_kwargs = {}
-
-        chat_template = self.get_chat_template(chat_template, tools)
-
-        if isinstance(conversation, (list, tuple)) and (
-            isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "messages")
-        ):
-            conversations = conversation
-            is_batched = True
-        else:
-            conversations = [conversation]
-            is_batched = False
-
-        if continue_final_message:
-            if add_generation_prompt:
-                raise ValueError(
-                    "continue_final_message and add_generation_prompt are not compatible. Use continue_final_message when you want the model to continue the final message, and add_generation_prompt when you want to add a header that will prompt it to start a new assistant message instead."
-                )
-            if return_assistant_tokens_mask:
-                raise ValueError("continue_final_message is not compatible with return_assistant_tokens_mask.")
-
-        template_kwargs = {**self.special_tokens_map, **kwargs}  # kwargs overwrite special tokens if both are present
-        rendered_chat, generation_indices = render_jinja_template(
-            conversations=conversations,
-            tools=tools,
-            documents=documents,
-            chat_template=chat_template,
-            return_assistant_tokens_mask=return_assistant_tokens_mask,
-            continue_final_message=continue_final_message,
-            add_generation_prompt=add_generation_prompt,
-            **template_kwargs,
-        )
-
-        if not is_batched:
-            rendered_chat = rendered_chat[0]
-
-        if tokenize:
-            out = self(
-                rendered_chat,
-                padding=padding,
-                truncation=truncation,
-                max_length=max_length,
-                add_special_tokens=False,
-                return_tensors=return_tensors,
-                **tokenizer_kwargs,
-            )
-            if return_dict:
-                if return_assistant_tokens_mask:
-                    assistant_masks = []
-                    if is_batched or return_tensors:
-                        input_ids = out["input_ids"]
-                    else:
-                        input_ids = [out["input_ids"]]
-                    for i in range(len(input_ids)):
-                        current_mask = [0] * len(input_ids[i])
-                        for assistant_start_char, assistant_end_char in generation_indices[i]:
-                            start_token = out.char_to_token(i, assistant_start_char)
-                            end_token = out.char_to_token(i, assistant_end_char - 1)
-                            if start_token is None:
-                                # start_token is out of bounds maybe due to truncation.
-                                break
-                            for token_id in range(start_token, end_token + 1 if end_token else len(input_ids[i])):
-                                current_mask[token_id] = 1
-                        assistant_masks.append(current_mask)
-
-                    if not is_batched and not return_tensors:
-                        assistant_masks = assistant_masks[0]
-
-                    out["assistant_masks"] = assistant_masks
-
-                    if return_tensors:
-                        out.convert_to_tensors(tensor_type=return_tensors)
-
-                return out
-            else:
-                return out["input_ids"]
-        else:
-            return rendered_chat
-
-    def encode_message_with_chat_template(
-        self,
-        message: dict[str, str],
-        conversation_history: Optional[list[dict[str, str]]] = None,
-        **kwargs,
-    ) -> list[int]:
-        """
-        Tokenize a single message. This method is a convenience wrapper around `apply_chat_template` that allows you
-        to tokenize messages one by one. This is useful for things like token-by-token streaming.
-        This method is not guaranteed to be perfect. For some models, it may be impossible to robustly tokenize
-        single messages. For example, if the chat template adds tokens after each message, but also has a prefix that
-        is added to the entire chat, it will be impossible to distinguish a chat-start-token from a message-start-token.
-        In these cases, this method will do its best to find the correct tokenization, but it may not be perfect.
-        **Note:** This method does not support `add_generation_prompt`. If you want to add a generation prompt,
-        you should do it separately after tokenizing the conversation.
-        Args:
-            message (`dict`):
-                A dictionary with "role" and "content" keys, representing the message to tokenize.
-            conversation_history (`list[dict]`, *optional*):
-                A list of dicts with "role" and "content" keys, representing the chat history so far. If you are
-                tokenizing messages one by one, you should pass the previous messages in the conversation here.
-            **kwargs:
-                Additional kwargs to pass to the `apply_chat_template` method.
-        Returns:
-            `list[int]`: A list of token ids representing the tokenized message.
-        """
-        if "add_generation_prompt" in kwargs:
-            raise ValueError(
-                "`encode_message_with_chat_template` does not support `add_generation_prompt`. Please add the generation prompt "
-                "separately."
-            )
-
-        if conversation_history is None or len(conversation_history) == 0:
-            return self.apply_chat_template(
-                [message], add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs
-            )
-
-        conversation = conversation_history + [message]
-        tokens = self.apply_chat_template(
-            conversation, add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs
-        )
-
-        prefix_tokens = self.apply_chat_template(
-            conversation_history, add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs
-        )
-        # It's possible that the prefix tokens are not a prefix of the full list of tokens.
-        # For example, if the prefix is `User: Hi` and the full conversation is `User: HiAssistant: Hello`.
-        # In this case, we can't simply find the prefix, so we have to do something a bit more subtle.
-        # We look for the first place where the tokens differ, and that's our split point.
-        # This is not perfect, but it's the best we can do without a token-level API.
-        # To make this more robust, we could do a diff and find the longest common subsequence, but this is
-        # a good first approximation.
-        # This is particularly important for models like Llama3 that have changed their chat template to include
-        # EOS tokens after user messages.
-        min_len = min(len(prefix_tokens), len(tokens))
-        for i in range(min_len):
-            if prefix_tokens[i] != tokens[i]:
-                return tokens[i:]
-        return tokens[min_len:]
-
-    def get_chat_template(self, chat_template: Optional[str] = None, tools: Optional[list[dict]] = None) -> str:
-        """
-        Retrieve the chat template string used for tokenizing chat messages. This template is used
-        internally by the `apply_chat_template` method and can also be used externally to retrieve the model's chat
-        template for better generation tracking.
-
-        Args:
-            chat_template (`str`, *optional*):
-                A Jinja template or the name of a template to use for this conversion.
-                It is usually not necessary to pass anything to this argument,
-                as the model's template will be used by default.
-            tools (`list[Dict]`, *optional*):
-                A list of tools (callable functions) that will be accessible to the model. If the template does not
-                support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema,
-                giving the name, description and argument types for the tool. See our
-                [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
-                for more information.
-
-        Returns:
-            `str`: The chat template string.
-        """
-        # First, handle the cases when the model has a dict of multiple templates
-        if isinstance(self.chat_template, dict):
-            template_dict = self.chat_template
-            if chat_template is not None and chat_template in template_dict:
-                # The user can pass the name of a template to the chat template argument instead of an entire template
-                chat_template = template_dict[chat_template]
-            elif chat_template is None:
-                if tools is not None and "tool_use" in template_dict:
-                    chat_template = template_dict["tool_use"]
-                elif "default" in template_dict:
-                    chat_template = template_dict["default"]
-                else:
-                    raise ValueError(
-                        "This model has multiple chat templates with no default specified! Please either pass a chat "
-                        "template or the name of the template you wish to use to the `chat_template` argument. Available "
-                        f"template names are {sorted(template_dict.keys())}."
-                    )
-
-        elif chat_template is None:
-            # These are the cases when the model has a single template
-            # priority: `chat_template` argument > `tokenizer.chat_template`
-            if self.chat_template is not None:
-                chat_template = self.chat_template
-            else:
-                raise ValueError(
-                    "Cannot use chat template functions because tokenizer.chat_template is not set and no template "
-                    "argument was passed! For information about writing templates and setting the "
-                    "tokenizer.chat_template attribute, please see the documentation at "
-                    "https://huggingface.co/docs/transformers/main/en/chat_templating"
-                )
-
-        return chat_template
-
-    def parse_response(
-        self,
-        response: str | list[str | int | list[int]] | np.ndarray | torch.Tensor,
-        schema: list | dict | None = None,
-    ):
-        """
-        Converts an output string created by generating text from a model into a parsed message dictionary.
-        This method is intended for use with chat models, and will read the tokenizer's `response_schema` attribute to
-        control parsing, although this can be overridden by passing a `response_schema` argument directly.
-
-        For more information, see the
-        [response parsing](https://huggingface.co/docs/transformers/main/en/chat_response_parsing) documentation.
-
-        Args:
-            response (`str`):
-                The output string generated by the model. This can be either a decoded string or list of strings,
-                or token IDs as a list/array.
-            schema (`Union[list, dict]`, *optional*):
-                A response schema that indicates the expected output format and how parsing should be performed.
-                If not provided, the tokenizer's `response_schema` attribute will be used.
-        """
-        batched = (
-            (isinstance(response, list) and not isinstance(response[0], int))
-            or getattr(response, "ndim", 0) > 1  # For torch/numpy tensors
-        )
-
-        if schema is None:
-            if getattr(self, "response_schema", None) is None:
-                raise AttributeError("This tokenizer does not have a `response_schema` for parsing chat responses!")
-            schema = self.response_schema
-        if batched:
-            if not (isinstance(response, list) and isinstance(response[0], str)):
-                response = self.batch_decode(response)
-            return [recursive_parse(single_response, schema) for single_response in response]
-        else:
-            if not isinstance(response, str):
-                response = self.decode(response)
-            return recursive_parse(response, schema)
-
-    @classmethod
-    def from_pretrained(
-        cls,
-        pretrained_model_name_or_path: Union[str, os.PathLike],
-        *init_inputs,
-        cache_dir: Optional[Union[str, os.PathLike]] = None,
-        force_download: bool = False,
-        local_files_only: bool = False,
-        token: Optional[Union[str, bool]] = None,
-        revision: str = "main",
-        trust_remote_code=False,
-        **kwargs,
-    ):
-        r"""
-        Instantiate a [`~tokenization_utils_base.PreTrainedTokenizerBase`] (or a derived class) from a predefined
-        tokenizer.
-
-        Args:
-            pretrained_model_name_or_path (`str` or `os.PathLike`):
-                Can be either:
+        Args:
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
+                Can be either:
 
                 - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                 - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
@@ -1957,7 +1565,7 @@ def from_pretrained(
             kwargs (additional keyword arguments, *optional*):
                 Will be passed to the Tokenizer `__init__` method. Can be used to set special tokens like `bos_token`,
                 `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
-                `additional_special_tokens`. See parameters in the `__init__` for more details.
+                `extra_special_tokens`. See parameters in the `__init__` for more details.
 
         
 
@@ -1994,7 +1602,7 @@ def from_pretrained(
         commit_hash = kwargs.pop("_commit_hash", None)
         gguf_file = kwargs.get("gguf_file")
 
-        user_agent = {"file_type": "tokenizer", "from_auto_class": from_auto_class, "is_fast": "Fast" in cls.__name__}
+        user_agent = {"file_type": "tokenizer", "from_auto_class": from_auto_class}
         if from_pipeline is not None:
             user_agent["using_pipeline"] = from_pipeline
 
@@ -2008,19 +1616,39 @@ def from_pretrained(
 
         is_local = os.path.isdir(pretrained_model_name_or_path)
         single_file_id = None
+        if os.path.isfile(pretrained_model_name_or_path):
+            # For legacy support: allow single-file loading if:
+            # 1. Only one vocab file is required, OR
+            # 2. It's a fast tokenizer with tokenizer_file (which is optional), OR
+            # 3. It's a GGUF file
+            vocab_files_count = len(cls.vocab_files_names)
+            has_optional_tokenizer_file = vocab_files_count > 1 and "tokenizer_file" in cls.vocab_files_names
+
+            if vocab_files_count > 1 and not gguf_file and not has_optional_tokenizer_file:
+                raise ValueError(
+                    f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not "
+                    "supported for this tokenizer. Use a model identifier or the path to a directory instead."
+                )
+            # Use first vocab file that's not tokenizer_file
+            file_id = list(cls.vocab_files_names.keys())[0]
+            if file_id == "tokenizer_file" and vocab_files_count > 1:
+                file_id = [k for k in cls.vocab_files_names.keys() if k != "tokenizer_file"][0]
 
-        if gguf_file:
-            vocab_files["vocab_file"] = gguf_file
+            vocab_files[file_id] = pretrained_model_name_or_path
+            single_file_id = file_id
         else:
-            # At this point pretrained_model_name_or_path is either a directory or a model identifier name
-            additional_files_names = {
-                "added_tokens_file": ADDED_TOKENS_FILE,  # kept only for legacy
-                "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,  # kept only for legacy
-                "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
-                # tokenizer_file used to initialize a slow from a fast. Properly copy the `addedTokens` instead of adding in random orders
-                "tokenizer_file": FULL_TOKENIZER_FILE,
-                "chat_template_file": CHAT_TEMPLATE_FILE,
-            }
+            if gguf_file:
+                vocab_files["vocab_file"] = gguf_file
+            else:
+                # At this point pretrained_model_name_or_path is either a directory or a model identifier name
+                additional_files_names = {
+                    "added_tokens_file": ADDED_TOKENS_FILE,  # kept only for legacy
+                    "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE,  # kept only for legacy
+                    "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
+                    # tokenizer_file used to initialize a slow from a fast. Properly copy the `addedTokens` instead of adding in random orders
+                    "tokenizer_file": FULL_TOKENIZER_FILE,
+                    "chat_template_file": CHAT_TEMPLATE_FILE,
+                }
 
             vocab_files = {**cls.vocab_files_names, **additional_files_names}
             if "tokenizer_file" in vocab_files:
@@ -2203,7 +1831,7 @@ def _from_pretrained(
             config_tokenizer_class = init_kwargs.get("tokenizer_class")
             init_kwargs.pop("tokenizer_class", None)
             if not has_tokenizer_file:
-                init_kwargs.pop("tokenizer_file", None)
+                init_kwargs.get("tokenizer_file", None)
             saved_init_inputs = init_kwargs.pop("init_inputs", ())
             if not init_inputs:
                 init_inputs = saved_init_inputs
@@ -2223,7 +1851,7 @@ def _from_pretrained(
             if template_file is None:
                 continue  # I think this should never happen, but just in case
             template_name = extra_chat_template.removeprefix("chat_template_")
-            with open(template_file, encoding="utf8") as chat_template_handle:
+            with open(template_file) as chat_template_handle:
                 chat_templates[template_name] = chat_template_handle.read()
         if len(chat_templates) == 1 and "default" in chat_templates:
             init_kwargs["chat_template"] = chat_templates["default"]
@@ -2273,11 +1901,7 @@ def _from_pretrained(
                             break
 
                 if model_type is not None:
-                    config_tokenizer_class, config_tokenizer_class_fast = TOKENIZER_MAPPING_NAMES.get(
-                        model_type, (None, None)
-                    )
-                    if config_tokenizer_class is None:
-                        config_tokenizer_class = config_tokenizer_class_fast
+                    config_tokenizer_class = TOKENIZER_MAPPING_NAMES.get(model_type)
 
         if config_tokenizer_class is not None:
             if cls.__name__.replace("Fast", "") != config_tokenizer_class.replace("Fast", ""):
@@ -2288,20 +1912,65 @@ def _from_pretrained(
                     f" from is '{cls.__name__}'."
                 )
 
+        # Preserve extra_special_tokens from tokenizer_config.json before updating with kwargs
+        # extra_special_tokens should be a list (user-defined extra tokens)
+        extra_special_tokens_from_config = init_kwargs.get("extra_special_tokens")
+        if isinstance(extra_special_tokens_from_config, (list, tuple)):
+            extra_special_tokens_from_config = list(extra_special_tokens_from_config)
+        else:
+            extra_special_tokens_from_config = None
+
         # Update with newly provided kwargs
         init_kwargs.update(kwargs)
 
+        # V5: Backward compatibility - convert old "additional_special_tokens" to "extra_special_tokens"
+        if "additional_special_tokens" in init_kwargs and "extra_special_tokens" not in init_kwargs:
+            init_kwargs["extra_special_tokens"] = init_kwargs.pop("additional_special_tokens")
+        # Restore extra_special_tokens from config if kwargs overwrote it or it's missing
+        elif extra_special_tokens_from_config is not None:
+            if "extra_special_tokens" not in init_kwargs or not isinstance(
+                init_kwargs.get("extra_special_tokens"), (list, tuple)
+            ):
+                init_kwargs["extra_special_tokens"] = extra_special_tokens_from_config
+
+        # V5: Get model-specific special tokens from config (saved as individual keys in special_tokens_map)
+        # These need to be grouped as extra_special_tokens dict so __init__ can save them to attributes
+        if "extra_special_tokens" not in init_kwargs or not isinstance(init_kwargs.get("extra_special_tokens"), dict):
+            default_attrs = set(cls.SPECIAL_TOKENS_ATTRIBUTES)
+            model_specific_tokens = {
+                key: init_kwargs.pop(key)
+                for key in list(init_kwargs.keys())
+                if key not in default_attrs
+                and key.endswith("_token")
+                and isinstance(init_kwargs[key], (str, AddedToken))
+            }
+            if model_specific_tokens:
+                # If extra_special_tokens is already a list, we need to preserve it
+                if "extra_special_tokens" in init_kwargs and isinstance(
+                    init_kwargs["extra_special_tokens"], (list, tuple)
+                ):
+                    # Keep the list as is, but also add model-specific tokens as a separate dict
+                    # Convert to model_specific_special_tokens so __init__ handles it
+                    init_kwargs["model_specific_special_tokens"] = model_specific_tokens
+                else:
+                    init_kwargs["extra_special_tokens"] = model_specific_tokens
+        elif isinstance(init_kwargs.get("extra_special_tokens"), dict):
+            # If extra_special_tokens is already a dict, convert it to model_specific_special_tokens
+            # so __init__ handles it properly
+            init_kwargs["model_specific_special_tokens"] = init_kwargs.pop("extra_special_tokens")
+
         # Merge resolved_vocab_files arguments in init_kwargs.
         added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
         special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
         for args_name, file_path in resolved_vocab_files.items():
-            if args_name not in init_kwargs:
+            if args_name not in init_kwargs or init_kwargs[args_name] is None:
                 init_kwargs[args_name] = file_path
-        tokenizer_file = resolved_vocab_files.pop("tokenizer_file", None)
+        tokenizer_file = resolved_vocab_files.get("tokenizer_file", None)
 
         if slow_tokenizer is not None:
             init_kwargs["__slow_tokenizer"] = slow_tokenizer
         init_kwargs["name_or_path"] = pretrained_model_name_or_path
+        init_kwargs["is_local"] = _is_local
 
         #### Handle tokenizer serialization of added and special tokens
         added_tokens_decoder: dict[int, AddedToken] = {}
@@ -2323,35 +1992,68 @@ def _from_pretrained(
             if special_tokens_map_file is not None:
                 with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
                     special_tokens_map = json.load(special_tokens_map_handle)
+                    # Preserve extra_special_tokens from tokenizer_config.json before processing special_tokens_map
+                    extra_special_tokens_before_map = init_kwargs.get("extra_special_tokens")
+                    if isinstance(extra_special_tokens_before_map, (list, tuple)):
+                        extra_special_tokens_before_map = list(extra_special_tokens_before_map)
+                    else:
+                        extra_special_tokens_before_map = None
+
                     for key, value in special_tokens_map.items():
                         if key in kwargs and kwargs[key]:
                             # This value has already been redefined by the kwargs
                             # We keep this new value and ignore the one stored in the special_tokens_map_file
                             continue
+                        # V5: Convert dict-format tokens to AddedToken
                         if isinstance(value, dict):
                             value["special"] = True
                             value = AddedToken(**value)
-                        elif key == "additional_special_tokens" and isinstance(value, list):
-                            additional_special_tokens = init_kwargs.pop("additional_special_tokens", []) or []
-                            for token in value:
-                                if isinstance(token, dict):
-                                    token["special"] = True
-                                    token = AddedToken(**token)
-                                if token not in additional_special_tokens:
-                                    additional_special_tokens.append(token)
-                            value = additional_special_tokens
+                        elif key == "extra_special_tokens":
+                            # Handle extra_special_tokens from special_tokens_map.json
+                            if isinstance(value, dict):
+                                # Dict format for model-specific tokens - keep as is
+                                init_kwargs[key] = value
+                                continue
+                            elif isinstance(value, list):
+                                # List format - merge with existing if present
+                                existing = init_kwargs.pop("extra_special_tokens", []) or []
+                                if not isinstance(existing, (list, tuple)):
+                                    existing = []
+                                for token in value:
+                                    if isinstance(token, dict):
+                                        token = AddedToken(**token, special=True)
+                                    if token not in existing:
+                                        existing.append(token)
+                                init_kwargs[key] = existing
+                                continue
                         init_kwargs[key] = value
 
+                    # Restore extra_special_tokens from tokenizer_config.json if not in special_tokens_map.json
+                    if (
+                        "extra_special_tokens" not in special_tokens_map
+                        and extra_special_tokens_before_map is not None
+                    ):
+                        if "extra_special_tokens" not in init_kwargs or not isinstance(
+                            init_kwargs.get("extra_special_tokens"), (list, tuple)
+                        ):
+                            init_kwargs["extra_special_tokens"] = extra_special_tokens_before_map
+
+                    # Convert extra_special_tokens dict to model_specific_special_tokens if it's a dict
+                    if isinstance(init_kwargs.get("extra_special_tokens"), dict):
+                        init_kwargs["model_specific_special_tokens"] = init_kwargs.pop("extra_special_tokens")
+
             # slow -> slow|fast, legacy: convert the `"added_tokens.json"` file to `added_tokens_decoder`.
             # this is for legacy purpose. We don't add the tokens after init for efficiency.
             if added_tokens_file is not None:
                 special_tokens = []
-                for key in cls.SPECIAL_TOKENS_ATTRIBUTES & init_kwargs.keys():
-                    if init_kwargs[key] is not None:
-                        if key == "additional_special_tokens":
-                            special_tokens += [str(token) for token in init_kwargs[key]]
-                        else:
-                            special_tokens.append(str(init_kwargs[key]))
+                # V5: Check both named and extra special tokens
+                for key in cls.SPECIAL_TOKENS_ATTRIBUTES:
+                    if key in init_kwargs and init_kwargs[key] is not None:
+                        special_tokens.append(str(init_kwargs[key]))
+
+                # Handle extra_special_tokens
+                if "extra_special_tokens" in init_kwargs and init_kwargs["extra_special_tokens"] is not None:
+                    special_tokens += [str(token) for token in init_kwargs["extra_special_tokens"]]
 
                 with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                     added_tok_encoder = json.load(added_tokens_handle)
@@ -2380,166 +2082,164 @@ def _from_pretrained(
         # convert {'__type': 'AddedToken', 'content': '', 'lstrip': False, 'normalized': True, ...} to AddedTokens
         init_kwargs["added_tokens_decoder"] = added_tokens_decoder
         init_kwargs = cls.convert_added_tokens(init_kwargs, save=False)
-        for key in cls.SPECIAL_TOKENS_ATTRIBUTES & init_kwargs.keys():
-            if added_tokens_map != {} and init_kwargs[key] is not None:
-                if key != "additional_special_tokens":
-                    init_kwargs[key] = added_tokens_map.get(str(init_kwargs[key]), init_kwargs[key])
+        # V5: Map special tokens from added_tokens_map (named tokens only)
+        for key in cls.SPECIAL_TOKENS_ATTRIBUTES:
+            if key in init_kwargs and added_tokens_map != {} and init_kwargs[key] is not None:
+                init_kwargs[key] = added_tokens_map.get(str(init_kwargs[key]), init_kwargs[key])
+
+        # Track which files were loaded (if not already set by AutoTokenizer)
+        if "files_loaded" not in init_kwargs:
+            files_loaded = []
+            # Check which files this tokenizer class actually uses based on vocab_files_names
+            tokenizer_needs_files = set(cls.vocab_files_names.keys()) if hasattr(cls, "vocab_files_names") else set()
+
+            # If tokenizer_file is in the class's vocab_files_names and exists, prioritize it (TokenizersBackend)
+            if "tokenizer_file" in tokenizer_needs_files and resolved_vocab_files.get("tokenizer_file"):
+                files_loaded.append(os.path.basename(resolved_vocab_files["tokenizer_file"]))
+            else:
+                # Otherwise, add the actual vocab files that were used by this tokenizer class
+                for file_key, file_path in resolved_vocab_files.items():
+                    if (
+                        file_path
+                        and file_key not in ["tokenizer_config_file", "special_tokens_map_file", "added_tokens_file"]
+                        and file_key in tokenizer_needs_files
+                    ):
+                        # Extract just the filename from the path
+                        files_loaded.append(os.path.basename(file_path))
+            init_kwargs["files_loaded"] = files_loaded
 
         # Instantiate the tokenizer.
         try:
             tokenizer = cls(*init_inputs, **init_kwargs)
         except import_protobuf_decode_error():
-            logger.info(
+            raise RuntimeError(
                 "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead."
                 "(Google protobuf error: Tried to load SPM model with non-SPM vocab file).",
             )
-            return False
         except RuntimeError as e:
             if "sentencepiece_processor.cc" in str(e):
-                logger.info(
+                raise RuntimeError(
                     "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead."
                     "(SentencePiece RuntimeError: Tried to load SPM model with non-SPM vocab file).",
-                )
-            return False
+                ) from e
+            else:
+                raise e
         except OSError:
             raise OSError(
                 "Unable to load vocabulary from file. "
                 "Please check that the provided vocabulary is accessible and not corrupted."
             )
 
-        if added_tokens_decoder != {} and max(list(added_tokens_decoder.keys())[-1], 0) > tokenizer.vocab_size:
-            logger.info(
-                "Special tokens have been added in the vocabulary, make sure the associated word embeddings are"
-                " fine-tuned or trained."
-            )
-        try:
-            vocab_size = tokenizer.vocab_size
-        except NotImplementedError:
-            vocab_size = 0
+        # If tokenizer_file exists and tokenizer has a TokenizersBackend, replace the blank tokenizer with tokenizer.json
+        if tokenizer_file is not None and hasattr(tokenizer, "_tokenizer"):
+            from tokenizers import Tokenizer as TokenizerFast
 
-        # Optionally patches mistral tokenizers with wrong regex
-        if (
-            vocab_size > 100000
-            and hasattr(tokenizer, "_tokenizer")
-            and getattr(tokenizer._tokenizer, "pre_tokenizer", None) is not None
-        ):
-            tokenizer = cls._patch_mistral_regex(
-                tokenizer,
+            tokenizer._tokenizer = TokenizerFast.from_file(tokenizer_file)
+            # Re-run post-initialization if the tokenizer has it
+            if hasattr(tokenizer, "_post_init"):
+                tokenizer._post_init()
+            # If only SPM exists, try to get vocab and merges and init to load a tokenizers-backend
+        else:
+            spm_filename = find_sentencepiece_model_file(
                 pretrained_model_name_or_path,
-                token=token,
-                cache_dir=cache_dir,
-                local_files_only=local_files_only,
-                _commit_hash=_commit_hash,
-                _is_local=_is_local,
-                init_kwargs=init_kwargs,
-                fix_mistral_regex=kwargs.get("fix_mistral_regex"),
+                revision=kwargs.get("revision"),
+                token=kwargs.get("token"),
+                cache_dir=kwargs.get("cache_dir"),
+                local_files_only=kwargs.get("local_files_only", False),
+                subfolder=kwargs.get("subfolder", ""),
             )
-
-        return tokenizer
-
-    @classmethod
-    def _patch_mistral_regex(
-        cls,
-        tokenizer,
-        pretrained_model_name_or_path,
-        token=None,
-        cache_dir=None,
-        local_files_only=False,
-        _commit_hash=None,
-        _is_local=False,
-        init_kwargs=None,
-        fix_mistral_regex=None,
-    ):
-        """
-        Patches mistral related tokenizers with incorrect regex if detected
-            1) Local file with an associated config saved next to it
-                >> Model type one of the mistral models (on older versions)
-            2) Remote models on the hub from official mistral models
-                >> Tags including `base_model:.*mistralai`
-        """
-        from huggingface_hub import model_info
-
-        def is_base_mistral(model_id: str) -> bool:
-            model = model_info(model_id)
-            if model.tags is not None:
-                if re.search("base_model:.*mistralai", "".join(model.tags)):
-                    return True
-            return False
-
-        if _is_local or is_base_mistral(pretrained_model_name_or_path):
-            _config_file = cached_file(
+            if spm_filename is not None:
+                try:
+                    resolved_spm = cached_file(
+                        pretrained_model_name_or_path,
+                        spm_filename,
+                        cache_dir=kwargs.get("cache_dir"),
+                        force_download=kwargs.get("force_download", False),
+                        proxies=kwargs.get("proxies"),
+                        token=kwargs.get("token"),
+                        revision=kwargs.get("revision"),
+                        local_files_only=kwargs.get("local_files_only", False),
+                        subfolder=kwargs.get("subfolder", ""),
+                    )
+                except Exception:
+                    resolved_spm = None
+                if resolved_spm is not None:
+                    try:
+                        # Mirror AutoTokenizer fallback: extract vocab/merges from SentencePiece
+                        import inspect as _inspect
+
+                        from .tokenization_utils_sentencepiece import SentencePieceExtractor
+
+                        class_sig = _inspect.signature(getattr(cls, "__init__", cls))
+                        vocab_ids, vocab_scores, merges = SentencePieceExtractor(resolved_spm).extract()
+                        files_loaded = [spm_filename]
+                        init_kwargs["backend"] = "tokenizers"
+                        init_kwargs["files_loaded"] = files_loaded
+                        # If tokenizer needs merges too (BPE), pass both; unigram models only need vocab
+                        if "merges" in class_sig.parameters:
+                            return cls.from_pretrained(
+                                pretrained_model_name_or_path,
+                                *init_inputs,
+                                vocab=vocab_scores,
+                                merges=merges,
+                                **init_kwargs,
+                            )
+                        elif "vocab" in class_sig.parameters:
+                            return cls.from_pretrained(
+                                pretrained_model_name_or_path,
+                                *init_inputs,
+                                vocab=vocab_scores,
+                                **init_kwargs,
+                            )
+                    except Exception as e:
+                        logger.warning(
+                            f"Could not extract vocab/merges from the SentencePiece model to initialize a Tokenizers backend: {e}. We are falling back so we are falling back to the standard loading method."
+                        )
+                        pass
+            # Fallback to vocab.json + merges.txt (BPE) or just vocab.json (WordLevel/WordPiece)
+            vocab, merges, files_loaded = load_vocab_and_merges(
                 pretrained_model_name_or_path,
-                "config.json",
-                cache_dir=cache_dir,
-                token=token,
-                local_files_only=local_files_only,
-                _raise_exceptions_for_missing_entries=False,
-                _raise_exceptions_for_connection_errors=False,
-                _commit_hash=_commit_hash,
+                cache_dir=kwargs.get("cache_dir"),
+                force_download=kwargs.get("force_download", False),
+                proxies=kwargs.get("proxies"),
+                token=kwargs.get("token"),
+                revision=kwargs.get("revision"),
+                local_files_only=kwargs.get("local_files_only", False),
+                subfolder=kwargs.get("subfolder", ""),
             )
 
-            # Detected using a (local) mistral tokenizer
-            mistral_config_detected = False
-            if _config_file is not None:
-                with open(_config_file, encoding="utf-8") as f:
-                    _config = json.load(f)
-                transformers_version = _config.get("transformers_version")
-                transformers_model_type = _config.get("model_type")
-
-                # Detect if we can skip the mistral fix by
-                #   a) having a non-mistral tokenizer
-                #   b) fixed version of transformers
-                if transformers_version and version.parse(transformers_version) <= version.parse("4.57.2"):
-                    if (
-                        _is_local
-                        and transformers_model_type is not None
-                        and transformers_model_type
-                        not in [
-                            "mistral",
-                            "mistral3",
-                            "voxtral",
-                            "ministral",
-                            "pixtral",
-                        ]
-                    ):
-                        return tokenizer
-                elif transformers_version and version.parse(transformers_version) >= version.parse("5.0.0"):
-                    return tokenizer
-
-                mistral_config_detected = True
-
-            if mistral_config_detected or (not _is_local and is_base_mistral(pretrained_model_name_or_path)):
-                # Expose the `fix_mistral_regex` flag on the tokenizer when provided, even if no correction is applied.
-                if init_kwargs and "fix_mistral_regex" in init_kwargs:
-                    setattr(tokenizer, "fix_mistral_regex", init_kwargs["fix_mistral_regex"])
-
-                # only warn if its not explicitly passed
-                if fix_mistral_regex is None and not getattr(tokenizer, "fix_mistral_regex", False):
-                    setattr(tokenizer, "fix_mistral_regex", False)
-                    logger.warning(
-                        f"The tokenizer you are loading from '{pretrained_model_name_or_path}'"
-                        f" with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e."
-                        " This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue."
-                    )
-                elif fix_mistral_regex is True or getattr(tokenizer, "fix_mistral_regex", False):
-                    setattr(tokenizer, "fix_mistral_regex", True)
-                    import tokenizers
-
-                    tokenizer.backend_tokenizer.pre_tokenizer[0] = tokenizers.pre_tokenizers.Split(
-                        pattern=tokenizers.Regex(
-                            r"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+"
-                        ),
-                        behavior="isolated",
-                    )
-        return tokenizer
+            if vocab is not None:
+                try:
+                    import inspect as _inspect
+
+                    class_sig = _inspect.signature(getattr(cls, "__init__", cls))
+                    init_kwargs["backend"] = "tokenizers"
+                    init_kwargs["files_loaded"] = files_loaded
+
+                    if merges is not None and "merges" in class_sig.parameters:
+                        return cls.from_pretrained(
+                            pretrained_model_name_or_path,
+                            *init_inputs,
+                            vocab=vocab,
+                            merges=merges,
+                            **init_kwargs,
+                        )
+                    elif "vocab" in class_sig.parameters:
+                        return cls.from_pretrained(
+                            pretrained_model_name_or_path,
+                            *init_inputs,
+                            vocab=vocab,
+                            **init_kwargs,
+                        )
+                except Exception:
+                    pass
+        if added_tokens_decoder != {} and max(list(added_tokens_decoder.keys())[-1], 0) > tokenizer.vocab_size:
+            logger.info(
+                "Special tokens have been added in the vocabulary, make sure the associated word embeddings are"
+                " fine-tuned or trained."
+            )
 
-    @staticmethod
-    def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
-        # This method should be deleted in Transformers v5
-        # Its only purpose is to potentially throw a warning
-        # that incorrectly defined max lengths of T5's tokenizer are used
-        # which we will correct in Transformers v5.
-        return max_model_length
+        return tokenizer
 
     @classmethod
     def convert_added_tokens(cls, obj: Union[AddedToken, Any], save=False, add_type_field=True):
@@ -2560,53 +2260,6 @@ def convert_added_tokens(cls, obj: Union[AddedToken, Any], save=False, add_type_
             return {k: cls.convert_added_tokens(v, save=save, add_type_field=add_type_field) for k, v in obj.items()}
         return obj
 
-    def save_chat_templates(
-        self,
-        save_directory: Union[str, os.PathLike],
-        tokenizer_config: dict,
-        filename_prefix: Optional[str],
-    ):
-        """
-        Writes chat templates out to the save directory if we're using the new format, and removes them from
-        the tokenizer config if present. If we're using the legacy format, it doesn't write any files, and instead
-        writes the templates to the tokenizer config in the correct format.
-        """
-        chat_template_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + CHAT_TEMPLATE_FILE
-        )
-        chat_template_dir = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + CHAT_TEMPLATE_DIR
-        )
-
-        saved_raw_chat_template_files = []
-        if isinstance(self.chat_template, str):
-            # New format for single templates is to save them as chat_template.jinja
-            with open(chat_template_file, "w", encoding="utf-8") as f:
-                f.write(self.chat_template)
-            logger.info(f"chat template saved in {chat_template_file}")
-            saved_raw_chat_template_files.append(chat_template_file)
-            if "chat_template" in tokenizer_config:
-                tokenizer_config.pop("chat_template")  # To ensure it doesn't somehow end up in the config too
-        elif isinstance(self.chat_template, dict):
-            # New format for multiple templates is to save the default as chat_template.jinja
-            # and the other templates in the chat_templates/ directory
-            for template_name, template in self.chat_template.items():
-                if template_name == "default":
-                    with open(chat_template_file, "w", encoding="utf-8") as f:
-                        f.write(self.chat_template["default"])
-                    logger.info(f"chat template saved in {chat_template_file}")
-                    saved_raw_chat_template_files.append(chat_template_file)
-                else:
-                    Path(chat_template_dir).mkdir(exist_ok=True)
-                    template_filepath = os.path.join(chat_template_dir, f"{template_name}.jinja")
-                    with open(template_filepath, "w", encoding="utf-8") as f:
-                        f.write(template)
-                    logger.info(f"chat template saved in {template_filepath}")
-                    saved_raw_chat_template_files.append(template_filepath)
-        if "chat_template" in tokenizer_config:
-            tokenizer_config.pop("chat_template")  # To ensure it doesn't somehow end up in the config too
-        return tokenizer_config, saved_raw_chat_template_files
-
     def save_pretrained(
         self,
         save_directory: Union[str, os.PathLike],
@@ -2650,6 +2303,7 @@ def save_pretrained(
         Returns:
             A tuple of `str`: The files saved.
         """
+
         if os.path.isfile(save_directory):
             logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
             return
@@ -2662,9 +2316,6 @@ def save_pretrained(
             repo_id = create_repo(repo_id, exist_ok=True, **kwargs).repo_id
             files_timestamps = self._get_files_timestamps(save_directory)
 
-        special_tokens_map_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE
-        )
         tokenizer_config_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE
         )
@@ -2674,23 +2325,22 @@ def save_pretrained(
         # Let's save the init kwargs
         target_keys = set(self.init_kwargs.keys())
         # Let's save the special tokens map (only the strings)
-        target_keys.update(["model_max_length", "clean_up_tokenization_spaces"])
+        target_keys.update(["model_max_length"])
 
         for k in target_keys:
             if hasattr(self, k):
                 tokenizer_config[k] = getattr(self, k)
 
         # Let's make sure we properly save the special tokens
+        # V5: Save both named tokens and extra tokens
         tokenizer_config.update(self.special_tokens_map)
-        if "extra_special_tokens" not in tokenizer_config:
+        if self._extra_special_tokens:
             tokenizer_config["extra_special_tokens"] = self.extra_special_tokens
-            tokenizer_config.update(self.extra_special_tokens)
 
+        save_jinja_files = kwargs.get("save_jinja_files", True)
         tokenizer_config, saved_raw_chat_template_files = self.save_chat_templates(
-            save_directory, tokenizer_config, filename_prefix
+            save_directory, tokenizer_config, filename_prefix, save_jinja_files
         )
-        if getattr(self, "response_schema", None) is not None:
-            tokenizer_config["response_schema"] = self.response_schema
 
         if len(self.init_inputs) > 0:
             tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
@@ -2699,7 +2349,6 @@ def save_pretrained(
 
         # no typefields, this way old fast and slow can load it
         tokenizer_config = self.convert_added_tokens(tokenizer_config, add_type_field=True, save=True)
-
         # Process added tokens separately: allows previous versions to ignore it!
         added_tokens = {}
         for key, value in self.added_tokens_decoder.items():
@@ -2708,6 +2357,11 @@ def save_pretrained(
 
         # Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained
         tokenizer_class = self.__class__.__name__
+
+        # tokenizers backend don't need to save added_tokens_decoder
+        if any(base.__name__ == "TokenizersBackend" for base in self.__class__.__mro__):
+            tokenizer_config.pop("added_tokens_decoder", None)
+
         # Remove the Fast at the end if we can save the slow tokenizer
         if tokenizer_class.endswith("Fast") and getattr(self, "can_save_slow_tokenizer", False):
             tokenizer_class = tokenizer_class[:-4]
@@ -2716,7 +2370,7 @@ def save_pretrained(
             tokenizer_config["auto_map"] = self._auto_map
         if getattr(self, "_processor_class", None) is not None:
             tokenizer_config["processor_class"] = self._processor_class
-
+        tokenizer_config.pop("files_loaded", None)
         # If we have a custom model, we copy the file defining it in the folder and set the attributes so it can be
         # loaded from the Hub.
         if self._auto_class is not None:
@@ -2729,6 +2383,8 @@ def save_pretrained(
             tokenizer_config.pop("tokenizer_file", None)
         if "device_map" in tokenizer_config:
             tokenizer_config.pop("device_map")
+        if "slow_tokenizer_class" in tokenizer_config:
+            tokenizer_config.pop("slow_tokenizer_class")
 
         with open(tokenizer_config_file, "w", encoding="utf-8") as f:
             out_str = json.dumps(tokenizer_config, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
@@ -2737,14 +2393,7 @@ def save_pretrained(
 
         # Sanitize AddedTokens in special_tokens_map
 
-        # kept for forward compatibility, will be removed in transoformers 5. Typefields are not saved for FC, special should not be save either
-        write_dict = self.convert_added_tokens(self.special_tokens_map_extended, save=True, add_type_field=False)
-        with open(special_tokens_map_file, "w", encoding="utf-8") as f:
-            out_str = json.dumps(write_dict, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
-            f.write(out_str)
-        logger.info(f"Special tokens file saved in {special_tokens_map_file}")
-
-        file_names = (tokenizer_config_file, special_tokens_map_file, *saved_raw_chat_template_files)
+        file_names = (tokenizer_config_file, *saved_raw_chat_template_files)
 
         save_files = self._save_pretrained(
             save_directory=save_directory,
@@ -2775,7 +2424,7 @@ def _save_pretrained(
         Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.
 
         Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
-        specific [`~tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`]
+        specific [`~tokenization_utils_tokenizers.PreTrainedTokenizerFast._save_pretrained`]
         """
         if legacy_format is False:
             raise ValueError(
@@ -2875,12 +2524,21 @@ def encode(
                 the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                 method).
         """
-        encoded_inputs = self.encode_plus(
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=kwargs.get("pad_to_multiple_of"),
+            verbose=kwargs.get("verbose", True),
+            **kwargs,
+        )
+
+        encoded_inputs = self._encode_plus(
             text,
             text_pair=text_pair,
             add_special_tokens=add_special_tokens,
-            padding=padding,
-            truncation=truncation,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
             max_length=max_length,
             stride=stride,
             padding_side=padding_side,
@@ -2893,6 +2551,46 @@ def encode(
     def num_special_tokens_to_add(self, pair: bool = False) -> int:
         raise NotImplementedError
 
+    @property
+    def max_len_single_sentence(self) -> int:
+        """
+        `int`: The maximum length of a sentence that can be fed to the model.
+        """
+        return self.model_max_length - self.num_special_tokens_to_add(pair=False)
+
+    @max_len_single_sentence.setter
+    def max_len_single_sentence(self, value) -> None:
+        # For backward compatibility, allow to try to setup 'max_len_single_sentence'.
+        if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose:
+            if not self.deprecation_warnings.get("max_len_single_sentence", False):
+                logger.warning(
+                    "Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
+                )
+            self.deprecation_warnings["max_len_single_sentence"] = True
+        else:
+            raise ValueError(
+                "Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up."
+            )
+
+    @property
+    def max_len_sentences_pair(self) -> int:
+        """
+        `int`: The maximum combined length of a pair of sentences that can be fed to the model.
+        """
+        return self.model_max_length - self.num_special_tokens_to_add(pair=True)
+
+    @max_len_sentences_pair.setter
+    def max_len_sentences_pair(self, value) -> None:
+        # For backward compatibility, allow to try to setup 'max_len_sentences_pair'.
+        if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose:
+            if not self.deprecation_warnings.get("max_len_sentences_pair", False):
+                logger.warning(
+                    "Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up."
+                )
+            self.deprecation_warnings["max_len_sentences_pair"] = True
+        else:
+            raise ValueError("Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up.")
+
     def _get_padding_truncation_strategies(
         self, padding=False, truncation=None, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
     ):
@@ -2900,19 +2598,9 @@ def _get_padding_truncation_strategies(
         Find the correct padding/truncation strategy
         """
 
-        # Backward compatibility for previous behavior, maybe we should deprecate it:
+        # Backward compatibility for previous behavior:
         # If you only set max_length, it activates truncation for max_length
         if max_length is not None and padding is False and truncation is None:
-            if verbose:
-                if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
-                    logger.warning(
-                        "Truncation was not explicitly activated but `max_length` is provided a specific value, please"
-                        " use `truncation=True` to explicitly truncate examples to max length. Defaulting to"
-                        " 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the"
-                        " tokenizer you can select this strategy more precisely by providing a specific strategy to"
-                        " `truncation`."
-                    )
-                self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
             truncation = "longest_first"
 
         # Get padding strategy
@@ -2951,26 +2639,12 @@ def _get_padding_truncation_strategies(
         if max_length is None:
             if padding_strategy == PaddingStrategy.MAX_LENGTH:
                 if self.model_max_length > LARGE_INTEGER:
-                    if verbose:
-                        if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
-                            logger.warning(
-                                "Asking to pad to max_length but no maximum length is provided and the model has no"
-                                " predefined maximum length. Default to no padding."
-                            )
-                        self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
                     padding_strategy = PaddingStrategy.DO_NOT_PAD
                 else:
                     max_length = self.model_max_length
 
             if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
                 if self.model_max_length > LARGE_INTEGER:
-                    if verbose:
-                        if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
-                            logger.warning(
-                                "Asking to truncate to max_length but no maximum length is provided and the model has"
-                                " no predefined maximum length. Default to no truncation."
-                            )
-                        self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
                     truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
                 else:
                     max_length = self.model_max_length
@@ -2988,338 +2662,25 @@ def _get_padding_truncation_strategies(
             truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
             and padding_strategy != PaddingStrategy.DO_NOT_PAD
             and pad_to_multiple_of is not None
-            and max_length is not None
-            and (max_length % pad_to_multiple_of != 0)
-        ):
-            raise ValueError(
-                "Truncation and padding are both activated but "
-                f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
-            )
-
-        return padding_strategy, truncation_strategy, max_length, kwargs
-
-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    def __call__(
-        self,
-        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput], None] = None,
-        text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
-        text_target: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput], None] = None,
-        text_pair_target: Optional[
-            Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]
-        ] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy, None] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        is_split_into_words: bool = False,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        """
-        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
-        sequences.
-
-        Args:
-            text (`str`, `list[str]`, `list[list[str]]`, *optional*):
-                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
-                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
-                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            text_pair (`str`, `list[str]`, `list[list[str]]`, *optional*):
-                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
-                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
-                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            text_target (`str`, `list[str]`, `list[list[str]]`, *optional*):
-                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
-                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
-                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            text_pair_target (`str`, `list[str]`, `list[list[str]]`, *optional*):
-                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
-                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
-                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-        """
-        # To avoid duplicating
-        all_kwargs = {
-            "add_special_tokens": add_special_tokens,
-            "padding": padding,
-            "truncation": truncation,
-            "max_length": max_length,
-            "stride": stride,
-            "is_split_into_words": is_split_into_words,
-            "pad_to_multiple_of": pad_to_multiple_of,
-            "padding_side": padding_side,
-            "return_tensors": return_tensors,
-            "return_token_type_ids": return_token_type_ids,
-            "return_attention_mask": return_attention_mask,
-            "return_overflowing_tokens": return_overflowing_tokens,
-            "return_special_tokens_mask": return_special_tokens_mask,
-            "return_offsets_mapping": return_offsets_mapping,
-            "return_length": return_length,
-            "split_special_tokens": kwargs.pop("split_special_tokens", self.split_special_tokens),
-            "verbose": verbose,
-        }
-
-        all_kwargs.update(kwargs)
-        if text is None and text_target is None:
-            raise ValueError("You need to specify either `text` or `text_target`.")
-        if text is not None:
-            # The context manager will send the inputs as normal texts and not text_target, but we shouldn't change the
-            # input mode in this case.
-            if not self._in_target_context_manager:
-                self._switch_to_input_mode()
-            encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
-        if text_target is not None:
-            self._switch_to_target_mode()
-            target_encodings = self._call_one(text=text_target, text_pair=text_pair_target, **all_kwargs)
-        # Leave back tokenizer in input mode
-        self._switch_to_input_mode()
-
-        if text_target is None:
-            return encodings
-        elif text is None:
-            return target_encodings
-        else:
-            encodings["labels"] = target_encodings["input_ids"]
-            return encodings
-
-    def _call_one(
-        self,
-        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
-        text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy, None] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        is_split_into_words: bool = False,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        split_special_tokens: bool = False,
-        **kwargs,
-    ) -> BatchEncoding:
-        # Input type checking for clearer error
-        def _is_valid_text_input(t):
-            if isinstance(t, str):
-                # Strings are fine
-                return True
-            elif isinstance(t, (list, tuple)):
-                # List are fine as long as they are...
-                if len(t) == 0:
-                    # ... empty
-                    return True
-                elif isinstance(t[0], str):
-                    # ... list of strings
-                    return True
-                elif isinstance(t[0], (list, tuple)):
-                    # ... list with an empty list or with a list of strings
-                    return len(t[0]) == 0 or isinstance(t[0][0], str)
-                else:
-                    return False
-            else:
-                return False
-
-        if not _is_valid_text_input(text):
-            raise ValueError(
-                "text input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) "
-                "or `list[list[str]]` (batch of pretokenized examples)."
-            )
-
-        if text_pair is not None and not _is_valid_text_input(text_pair):
-            raise ValueError(
-                "text input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) "
-                "or `list[list[str]]` (batch of pretokenized examples)."
-            )
-
-        if is_split_into_words:
-            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
-        else:
-            is_batched = isinstance(text, (list, tuple))
-
-        if is_batched:
-            if isinstance(text_pair, str):
-                raise TypeError(
-                    "when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as"
-                    " `text`."
-                )
-            if text_pair is not None and len(text) != len(text_pair):
-                raise ValueError(
-                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
-                    f" {len(text_pair)}."
-                )
-            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
-            return self.batch_encode_plus(
-                batch_text_or_text_pairs=batch_text_or_text_pairs,
-                add_special_tokens=add_special_tokens,
-                padding=padding,
-                truncation=truncation,
-                max_length=max_length,
-                stride=stride,
-                is_split_into_words=is_split_into_words,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_tensors=return_tensors,
-                return_token_type_ids=return_token_type_ids,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_offsets_mapping=return_offsets_mapping,
-                return_length=return_length,
-                verbose=verbose,
-                split_special_tokens=split_special_tokens,
-                **kwargs,
-            )
-        else:
-            return self.encode_plus(
-                text=text,
-                text_pair=text_pair,
-                add_special_tokens=add_special_tokens,
-                padding=padding,
-                truncation=truncation,
-                max_length=max_length,
-                stride=stride,
-                is_split_into_words=is_split_into_words,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_tensors=return_tensors,
-                return_token_type_ids=return_token_type_ids,
-                return_attention_mask=return_attention_mask,
-                return_overflowing_tokens=return_overflowing_tokens,
-                return_special_tokens_mask=return_special_tokens_mask,
-                return_offsets_mapping=return_offsets_mapping,
-                return_length=return_length,
-                verbose=verbose,
-                split_special_tokens=split_special_tokens,
-                **kwargs,
-            )
-
-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    def encode_plus(
-        self,
-        text: Union[TextInput, PreTokenizedInput, EncodedInput],
-        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy, None] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        is_split_into_words: bool = False,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        **kwargs,
-    ) -> BatchEncoding:
-        """
-        Tokenize and prepare for the model a sequence or a pair of sequences.
-
-        
-
-        This method is deprecated, `__call__` should be used instead.
-
-        
-
-        Args:
-            text (`str`, `list[str]` or (for non-fast tokenizers) `list[int]`):
-                The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
-                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
-                method).
-            text_pair (`str`, `list[str]` or `list[int]`, *optional*):
-                Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
-                the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
-                method).
-        """
-
-        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            pad_to_multiple_of=pad_to_multiple_of,
-            verbose=verbose,
-            **kwargs,
-        )
-
-        return self._encode_plus(
-            text=text,
-            text_pair=text_pair,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            is_split_into_words=is_split_into_words,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            split_special_tokens=kwargs.pop("split_special_tokens", self.split_special_tokens),
-            **kwargs,
-        )
+            and max_length is not None
+            and (max_length % pad_to_multiple_of != 0)
+        ):
+            raise ValueError(
+                "Truncation and padding are both activated but "
+                f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
+            )
 
-    def _encode_plus(
-        self,
-        text: Union[TextInput, PreTokenizedInput, EncodedInput],
-        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        is_split_into_words: bool = False,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        split_special_tokens: bool = False,
-        **kwargs,
-    ) -> BatchEncoding:
-        raise NotImplementedError
+        return padding_strategy, truncation_strategy, max_length, kwargs
 
     @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    def batch_encode_plus(
+    def __call__(
         self,
-        batch_text_or_text_pairs: Union[
-            list[TextInput],
-            list[TextInputPair],
-            list[PreTokenizedInput],
-            list[PreTokenizedInputPair],
-            list[EncodedInput],
-            list[EncodedInputPair],
-        ],
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput], None] = None,
+        text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        text_target: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput], None] = None,
+        text_pair_target: Optional[
+            Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]
+        ] = None,
         add_special_tokens: bool = True,
         padding: Union[bool, str, PaddingStrategy] = False,
         truncation: Union[bool, str, TruncationStrategy, None] = None,
@@ -3336,67 +2697,113 @@ def batch_encode_plus(
         return_offsets_mapping: bool = False,
         return_length: bool = False,
         verbose: bool = True,
-        split_special_tokens: bool = False,
+        tokenizer_kwargs: Optional[dict[str, Any]] = None,
         **kwargs,
     ) -> BatchEncoding:
         """
-        Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.
-
-        
-
-        This method is deprecated, `__call__` should be used instead.
-
-        
+        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
+        sequences.
 
         Args:
-            batch_text_or_text_pairs (`list[str]`, `list[tuple[str, str]]`, `list[list[str]]`, `list[tuple[list[str], list[str]]]`, and for not-fast tokenizers, also `list[list[int]]`, `list[tuple[list[int], list[int]]]`):
-                Batch of sequences or pair of sequences to be encoded. This can be a list of
-                string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see
-                details in `encode_plus`).
+            text (`str`, `list[str]`, `list[list[str]]`, *optional*):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            text_pair (`str`, `list[str]`, `list[list[str]]`, *optional*):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            text_target (`str`, `list[str]`, `list[list[str]]`, *optional*):
+                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
+                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
+                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            text_pair_target (`str`, `list[str]`, `list[list[str]]`, *optional*):
+                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
+                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
+                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            tokenizer_kwargs (`dict[str, Any]`, *optional*):
+                Additional kwargs to pass to the tokenizer. These will be merged with the explicit parameters and
+                other kwargs, with explicit parameters taking precedence.
         """
+        # To avoid duplicating
+        all_kwargs = {
+            "add_special_tokens": add_special_tokens,
+            "padding": padding,
+            "truncation": truncation,
+            "max_length": max_length,
+            "stride": stride,
+            "is_split_into_words": is_split_into_words,
+            "pad_to_multiple_of": pad_to_multiple_of,
+            "padding_side": padding_side,
+            "return_tensors": return_tensors,
+            "return_token_type_ids": return_token_type_ids,
+            "return_attention_mask": return_attention_mask,
+            "return_overflowing_tokens": return_overflowing_tokens,
+            "return_special_tokens_mask": return_special_tokens_mask,
+            "return_offsets_mapping": return_offsets_mapping,
+            "return_length": return_length,
+            "split_special_tokens": kwargs.pop("split_special_tokens", self.split_special_tokens),
+            "verbose": verbose,
+        }
+
+        max_target_length = kwargs.pop("max_target_length", None)
+
+        # First merge tokenizer_kwargs, then other kwargs (explicit params take precedence)
+        if tokenizer_kwargs is not None:
+            all_kwargs.update(tokenizer_kwargs)
+        all_kwargs.update(kwargs)
+        if text is None and text_target is None:
+            raise ValueError("You need to specify either `text` or `text_target`.")
 
-        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
         padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            pad_to_multiple_of=pad_to_multiple_of,
-            verbose=verbose,
+            padding=all_kwargs.pop("padding", False),
+            truncation=all_kwargs.pop("truncation", None),
+            max_length=all_kwargs.pop("max_length", None),
+            pad_to_multiple_of=all_kwargs.get("pad_to_multiple_of"),
+            verbose=all_kwargs.get("verbose", True),
             **kwargs,
         )
 
-        return self._batch_encode_plus(
-            batch_text_or_text_pairs=batch_text_or_text_pairs,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            is_split_into_words=is_split_into_words,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            split_special_tokens=split_special_tokens,
-            **kwargs,
-        )
+        if text is not None:
+            # The context manager will send the inputs as normal texts and not text_target, but we shouldn't change the
+            # input mode in this case.
+            if not self._in_target_context_manager and hasattr(self, "_switch_to_input_mode"):
+                self._switch_to_input_mode()
+            encodings = self._encode_plus(
+                text=text,
+                text_pair=text_pair,
+                padding_strategy=padding_strategy,
+                truncation_strategy=truncation_strategy,
+                max_length=max_length,
+                **all_kwargs,
+            )
+        if text_target is not None:
+            if hasattr(self, "_switch_to_target_mode"):
+                self._switch_to_target_mode()
+            target_encodings = self._encode_plus(
+                text=text_target,
+                text_pair=text_pair_target,
+                padding_strategy=padding_strategy,
+                truncation_strategy=truncation_strategy,
+                max_length=max_target_length if max_target_length is not None else max_length,
+                **all_kwargs,
+            )
+            # Leave back tokenizer in input mode
+            if hasattr(self, "_switch_to_input_mode"):
+                self._switch_to_input_mode()
+
+        if text_target is None:
+            return encodings
+        elif text is None:
+            return target_encodings
+        else:
+            encodings["labels"] = target_encodings["input_ids"]
+            return encodings
 
-    def _batch_encode_plus(
+    def _encode_plus(
         self,
-        batch_text_or_text_pairs: Union[
-            list[TextInput],
-            list[TextInputPair],
-            list[PreTokenizedInput],
-            list[PreTokenizedInputPair],
-            list[EncodedInput],
-            list[EncodedInputPair],
-        ],
+        text: Union[TextInput, PreTokenizedInput, EncodedInput],
+        text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
         add_special_tokens: bool = True,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
@@ -3495,14 +2902,6 @@ def pad(
             verbose (`bool`, *optional*, defaults to `True`):
                 Whether or not to print more information and warnings.
         """
-        if self.__class__.__name__.endswith("Fast"):
-            if not self.deprecation_warnings.get("Asking-to-pad-a-fast-tokenizer", False):
-                logger.warning_advice(
-                    f"You're using a {self.__class__.__name__} tokenizer. Please note that with a fast tokenizer,"
-                    " using the `__call__` method is faster than using a method to encode the text followed by a call"
-                    " to the `pad` method to get a padded encoding."
-                )
-                self.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
 
         # If we have a list of dicts, let's convert it in a dict of lists
         # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
@@ -3574,336 +2973,30 @@ def pad(
         batch_size = len(required_input)
         assert all(len(v) == batch_size for v in encoded_inputs.values()), (
             "Some items in the output dictionary have a different batch size than others."
-        )
-
-        if padding_strategy == PaddingStrategy.LONGEST:
-            max_length = max(len(inputs) for inputs in required_input)
-            padding_strategy = PaddingStrategy.MAX_LENGTH
-
-        batch_outputs = {}
-        for i in range(batch_size):
-            inputs = {k: v[i] for k, v in encoded_inputs.items()}
-            outputs = self._pad(
-                inputs,
-                max_length=max_length,
-                padding_strategy=padding_strategy,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_attention_mask=return_attention_mask,
-            )
-
-            for key, value in outputs.items():
-                if key not in batch_outputs:
-                    batch_outputs[key] = []
-                batch_outputs[key].append(value)
-
-        return BatchEncoding(batch_outputs, tensor_type=return_tensors)
-
-    def create_token_type_ids_from_sequences(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Create the token type IDs corresponding to the sequences passed. [What are token type
-        IDs?](../glossary#token-type-ids)
-
-        Should be overridden in a subclass if the model has a special way of building those.
-
-        Args:
-            token_ids_0 (`list[int]`): The first tokenized sequence.
-            token_ids_1 (`list[int]`, *optional*): The second tokenized sequence.
-
-        Returns:
-            `list[int]`: The token type ids.
-        """
-        cls_len = int(getattr(self, "cls_token_id", None) is not None)
-        sep_len = int(getattr(self, "sep_token_id", None) is not None)
-
-        if token_ids_1 is None:
-            return [0] * (cls_len + len(token_ids_0) + sep_len)
-
-        return [0] * (cls_len + len(token_ids_0) + sep_len) + [1] * (len(token_ids_1) + sep_len)
-
-    def build_inputs_with_special_tokens(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
-    ) -> list[int]:
-        """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens.
-
-        This implementation does not add special tokens and this method should be overridden in a subclass.
-
-        Args:
-            token_ids_0 (`list[int]`): The first tokenized sequence.
-            token_ids_1 (`list[int]`, *optional*): The second tokenized sequence.
-
-        Returns:
-            `list[int]`: The model input with special tokens.
-        """
-        if token_ids_1 is None:
-            return token_ids_0
-        return token_ids_0 + token_ids_1
-
-    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
-    def prepare_for_model(
-        self,
-        ids: list[int],
-        pair_ids: Optional[list[int]] = None,
-        add_special_tokens: bool = True,
-        padding: Union[bool, str, PaddingStrategy] = False,
-        truncation: Union[bool, str, TruncationStrategy, None] = None,
-        max_length: Optional[int] = None,
-        stride: int = 0,
-        pad_to_multiple_of: Optional[int] = None,
-        padding_side: Optional[str] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-        return_token_type_ids: Optional[bool] = None,
-        return_attention_mask: Optional[bool] = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        prepend_batch_axis: bool = False,
-        **kwargs,
-    ) -> BatchEncoding:
-        """
-        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
-        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
-        manages a moving window (with user defined stride) for overflowing tokens. Please Note, for *pair_ids*
-        different than `None` and *truncation_strategy = longest_first* or `True`, it is not possible to return
-        overflowing tokens. Such a combination of arguments will raise an error.
-
-        Args:
-            ids (`list[int]`):
-                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and
-                `convert_tokens_to_ids` methods.
-            pair_ids (`list[int]`, *optional*):
-                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
-                and `convert_tokens_to_ids` methods.
-        """
-
-        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
-        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
-            padding=padding,
-            truncation=truncation,
-            max_length=max_length,
-            pad_to_multiple_of=pad_to_multiple_of,
-            verbose=verbose,
-            **kwargs,
-        )
-
-        pair = pair_ids is not None
-        len_ids = len(ids)
-        len_pair_ids = len(pair_ids) if pair else 0
-
-        if return_token_type_ids and not add_special_tokens:
-            raise ValueError(
-                "Asking to return token_type_ids while setting add_special_tokens to False "
-                "results in an undefined behavior. Please set add_special_tokens to True or "
-                "set return_token_type_ids to None."
-            )
-
-        if (
-            return_overflowing_tokens
-            and truncation_strategy == TruncationStrategy.LONGEST_FIRST
-            and pair_ids is not None
-        ):
-            raise ValueError(
-                "Not possible to return overflowing tokens for pair of sequences with the "
-                "`longest_first`. Please select another truncation strategy than `longest_first`, "
-                "for instance `only_second` or `only_first`."
-            )
-
-        # Load from model defaults
-        if return_token_type_ids is None:
-            return_token_type_ids = "token_type_ids" in self.model_input_names
-        if return_attention_mask is None:
-            return_attention_mask = "attention_mask" in self.model_input_names
-
-        encoded_inputs = {}
-
-        # Compute the total size of the returned encodings
-        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)
-
-        # Truncation: Handle max sequence length
-        overflowing_tokens = []
-        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
-            ids, pair_ids, overflowing_tokens = self.truncate_sequences(
-                ids,
-                pair_ids=pair_ids,
-                num_tokens_to_remove=total_len - max_length,
-                truncation_strategy=truncation_strategy,
-                stride=stride,
-            )
-
-        if return_overflowing_tokens:
-            encoded_inputs["overflowing_tokens"] = overflowing_tokens
-            encoded_inputs["num_truncated_tokens"] = total_len - max_length
-
-        # Add special tokens
-        if add_special_tokens:
-            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
-            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
-        else:
-            sequence = ids + pair_ids if pair else ids
-            token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
-
-        # Build output dictionary
-        encoded_inputs["input_ids"] = sequence
-        if return_token_type_ids:
-            encoded_inputs["token_type_ids"] = token_type_ids
-        if return_special_tokens_mask:
-            if add_special_tokens:
-                encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
-            else:
-                encoded_inputs["special_tokens_mask"] = [0] * len(sequence)
-
-        # Check lengths
-        self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)
-
-        # Padding
-        if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
-            encoded_inputs = self.pad(
-                encoded_inputs,
-                max_length=max_length,
-                padding=padding_strategy.value,
-                pad_to_multiple_of=pad_to_multiple_of,
-                padding_side=padding_side,
-                return_attention_mask=return_attention_mask,
-            )
-
-        if return_length:
-            encoded_inputs["length"] = len(encoded_inputs["input_ids"])
-
-        batch_outputs = BatchEncoding(
-            encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
-        )
-
-        return batch_outputs
-
-    def truncate_sequences(
-        self,
-        ids: list[int],
-        pair_ids: Optional[list[int]] = None,
-        num_tokens_to_remove: int = 0,
-        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
-        stride: int = 0,
-    ) -> tuple[list[int], list[int], list[int]]:
-        """
-        Truncates a sequence pair in-place following the strategy.
-
-        Args:
-            ids (`list[int]`):
-                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and
-                `convert_tokens_to_ids` methods.
-            pair_ids (`list[int]`, *optional*):
-                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
-                and `convert_tokens_to_ids` methods.
-            num_tokens_to_remove (`int`, *optional*, defaults to 0):
-                Number of tokens to remove using the truncation strategy.
-            truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `'longest_first'`):
-                The strategy to follow for truncation. Can be:
-
-                - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will truncate
-                  token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a
-                  batch of pairs) is provided.
-                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater
-                  than the model maximum admissible input size).
-            stride (`int`, *optional*, defaults to 0):
-                If set to a positive number, the overflowing tokens returned will contain some tokens from the main
-                sequence returned. The value of this argument defines the number of additional tokens.
-
-        Returns:
-            `tuple[list[int], list[int], list[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of
-            overflowing tokens. Note: The *longest_first* strategy returns empty list of overflowing tokens if a pair
-            of sequences (or a batch of pairs) is provided.
-        """
-        if num_tokens_to_remove <= 0:
-            return ids, pair_ids, []
-
-        if not isinstance(truncation_strategy, TruncationStrategy):
-            truncation_strategy = TruncationStrategy(truncation_strategy)
+        )
 
-        overflowing_tokens = []
-        if truncation_strategy == TruncationStrategy.ONLY_FIRST or (
-            truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None
-        ):
-            if len(ids) > num_tokens_to_remove:
-                window_len = min(len(ids), stride + num_tokens_to_remove)
-                if self.truncation_side == "left":
-                    overflowing_tokens = ids[:window_len]
-                    ids = ids[num_tokens_to_remove:]
-                elif self.truncation_side == "right":
-                    overflowing_tokens = ids[-window_len:]
-                    ids = ids[:-num_tokens_to_remove]
-                else:
-                    raise ValueError(f"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'.")
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = max(len(inputs) for inputs in required_input)
+            padding_strategy = PaddingStrategy.MAX_LENGTH
 
-            else:
-                error_msg = (
-                    f"We need to remove {num_tokens_to_remove} to truncate the input "
-                    f"but the first sequence has a length {len(ids)}. "
-                )
-                if truncation_strategy == TruncationStrategy.ONLY_FIRST:
-                    error_msg = (
-                        error_msg + "Please select another truncation strategy than "
-                        f"{truncation_strategy}, for instance 'longest_first' or 'only_second'."
-                    )
-                logger.error(error_msg)
-        elif truncation_strategy == TruncationStrategy.LONGEST_FIRST:
-            logger.warning(
-                "Be aware, overflowing tokens are not returned for the setting you have chosen,"
-                f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' "
-                "truncation strategy. So the returned list will always be empty even if some "
-                "tokens have been removed."
+        batch_outputs = {}
+        for i in range(batch_size):
+            inputs = {k: v[i] for k, v in encoded_inputs.items()}
+            outputs = self._pad(
+                inputs,
+                max_length=max_length,
+                padding_strategy=padding_strategy,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_attention_mask=return_attention_mask,
             )
-            len_pair_ids = len(pair_ids) if pair_ids is not None else 0
-            len_ids = len(ids)
-            first_remove = min(abs(len_pair_ids - len_ids), num_tokens_to_remove)
-            second_remove = num_tokens_to_remove - first_remove
-            if len_ids > len_pair_ids:
-                ids_to_move = first_remove + second_remove // 2
-                pair_ids_to_move = second_remove - second_remove // 2
-            else:
-                ids_to_move = second_remove // 2
-                pair_ids_to_move = first_remove + second_remove - (second_remove // 2)
-
-            if self.truncation_side == "right":
-                ids = ids[:-ids_to_move] if ids_to_move > 0 else ids
-                pair_ids = pair_ids[:-pair_ids_to_move] if pair_ids is not None and pair_ids_to_move > 0 else pair_ids
-            elif self.truncation_side == "left":
-                ids = ids[ids_to_move:]
-                pair_ids = pair_ids[pair_ids_to_move:] if pair_ids is not None else None
-            else:
-                raise ValueError(f"invalid truncation strategy:{self.truncation_side}")
-
-        elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
-            if len(pair_ids) > num_tokens_to_remove:
-                window_len = min(len(pair_ids), stride + num_tokens_to_remove)
-                if self.truncation_side == "right":
-                    overflowing_tokens = pair_ids[-window_len:]
-                    pair_ids = pair_ids[:-num_tokens_to_remove]
-                elif self.truncation_side == "left":
-                    overflowing_tokens = pair_ids[:window_len]
-                    pair_ids = pair_ids[num_tokens_to_remove:]
-                else:
-                    raise ValueError(f"invalid truncation strategy:{self.truncation_side}")
-            else:
-                logger.error(
-                    f"We need to remove {num_tokens_to_remove} to truncate the input "
-                    f"but the second sequence has a length {len(pair_ids)}. "
-                    f"Please select another truncation strategy than {truncation_strategy}, "
-                    "for instance 'longest_first' or 'only_first'."
-                )
 
-        return (ids, pair_ids, overflowing_tokens)
+            for key, value in outputs.items():
+                if key not in batch_outputs:
+                    batch_outputs[key] = []
+                batch_outputs[key].append(value)
+
+        return BatchEncoding(batch_outputs, tensor_type=return_tensors)
 
     def _pad(
         self,
@@ -4000,55 +3093,69 @@ def convert_tokens_to_string(self, tokens: list[str]) -> str:
         """
         raise NotImplementedError
 
-    def batch_decode(
+    def decode(
         self,
-        sequences: Union[list[int], list[list[int]], np.ndarray, torch.Tensor],
+        token_ids: Union[int, list[int], list[list[int]], np.ndarray, "torch.Tensor"],
         skip_special_tokens: bool = False,
-        clean_up_tokenization_spaces: Optional[bool] = None,
         **kwargs,
-    ) -> list[str]:
+    ) -> Union[str, list[str]]:
         """
-        Convert a list of lists of token ids into a list of strings by calling decode.
+        Converts a sequence of ids into a string, or a list of sequences into a list of strings,
+        using the tokenizer and vocabulary with options to remove special tokens and clean up
+        tokenization spaces.
+
+        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
 
         Args:
-            sequences (`Union[list[int], list[list[int]], np.ndarray, torch.Tensor]`):
-                List of tokenized input ids. Can be obtained using the `__call__` method.
+            token_ids (`Union[int, list[int], list[list[int]], np.ndarray, torch.Tensor]`):
+                A single sequence or a batch (list of sequences) of tokenized input ids. Can be obtained using the
+                `__call__` method.
             skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
-            clean_up_tokenization_spaces (`bool`, *optional*):
-                Whether or not to clean up the tokenization spaces. If `None`, will default to
-                `self.clean_up_tokenization_spaces`.
             kwargs (additional keyword arguments, *optional*):
                 Will be passed to the underlying model specific decode method.
 
         Returns:
-            `list[str]`: The list of decoded sentences.
+            `Union[str, list[str]]`: The decoded string for a single sequence, or a list of decoded strings for a
+            batch of sequences.
         """
-        return [
-            self.decode(
-                seq,
-                skip_special_tokens=skip_special_tokens,
-                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
-                **kwargs,
-            )
-            for seq in sequences
-        ]
+        # Convert inputs to python lists
+        token_ids = to_py_obj(token_ids)
 
-    def decode(
+        # If we received batched input, decode each sequence
+        if isinstance(token_ids, (list, tuple)) and len(token_ids) > 0 and isinstance(token_ids[0], (list, tuple)):
+            clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", False)
+            return [
+                self._decode(
+                    token_ids=seq,
+                    skip_special_tokens=skip_special_tokens,
+                    clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+                    **kwargs,
+                )
+                for seq in token_ids
+            ]
+
+        return self._decode(
+            token_ids=token_ids,
+            skip_special_tokens=skip_special_tokens,
+            **kwargs,
+        )
+
+    def batch_decode(
         self,
-        token_ids: Union[int, list[int], np.ndarray, torch.Tensor],
+        sequences: Union[list[int], list[list[int]], np.ndarray, "torch.Tensor"],
         skip_special_tokens: bool = False,
         clean_up_tokenization_spaces: Optional[bool] = None,
         **kwargs,
-    ) -> str:
+    ) -> list[str]:
         """
-        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
-        tokens and clean up tokenization spaces.
+        Convert a list of lists of token ids into a list of strings by calling decode.
 
-        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
+        This method is provided for backwards compatibility. The `decode` method now handles batched input natively,
+        so you can use `decode` directly instead of `batch_decode`.
 
         Args:
-            token_ids (`Union[int, list[int], np.ndarray, torch.Tensor]`):
+            sequences (`Union[list[int], list[list[int]], np.ndarray, torch.Tensor]`):
                 List of tokenized input ids. Can be obtained using the `__call__` method.
             skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
@@ -4059,17 +3166,19 @@ def decode(
                 Will be passed to the underlying model specific decode method.
 
         Returns:
-            `str`: The decoded sentence.
+            `list[str]`: The list of decoded sentences.
         """
-        # Convert inputs to python lists
-        token_ids = to_py_obj(token_ids)
-
-        return self._decode(
-            token_ids=token_ids,
+        # Forward to decode() which now handles batched input natively
+        result = self.decode(
+            token_ids=sequences,
             skip_special_tokens=skip_special_tokens,
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             **kwargs,
         )
+        # Ensure we always return a list for backwards compatibility
+        if isinstance(result, str):
+            return [result]
+        return result
 
     def _decode(
         self,
@@ -4080,62 +3189,6 @@ def _decode(
     ) -> str:
         raise NotImplementedError
 
-    def get_special_tokens_mask(
-        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
-    ) -> list[int]:
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
-
-        Args:
-            token_ids_0 (`list[int]`):
-                List of ids of the first sequence.
-            token_ids_1 (`list[int]`, *optional*):
-                List of ids of the second sequence.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
-
-        Returns:
-            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
-        """
-        assert already_has_special_tokens and token_ids_1 is None, (
-            "You cannot use ``already_has_special_tokens=False`` with this tokenizer. "
-            "Please use a slow (full python) tokenizer to activate this argument. "
-            "Or set `return_special_tokens_mask=True` when calling the encoding method "
-            "to get the special tokens mask in any tokenizer. "
-        )
-
-        all_special_ids = self.all_special_ids  # cache the property
-
-        special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0]
-
-        return special_tokens_mask
-
-    @staticmethod
-    def clean_up_tokenization(out_string: str) -> str:
-        """
-        Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms.
-
-        Args:
-            out_string (`str`): The text to clean up.
-
-        Returns:
-            `str`: The cleaned-up string.
-        """
-        out_string = (
-            out_string.replace(" .", ".")
-            .replace(" ?", "?")
-            .replace(" !", "!")
-            .replace(" ,", ",")
-            .replace(" ' ", "'")
-            .replace(" n't", "n't")
-            .replace(" 'm", "'m")
-            .replace(" 's", "'s")
-            .replace(" 've", "'ve")
-            .replace(" 're", "'re")
-        )
-        return out_string
-
     def _eventual_warn_about_too_long_sequence(self, ids: list[int], max_length: Optional[int], verbose: bool):
         """
         Depending on the input and internal state we might trigger a warning about a sequence that is too long for its
@@ -4156,175 +3209,365 @@ def _eventual_warn_about_too_long_sequence(self, ids: list[int], max_length: Opt
                 )
             self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True
 
-    def _switch_to_input_mode(self):
-        """
-        Private method to put the tokenizer in input mode (when it has different modes for input/outputs)
+    @classmethod
+    def register_for_auto_class(cls, auto_class="AutoTokenizer"):
         """
+        Register this class with a given auto class. This should only be used for custom tokenizers as the ones in the
+        library are already mapped with `AutoTokenizer`.
 
-    def _switch_to_target_mode(self):
-        """
-        Private method to put the tokenizer in target mode (when it has different modes for input/outputs)
+        Args:
+            auto_class (`str` or `type`, *optional*, defaults to `"AutoTokenizer"`):
+                The auto class to register this new tokenizer with.
         """
+        if not isinstance(auto_class, str):
+            auto_class = auto_class.__name__
 
-    @contextmanager
-    def as_target_tokenizer(self):
-        """
-        Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
-        sequence-to-sequence models that need a slightly different processing for the labels.
+        import transformers.models.auto as auto_module
+
+        if not hasattr(auto_module, auto_class):
+            raise ValueError(f"{auto_class} is not a valid auto class.")
+
+        cls._auto_class = auto_class
+
+    def apply_chat_template(
+        self,
+        conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
+        tools: Optional[list[Union[dict, Callable]]] = None,
+        documents: Optional[list[dict[str, str]]] = None,
+        chat_template: Optional[str] = None,
+        add_generation_prompt: bool = False,
+        continue_final_message: bool = False,
+        tokenize: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: bool = False,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_dict: bool = False,
+        return_assistant_tokens_mask: bool = False,
+        tokenizer_kwargs: Optional[dict[str, Any]] = None,
+        **kwargs,
+    ) -> Union[str, list[int], list[str], list[list[int]], BatchEncoding]:
         """
-        warnings.warn(
-            "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "
-            "labels by using the argument `text_target` of the regular `__call__` method (either in the same call as "
-            "your input texts if you use the same keyword arguments, or in a separate call."
-        )
-        self._switch_to_target_mode()
-        self._in_target_context_manager = True
-        yield
-        self._in_target_context_manager = False
-        self._switch_to_input_mode()
+        Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token
+        ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to
+        determine the format and control tokens to use when converting.
 
-    @classmethod
-    def register_for_auto_class(cls, auto_class="AutoTokenizer"):
+        Args:
+            conversation (Union[list[dict[str, str]], list[list[dict[str, str]]]]): A list of dicts
+                with "role" and "content" keys, representing the chat history so far.
+            tools (`list[Union[Dict, Callable]]`, *optional*):
+                A list of tools (callable functions) that will be accessible to the model. If the template does not
+                support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema,
+                giving the name, description and argument types for the tool. See our
+                [tool use guide](https://huggingface.co/docs/transformers/en/chat_extras#passing-tools)
+                for more information.
+            documents (`list[dict[str, str]]`, *optional*):
+                A list of dicts representing documents that will be accessible to the model if it is performing RAG
+                (retrieval-augmented generation). If the template does not support RAG, this argument will have no
+                effect. We recommend that each document should be a dict containing "title" and "text" keys.
+            chat_template (`str`, *optional*):
+                A Jinja template to use for this conversion. It is usually not necessary to pass anything to this
+                argument, as the model's template will be used by default.
+            add_generation_prompt (bool, *optional*):
+                If this is set, a prompt with the token(s) that indicate
+                the start of an assistant message will be appended to the formatted output. This is useful when you want to generate a response from the model.
+                Note that this argument will be passed to the chat template, and so it must be supported in the
+                template for this argument to have any effect.
+            continue_final_message (bool, *optional*):
+                If this is set, the chat will be formatted so that the final
+                message in the chat is open-ended, without any EOS tokens. The model will continue this message
+                rather than starting a new one. This allows you to "prefill" part of
+                the model's response for it. Cannot be used at the same time as `add_generation_prompt`.
+            tokenize (`bool`, defaults to `True`):
+                Whether to tokenize the output. If `False`, the output will be a string.
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                 index) among:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            truncation (`bool`, defaults to `False`):
+                Whether to truncate sequences at the maximum length. Has no effect if tokenize is `False`.
+            max_length (`int`, *optional*):
+                Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is `False`. If
+                not specified, the tokenizer's `max_length` attribute will be used as a default.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Has no effect if tokenize is `False`. Acceptable
+                values are:
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+            return_dict (`bool`, defaults to `False`):
+                Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`.
+            tokenizer_kwargs (`dict[str: Any]`, *optional*): Additional kwargs to pass to the tokenizer.
+            return_assistant_tokens_mask (`bool`, defaults to `False`):
+                Whether to return a mask of the assistant generated tokens. For tokens generated by the assistant,
+                the mask will contain 1. For user and system tokens, the mask will contain 0.
+                This functionality is only available for chat templates that support it via the `{% generation %}` keyword.
+            **kwargs: Additional kwargs to pass to the template renderer. Will be accessible by the chat template.
+
+        Returns:
+            `Union[list[int], Dict]`: A list of token ids representing the tokenized chat so far, including control tokens. This
+            output is ready to pass to the model, either directly or via methods like `generate()`. If `return_dict` is
+            set, will return a dict of tokenizer outputs instead.
         """
-        Register this class with a given auto class. This should only be used for custom tokenizers as the ones in the
-        library are already mapped with `AutoTokenizer`.
 
+        if return_dict and not tokenize:
+            raise ValueError(
+                "`return_dict=True` is incompatible with `tokenize=False`, because there is no dict "
+                "of tokenizer outputs to return."
+            )
+
+        if return_assistant_tokens_mask and not return_dict:
+            raise ValueError("`return_assistant_tokens_mask=True` is incompatible with `return_dict=False`")
+
+        if tokenizer_kwargs is None:
+            tokenizer_kwargs = {}
+
+        chat_template = self.get_chat_template(chat_template, tools)
+
+        if isinstance(conversation, (list, tuple)) and (
+            isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "messages")
+        ):
+            conversations = conversation
+            is_batched = True
+        else:
+            conversations = [conversation]
+            is_batched = False
+
+        if continue_final_message:
+            if add_generation_prompt:
+                raise ValueError(
+                    "continue_final_message and add_generation_prompt are not compatible. Use continue_final_message when you want the model to continue the final message, and add_generation_prompt when you want to add a header that will prompt it to start a new assistant message instead."
+                )
+            if return_assistant_tokens_mask:
+                raise ValueError("continue_final_message is not compatible with return_assistant_tokens_mask.")
+
+        template_kwargs = {**self.special_tokens_map, **kwargs}  # kwargs overwrite special tokens if both are present
+        rendered_chat, generation_indices = render_jinja_template(
+            conversations=conversations,
+            tools=tools,
+            documents=documents,
+            chat_template=chat_template,
+            return_assistant_tokens_mask=return_assistant_tokens_mask,
+            continue_final_message=continue_final_message,
+            add_generation_prompt=add_generation_prompt,
+            **template_kwargs,
+        )
+
+        if not is_batched:
+            rendered_chat = rendered_chat[0]
 
+        if tokenize:
+            out = self(
+                rendered_chat,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                add_special_tokens=False,
+                return_tensors=return_tensors,
+                **tokenizer_kwargs,
+            )
+            if return_dict:
+                if return_assistant_tokens_mask:
+                    assistant_masks = []
+                    if is_batched or return_tensors:
+                        input_ids = out["input_ids"]
+                    else:
+                        input_ids = [out["input_ids"]]
+                    for i in range(len(input_ids)):
+                        current_mask = [0] * len(input_ids[i])
+                        for assistant_start_char, assistant_end_char in generation_indices[i]:
+                            start_token = out.char_to_token(i, assistant_start_char)
+                            end_token = out.char_to_token(i, assistant_end_char - 1)
+                            if start_token is None:
+                                # start_token is out of bounds maybe due to truncation.
+                                break
+                            for token_id in range(start_token, end_token + 1 if end_token else len(input_ids[i])):
+                                current_mask[token_id] = 1
+                        assistant_masks.append(current_mask)
 
-        Args:
-            auto_class (`str` or `type`, *optional*, defaults to `"AutoTokenizer"`):
-                The auto class to register this new tokenizer with.
-        """
-        if not isinstance(auto_class, str):
-            auto_class = auto_class.__name__
+                    if not is_batched and not return_tensors:
+                        assistant_masks = assistant_masks[0]
 
-        import transformers.models.auto as auto_module
+                    out["assistant_masks"] = assistant_masks
 
-        if not hasattr(auto_module, auto_class):
-            raise ValueError(f"{auto_class} is not a valid auto class.")
+                    if return_tensors:
+                        out.convert_to_tensors(tensor_type=return_tensors)
 
-        cls._auto_class = auto_class
+                return out
+            else:
+                return out["input_ids"]
+        else:
+            return rendered_chat
 
-    def prepare_seq2seq_batch(
+    def encode_message_with_chat_template(
         self,
-        src_texts: list[str],
-        tgt_texts: Optional[list[str]] = None,
-        max_length: Optional[int] = None,
-        max_target_length: Optional[int] = None,
-        padding: str = "longest",
-        return_tensors: Optional[str] = None,
-        truncation: bool = True,
+        message: dict[str, str],
+        conversation_history: Optional[list[dict[str, str]]] = None,
         **kwargs,
-    ) -> BatchEncoding:
+    ) -> list[int]:
         """
-        Prepare model inputs for translation. For best performance, translate one sentence at a time.
-
-        Arguments:
-            src_texts (`list[str]`):
-                List of documents to summarize or source language texts.
-            tgt_texts (`list`, *optional*):
-                List of summaries or target language texts.
-            max_length (`int`, *optional*):
-                Controls the maximum length for encoder inputs (documents to summarize or source language texts) If
-                left unset or set to `None`, this will use the predefined model maximum length if a maximum length is
-                required by one of the truncation/padding parameters. If the model has no specific maximum input length
-                (like XLNet) truncation/padding to a maximum length will be deactivated.
-            max_target_length (`int`, *optional*):
-                Controls the maximum length of decoder inputs (target language texts or summaries) If left unset or set
-                to `None`, this will use the max_length value.
-            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
-                Activates and controls padding. Accepts the following values:
-
-                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
-                  sequence if provided).
-                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-                  acceptable input length for the model if that argument is not provided.
-                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
-                  lengths).
-            return_tensors (`str` or [`~utils.TensorType`], *optional*):
-                If set, will return tensors instead of list of python integers. Acceptable values are:
-
-                - `'pt'`: Return PyTorch `torch.Tensor` objects.
-                - `'np'`: Return Numpy `np.ndarray` objects.
-            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `True`):
-                Activates and controls truncation. Accepts the following values:
-
-                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
-                  to the maximum acceptable input length for the model if that argument is not provided. This will
-                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
-                  sequences (or a batch of pairs) is provided.
-                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
-                  maximum acceptable input length for the model if that argument is not provided. This will only
-                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
-                  greater than the model maximum admissible input size).
+        Tokenize a single message. This method is a convenience wrapper around `apply_chat_template` that allows you
+        to tokenize messages one by one. This is useful for things like token-by-token streaming.
+        This method is not guaranteed to be perfect. For some models, it may be impossible to robustly tokenize
+        single messages. For example, if the chat template adds tokens after each message, but also has a prefix that
+        is added to the entire chat, it will be impossible to distinguish a chat-start-token from a message-start-token.
+        In these cases, this method will do its best to find the correct tokenization, but it may not be perfect.
+        **Note:** This method does not support `add_generation_prompt`. If you want to add a generation prompt,
+        you should do it separately after tokenizing the conversation.
+        Args:
+            message (`dict`):
+                A dictionary with "role" and "content" keys, representing the message to tokenize.
+            conversation_history (`list[dict]`, *optional*):
+                A list of dicts with "role" and "content" keys, representing the chat history so far. If you are
+                tokenizing messages one by one, you should pass the previous messages in the conversation here.
             **kwargs:
-                Additional keyword arguments passed along to `self.__call__`.
+                Additional kwargs to pass to the `apply_chat_template` method.
+        Returns:
+            `list[int]`: A list of token ids representing the tokenized message.
+        """
+        if "add_generation_prompt" in kwargs:
+            raise ValueError(
+                "`encode_message_with_chat_template` does not support `add_generation_prompt`. Please add the generation prompt "
+                "separately."
+            )
 
-        Return:
-            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+        if conversation_history is None or len(conversation_history) == 0:
+            return self.apply_chat_template([message], add_generation_prompt=False, tokenize=True, **kwargs)
+
+        conversation = conversation_history + [message]
+        tokens = self.apply_chat_template(conversation, add_generation_prompt=False, tokenize=True, **kwargs)
 
-            - **input_ids** -- List of token ids to be fed to the encoder.
-            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
-            - **labels** -- List of token ids for tgt_texts.
+        prefix_tokens = self.apply_chat_template(
+            conversation_history, add_generation_prompt=False, tokenize=True, **kwargs
+        )
+        # It's possible that the prefix tokens are not a prefix of the full list of tokens.
+        # For example, if the prefix is `User: Hi` and the full conversation is `User: HiAssistant: Hello`.
+        # In this case, we can't simply find the prefix, so we have to do something a bit more subtle.
+        # We look for the first place where the tokens differ, and that's our split point.
+        # This is not perfect, but it's the best we can do without a token-level API.
+        # To make this more robust, we could do a diff and find the longest common subsequence, but this is
+        # a good first approximation.
+        # This is particularly important for models like Llama3 that have changed their chat template to include
+        # EOS tokens after user messages.
+        min_len = min(len(prefix_tokens), len(tokens))
+        for i in range(min_len):
+            if prefix_tokens[i] != tokens[i]:
+                return tokens[i:]
+        return tokens[min_len:]
 
-            The full set of keys `[input_ids, attention_mask, labels]`, will only be returned if tgt_texts is passed.
-            Otherwise, input_ids, attention_mask will be the only keys.
+    def get_chat_template(self, chat_template: Optional[str] = None, tools: Optional[list[dict]] = None) -> str:
         """
-        # docstyle-ignore
-        formatted_warning = """
-`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
-`__call__` method to prepare your inputs and targets.
+        Retrieve the chat template string used for tokenizing chat messages. This template is used
+        internally by the `apply_chat_template` method and can also be used externally to retrieve the model's chat
+        template for better generation tracking.
 
-Here is a short example:
+        Args:
+            chat_template (`str`, *optional*):
+                A Jinja template or the name of a template to use for this conversion.
+                It is usually not necessary to pass anything to this argument,
+                as the model's template will be used by default.
+            tools (`list[Dict]`, *optional*):
+                A list of tools (callable functions) that will be accessible to the model. If the template does not
+                support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema,
+                giving the name, description and argument types for the tool. See our
+                [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use)
+                for more information.
 
-model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)
+        Returns:
+            `str`: The chat template string.
+        """
+        # First, handle the cases when the model has a dict of multiple templates
+        if isinstance(self.chat_template, dict):
+            template_dict = self.chat_template
+            if chat_template is not None and chat_template in template_dict:
+                # The user can pass the name of a template to the chat template argument instead of an entire template
+                chat_template = template_dict[chat_template]
+            elif chat_template is None:
+                if tools is not None and "tool_use" in template_dict:
+                    chat_template = template_dict["tool_use"]
+                elif "default" in template_dict:
+                    chat_template = template_dict["default"]
+                else:
+                    raise ValueError(
+                        "This model has multiple chat templates with no default specified! Please either pass a chat "
+                        "template or the name of the template you wish to use to the `chat_template` argument. Available "
+                        f"template names are {sorted(template_dict.keys())}."
+                    )
 
-If you either need to use different keyword arguments for the source and target texts, you should do two calls like
-this:
+        elif chat_template is None:
+            # These are the cases when the model has a single template
+            # priority: `chat_template` argument > `tokenizer.chat_template`
+            if self.chat_template is not None:
+                chat_template = self.chat_template
+            else:
+                raise ValueError(
+                    "Cannot use chat template functions because tokenizer.chat_template is not set and no template "
+                    "argument was passed! For information about writing templates and setting the "
+                    "tokenizer.chat_template attribute, please see the documentation at "
+                    "https://huggingface.co/docs/transformers/main/en/chat_templating"
+                )
 
-model_inputs = tokenizer(src_texts, ...)
-labels = tokenizer(text_target=tgt_texts, ...)
-model_inputs["labels"] = labels["input_ids"]
+        return chat_template
 
-See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
-For a more complete example, see the implementation of `prepare_seq2seq_batch`.
-"""
-        warnings.warn(formatted_warning, FutureWarning)
-        # mBART-specific kwargs that should be ignored by other models.
-        kwargs.pop("src_lang", None)
-        kwargs.pop("tgt_lang", None)
-        if max_length is None:
-            max_length = self.model_max_length
-        model_inputs = self(
-            src_texts,
-            add_special_tokens=True,
-            return_tensors=return_tensors,
-            max_length=max_length,
-            padding=padding,
-            truncation=truncation,
-            **kwargs,
+    def save_chat_templates(
+        self,
+        save_directory: Union[str, os.PathLike],
+        tokenizer_config: dict,
+        filename_prefix: Optional[str],
+        save_jinja_files: bool,
+    ):
+        """
+        Writes chat templates out to the save directory if we're using the new format, and removes them from
+        the tokenizer config if present. If we're using the legacy format, it doesn't write any files, and instead
+        writes the templates to the tokenizer config in the correct format.
+        """
+        chat_template_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + CHAT_TEMPLATE_FILE
         )
-        if tgt_texts is None:
-            return model_inputs
-        # Process tgt_texts
-        if max_target_length is None:
-            max_target_length = max_length
-        with self.as_target_tokenizer():
-            labels = self(
-                tgt_texts,
-                add_special_tokens=True,
-                return_tensors=return_tensors,
-                padding=padding,
-                max_length=max_target_length,
-                truncation=truncation,
-                **kwargs,
-            )
-        model_inputs["labels"] = labels["input_ids"]
-        return model_inputs
+        chat_template_dir = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + CHAT_TEMPLATE_DIR
+        )
+
+        saved_raw_chat_template_files = []
+        if save_jinja_files and isinstance(self.chat_template, str):
+            # New format for single templates is to save them as chat_template.jinja
+            with open(chat_template_file, "w", encoding="utf-8") as f:
+                f.write(self.chat_template)
+            logger.info(f"chat template saved in {chat_template_file}")
+            saved_raw_chat_template_files.append(chat_template_file)
+            if "chat_template" in tokenizer_config:
+                tokenizer_config.pop("chat_template")  # To ensure it doesn't somehow end up in the config too
+        elif save_jinja_files and isinstance(self.chat_template, dict):
+            # New format for multiple templates is to save the default as chat_template.jinja
+            # and the other templates in the chat_templates/ directory
+            for template_name, template in self.chat_template.items():
+                if template_name == "default":
+                    with open(chat_template_file, "w", encoding="utf-8") as f:
+                        f.write(self.chat_template["default"])
+                    logger.info(f"chat template saved in {chat_template_file}")
+                    saved_raw_chat_template_files.append(chat_template_file)
+                else:
+                    Path(chat_template_dir).mkdir(exist_ok=True)
+                    template_filepath = os.path.join(chat_template_dir, f"{template_name}.jinja")
+                    with open(template_filepath, "w", encoding="utf-8") as f:
+                        f.write(template)
+                    logger.info(f"chat template saved in {template_filepath}")
+                    saved_raw_chat_template_files.append(template_filepath)
+            if "chat_template" in tokenizer_config:
+                tokenizer_config.pop("chat_template")  # To ensure it doesn't somehow end up in the config too
+        elif isinstance(self.chat_template, dict):
+            # Legacy format for multiple templates:
+            # chat template dicts are saved to the config as lists of dicts with fixed key names.
+            tokenizer_config["chat_template"] = [{"name": k, "template": v} for k, v in self.chat_template.items()]
+        elif self.chat_template is not None:
+            # Legacy format for single templates: Just make them a key in tokenizer_config.json
+            tokenizer_config["chat_template"] = self.chat_template
+        return tokenizer_config, saved_raw_chat_template_files
 
 
 def get_fast_tokenizer_file(tokenization_files: list[str]) -> str:
@@ -4358,9 +3601,197 @@ def get_fast_tokenizer_file(tokenization_files: list[str]) -> str:
     return tokenizer_file
 
 
+# Shared helper to locate a SentencePiece model file for a repo/path
+def find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs):
+    """
+    Find any .model file (SentencePiece model) in the model directory or Hub repo.
+
+    Tries known filenames first ("tokenizer.model", "spm.model"), then scans local dir,
+    and as a last resort lists files on the Hub to find any .model.
+
+    Returns the filename (str) relative to the repo root or directory if found, else None.
+    """
+    from .utils.hub import has_file
+
+    # Try common names first
+    for candidate in ("tokenizer.model", "spm.model"):
+        try:
+            if has_file(
+                pretrained_model_name_or_path,
+                candidate,
+                revision=kwargs.get("revision"),
+                token=kwargs.get("token"),
+                cache_dir=kwargs.get("cache_dir"),
+                local_files_only=kwargs.get("local_files_only", False),
+            ):
+                return candidate
+        except Exception:
+            pass
+
+    subfolder = kwargs.get("subfolder", "")
+    local_files_only = kwargs.get("local_files_only", False)
+
+    # Local directory scan
+    if os.path.isdir(pretrained_model_name_or_path):
+        dir_path = (
+            os.path.join(pretrained_model_name_or_path, subfolder) if subfolder else pretrained_model_name_or_path
+        )
+        if os.path.isdir(dir_path):
+            for filename in os.listdir(dir_path):
+                if filename.endswith(".model"):
+                    return filename if not subfolder else os.path.join(subfolder, filename)
+
+    # Hub listing if allowed
+    if not local_files_only:
+        try:
+            from huggingface_hub import list_repo_tree
+
+            entries = list_repo_tree(
+                repo_id=pretrained_model_name_or_path,
+                revision=kwargs.get("revision"),
+                path_in_repo=subfolder if subfolder else None,
+                recursive=False,
+                token=kwargs.get("token"),
+            )
+            for entry in entries:
+                if entry.path.endswith(".model"):
+                    return entry.path if not subfolder else entry.path.removeprefix(f"{subfolder}/")
+        except Exception:
+            pass
+
+    return None
+
+
+def load_vocab_and_merges(pretrained_model_name_or_path, **kwargs):
+    """
+    Resolve and load tokenizer vocabulary files from a repo/path.
+
+    Priority order:
+    1. Load ``vocab.json`` (WordLevel/WordPiece/BPE fast tokenizers)
+    2. Load ``vocab.txt`` when only a WordPiece vocab is available
+    3. Optionally load ``merges.txt`` (BPE tokenizers)
+
+    Returns:
+        tuple (vocab: dict|None, merges: list[tuple[str,str]]|None, files_loaded: list[str])
+    """
+    files_loaded = []
+    vocab = None
+    merges = None
+    try:
+        resolved_vocab_file = cached_file(
+            pretrained_model_name_or_path,
+            "vocab.json",
+            cache_dir=kwargs.get("cache_dir"),
+            force_download=kwargs.get("force_download", False),
+            proxies=kwargs.get("proxies"),
+            token=kwargs.get("token"),
+            revision=kwargs.get("revision"),
+            local_files_only=kwargs.get("local_files_only", False),
+            subfolder=kwargs.get("subfolder", ""),
+        )
+    except Exception:
+        resolved_vocab_file = None
+
+    if resolved_vocab_file is not None:
+        try:
+            with open(resolved_vocab_file, "r", encoding="utf-8") as vf:
+                vocab = json.load(vf)
+            files_loaded.append("vocab.json")
+        except Exception:
+            vocab = None
+
+    # Fallback to vocab.txt (WordPiece-style vocabularies)
+    if vocab is None:
+        try:
+            resolved_vocab_txt = cached_file(
+                pretrained_model_name_or_path,
+                "vocab.txt",
+                cache_dir=kwargs.get("cache_dir"),
+                force_download=kwargs.get("force_download", False),
+                proxies=kwargs.get("proxies"),
+                token=kwargs.get("token"),
+                revision=kwargs.get("revision"),
+                local_files_only=kwargs.get("local_files_only", False),
+                subfolder=kwargs.get("subfolder", ""),
+            )
+        except Exception:
+            resolved_vocab_txt = None
+
+        if resolved_vocab_txt is not None:
+            try:
+                vocab = OrderedDict()
+                with open(resolved_vocab_txt, "r", encoding="utf-8") as vf:
+                    for index, token in enumerate(vf):
+                        token = token.rstrip("\n")
+                        vocab[token] = index
+                files_loaded.append("vocab.txt")
+            except Exception:
+                vocab = None
+
+    try:
+        resolved_merges_file = cached_file(
+            pretrained_model_name_or_path,
+            "merges.txt",
+            cache_dir=kwargs.get("cache_dir"),
+            force_download=kwargs.get("force_download", False),
+            proxies=kwargs.get("proxies"),
+            token=kwargs.get("token"),
+            revision=kwargs.get("revision"),
+            local_files_only=kwargs.get("local_files_only", False),
+            subfolder=kwargs.get("subfolder", ""),
+        )
+    except Exception:
+        resolved_merges_file = None
+
+    if resolved_merges_file is not None:
+        try:
+            merges = []
+            with open(resolved_merges_file, "r", encoding="utf-8") as mf:
+                for line in mf:
+                    line = line.strip()
+                    if line and not line.startswith("#"):
+                        parts = line.split()
+                        if len(parts) == 2:
+                            merges.append((parts[0], parts[1]))
+            files_loaded.append("merges.txt")
+        except Exception:
+            merges = None
+
+    return vocab, merges, files_loaded
+
+
 # To update the docstring, we need to copy the method, otherwise we change the original docstring.
 PreTrainedTokenizerBase.push_to_hub = copy_func(PreTrainedTokenizerBase.push_to_hub)
 if PreTrainedTokenizerBase.push_to_hub.__doc__ is not None:
     PreTrainedTokenizerBase.push_to_hub.__doc__ = PreTrainedTokenizerBase.push_to_hub.__doc__.format(
         object="tokenizer", object_class="AutoTokenizer", object_files="tokenizer files"
     )
+
+
+def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str:
+    if add_prefix_space:
+        prepend_scheme = "always"
+        if not getattr(original_tokenizer, "legacy", True):
+            prepend_scheme = "first"
+    else:
+        prepend_scheme = "never"
+    return prepend_scheme
+
+
+def generate_merges(vocab, vocab_scores: Optional[dict[str, float]] = None):
+    reverse = vocab_scores is not None
+    vocab_scores = dict(vocab_scores) if reverse else vocab
+
+    merges = []
+    for merge, piece_score in vocab_scores.items():
+        local = []
+        for index in range(1, len(merge)):
+            piece_l, piece_r = merge[:index], merge[index:]
+            if piece_l in vocab and piece_r in vocab:
+                local.append((piece_l, piece_r, piece_score))
+        local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]]))
+        merges.extend(local)
+
+    merges = sorted(merges, key=lambda val: (val[2], len(val[0]), len(val[1])), reverse=reverse)
+    merges = [(val[0], val[1]) for val in merges]
+    return merges
diff --git a/src/transformers/tokenization_utils_sentencepiece.py b/src/transformers/tokenization_utils_sentencepiece.py
new file mode 100644
index 000000000000..4f56ec77a1d6
--- /dev/null
+++ b/src/transformers/tokenization_utils_sentencepiece.py
@@ -0,0 +1,316 @@
+# Copyright 2020 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+SentencePiece-based tokenization class for loading from sentencepiece.model files.
+"""
+
+import os
+from shutil import copyfile
+from typing import Optional, Union
+
+
+try:
+    import sentencepiece as spm
+except ImportError:
+    spm = None
+
+from .convert_slow_tokenizer import import_protobuf
+from .tokenization_python import PreTrainedTokenizer
+from .tokenization_utils_base import (
+    INIT_TOKENIZER_DOCSTRING,
+    AddedToken,
+    generate_merges,
+)
+from .utils import add_end_docstrings, logging, requires_backends
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
+
+SPIECE_UNDERLINE = "▁"
+
+
+@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
+class SentencePieceBackend(PreTrainedTokenizer):
+    """
+    Base class for SentencePiece-based tokenizers that load from sentencepiece.model files.
+
+    Inherits from [`~tokenization_utils.PreTrainedTokenizer`].
+
+    Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading
+    pretrained tokenizers as well as adding tokens to the vocabulary.
+
+    This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the
+    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+
+    def __init__(self, **kwargs):
+        # Ensure optional dependency is available before loading
+        requires_backends(self, "sentencepiece")
+
+        # Extract sentencepiece-specific parameters
+        self.vocab_file = kwargs.get("vocab_file")
+        self.legacy = kwargs.get("legacy", True)
+        self.sp_model_kwargs = kwargs.pop("sp_model_kwargs", {})
+
+        # Set backend to "sentencepiece" if not already set
+        if "backend" not in kwargs:
+            kwargs["backend"] = "sentencepiece"
+
+        # Load the SentencePiece model before calling parent __init__
+        # This is needed because parent __init__ may call methods that depend on sp_model
+        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        tokenizer.Load(self.vocab_file)
+
+        if not self.legacy:
+            model_pb2 = import_protobuf()
+            proto = model_pb2.ModelProto.FromString(tokenizer.serialized_model_proto())
+            if proto.normalizer_spec.add_dummy_prefix:
+                proto.normalizer_spec.add_dummy_prefix = False
+                tokenizer.LoadFromSerializedProto(proto.SerializeToString())
+
+        self.sp_model = tokenizer
+
+        # Initialize total_vocab_size before parent __init__ (which may call _add_tokens -> len(self))
+        self.total_vocab_size = self.sp_model.get_piece_size()
+
+        # Add sp_model_kwargs back to kwargs so it gets stored in init_kwargs
+        kwargs["sp_model_kwargs"] = self.sp_model_kwargs
+
+        # Call parent class __init__ (PreTrainedTokenizer)
+        # This handles tokens_trie, _added_tokens_decoder, _added_tokens_encoder,
+        # token_type_ids_pattern, special_tokens_pattern, and adds special tokens
+        super().__init__(**kwargs)
+        self._update_trie()
+
+    @property
+    def vocab_size(self) -> int:
+        """Returns vocab size"""
+        return self.sp_model.get_piece_size()
+
+    def get_vocab(self):
+        """Returns vocab as a dict"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _add_tokens(self, new_tokens: Union[list[str], list[AddedToken]], special_tokens: bool = False) -> int:
+        """
+        Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
+        it with indices starting from length of the current vocabulary. Special tokens are sometimes already in the
+        vocab which is why they have to be handled specifically.
+
+        Args:
+            new_tokens (`list[str]`or `list[tokenizers.AddedToken]`):
+                Token(s) to add in vocabulary. A token is counted as added if it's not already in the vocabulary
+                (tested by checking if the tokenizer assign the index of the `unk_token` to them). If a token is part
+                of the vocabulary then we simply mark this token as an `AddedToken` which allows to control the
+                stripping and normalization of this token. This is NOT possible in `tokenizers`.
+            special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the tokens should be added as special tokens.
+
+        Returns:
+            `int`: The number of tokens actually added to the vocabulary.
+
+        Examples:
+
+        ```python
+        # Let's see how to increase the vocabulary of Bert model and tokenizer
+        tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
+        model = BertModel.from_pretrained("google-bert/bert-base-uncased")
+
+        num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
+        print("We have added", num_added_toks, "tokens")
+        # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
+        model.resize_token_embeddings(len(tokenizer))
+        ```"""
+        if not new_tokens:
+            return 0
+
+        next_index = len(self)  # total size (base + added)
+        num_added = 0
+        for token in new_tokens:
+            if not isinstance(token, (str, AddedToken)):
+                raise TypeError(f"Token {token} is not a string but a {type(token)}.")
+            if str(token) == "":
+                continue
+            if isinstance(token, str):
+                if token in self._added_tokens_encoder:
+                    continue
+                is_special = token in self.all_special_tokens or special_tokens
+                token = AddedToken(token, rstrip=False, lstrip=False, normalized=not is_special, special=is_special)
+            elif special_tokens:
+                # doing token.special=True changes the normalization! will fix in rust
+                # this is important and the only reason why the AddedTokens in each class are normalized by default
+                token.__setstate__({"special": True, "normalized": token.normalized})
+
+            if token in self._added_tokens_decoder.values():
+                continue
+            if not token.special and token.normalized and getattr(self, "do_lower_case", False):
+                token.content = token.content.lower()
+
+            # Check if token already exists in the SentencePiece base vocab
+            tok_id = self.sp_model.piece_to_id(token.content)
+            in_base_vocab = (
+                tok_id < self.sp_model.get_piece_size() and self.sp_model.IdToPiece(tok_id) == token.content
+            )
+
+            if in_base_vocab:
+                token_index = tok_id
+            else:
+                token_index = next_index
+                next_index += 1
+                num_added += 1
+
+            if token.special and str(token) not in self.all_special_tokens:
+                self._extra_special_tokens.append(token)
+            # the setter automatically updates the reverse map
+            self._added_tokens_decoder[token_index] = token
+            self._added_tokens_encoder[token.content] = token_index
+            if self.verbose:
+                logger.info(f"Adding {token} to the vocabulary")
+
+        self._update_trie()
+        self._update_total_vocab_size()
+        return num_added
+
+    def _update_trie(self, unique_no_split_tokens: Optional[list[str]] = None):
+        # Add all added tokens
+        for token in self._added_tokens_decoder.values():
+            if token.content not in self.tokens_trie._tokens:
+                self.tokens_trie.add(token.content)
+        # Also add all special tokens (even if they're in base vocab) so they get split during tokenization
+        for token in self.all_special_tokens:
+            if token not in self.tokens_trie._tokens:
+                self.tokens_trie.add(token)
+        # Add any additional no-split tokens
+        for token in unique_no_split_tokens or []:
+            if token not in self.tokens_trie._tokens:
+                self.tokens_trie.add(token)
+
+    def _tokenize(self, text, **kwargs):
+        """
+        Returns a tokenized string.
+
+        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
+        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
+        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
+        `unk_token`. Here is an example with `unk_token = ""` and `unk_token_length = 4`.
+        `self.tokenizer.sp_model.encode(" Hey", out_type = str)[4:]`.
+        """
+        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
+            return self.sp_model.encode(text, out_type=str)
+
+        # 1. Encode string + prefix ex: " Hey"
+        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
+        # 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
+        unk_token_length = len(self.sp_model.encode(str(self.unk_token)))
+        return tokens[unk_token_length:] if len(tokens) >= unk_token_length else tokens
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) to an id using the vocab."""
+        return self.sp_model.piece_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        token = self.sp_model.IdToPiece(index)
+        return token
+
+    def convert_tokens_to_string(self, tokens: list[str]) -> str:
+        """Converts a sequence of tokens (string) in a single string."""
+        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
+        return out_string
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        """
+        Save the sentencepiece vocabulary (copy original file) to a directory.
+
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+            filename_prefix (`str`, *optional*):
+                An optional prefix to add to the named of the saved files.
+
+        Returns:
+            `tuple(str)`: Paths to the files saved.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (out_vocab_file,)
+
+    def _decode(
+        self,
+        token_ids: Union[int, list[int]],
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        spaces_between_special_tokens: bool = False,
+        **kwargs,
+    ) -> str:
+        """
+        Decode token ids to string.
+
+        Uses the generic decode path from PreTrainedTokenizer which works for all vocabularies,
+        including custom vocabularies that override _convert_id_to_token.
+        """
+        # Use parent class's generic decode method - it's simpler and works for all cases
+        return super()._decode(
+            token_ids=token_ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
+
+class SentencePieceExtractor:
+    """
+    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
+    """
+
+    def __init__(self, model: str):
+        requires_backends(self, "sentencepiece")
+        from sentencepiece import SentencePieceProcessor
+
+        self.sp = SentencePieceProcessor()
+        self.sp.Load(model)
+
+    def extract(self, vocab_scores=None) -> tuple[dict[str, int], list[tuple[str, float]], list[tuple]]:
+        """
+        By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
+        order the merges with respect to the piece scores instead.
+        """
+        sp = self.sp
+        vocab_ids = {sp.id_to_piece(index): index for index in range(sp.GetPieceSize())}
+
+        vocab_scores_dict = {sp.id_to_piece(i): sp.get_score(i) for i in range(sp.GetPieceSize())}
+
+        merges = generate_merges(vocab_ids, vocab_scores_dict)
+
+        vocab_scores_list = [(sp.id_to_piece(i), sp.get_score(i)) for i in range(sp.GetPieceSize())]
+
+        return vocab_ids, vocab_scores_list, merges
diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_tokenizers.py
similarity index 59%
rename from src/transformers/tokenization_utils_fast.py
rename to src/transformers/tokenization_utils_tokenizers.py
index 7907a2aa0cb9..68b1d59bd259 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_tokenizers.py
@@ -20,29 +20,26 @@
 import json
 import os
 from collections import defaultdict
-from collections.abc import Iterable
-from typing import Any
+from shutil import copyfile
+from typing import Any, Optional, Union
 
 import tokenizers.pre_tokenizers as pre_tokenizers_fast
+from tokenizers import AddedToken, processors
 from tokenizers import Encoding as EncodingFast
 from tokenizers import Tokenizer as TokenizerFast
+from tokenizers import normalizers as tokenizers_normalizers
 from tokenizers.decoders import Decoder as DecoderFast
 from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer
 
 from .convert_slow_tokenizer import convert_slow_tokenizer
 from .integrations.ggml import convert_gguf_tokenizer
 from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
-from .tokenization_utils import PreTrainedTokenizer
 from .tokenization_utils_base import (
     INIT_TOKENIZER_DOCSTRING,
-    AddedToken,
     BatchEncoding,
     PreTokenizedInput,
-    PreTokenizedInputPair,
     PreTrainedTokenizerBase,
-    SpecialTokensMixin,
     TextInput,
-    TextInputPair,
     TruncationStrategy,
 )
 from .utils import PaddingStrategy, add_end_docstrings, logging
@@ -79,7 +76,7 @@
 
 
 @add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
-class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
+class TokenizersBackend(PreTrainedTokenizerBase):
     """
     Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).
 
@@ -93,7 +90,6 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
-    slow_tokenizer_class: type[PreTrainedTokenizer] | None = None
 
     def __init__(self, *args, **kwargs):
         tokenizer_object = kwargs.pop("tokenizer_object", None)
@@ -101,14 +97,10 @@ def __init__(self, *args, **kwargs):
         gguf_file = kwargs.pop("gguf_file", None)
         fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
         from_slow = kwargs.pop("from_slow", False)
-        added_tokens_decoder = kwargs.pop("added_tokens_decoder", {})
-        self.add_prefix_space = kwargs.get("add_prefix_space", False)
-
-        if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
-            raise ValueError(
-                "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
-                "have sentencepiece installed."
-            )
+        # Note: added_tokens_decoder is NOT popped - it's passed to super().__init__() for processing
+        added_tokens_decoder = kwargs.get("added_tokens_decoder", {})
+        # Store add_prefix_space before super().__init__() to ensure it's not overridden
+        add_prefix_space = kwargs.get("add_prefix_space", False)
 
         if tokenizer_object is not None:
             fast_tokenizer = copy.deepcopy(tokenizer_object)
@@ -135,7 +127,8 @@ def __init__(self, *args, **kwargs):
         elif not slow_tokenizer:
             # We tried loading a slow_tokenizer with spm and failed, try to load with tiktoken
             self.vocab_file = kwargs.get("vocab_file")
-            self.additional_special_tokens = kwargs.get("additional_special_tokens", [])
+            # V5: Set _extra_special_tokens directly for converter
+            self._extra_special_tokens = kwargs.get("extra_special_tokens", [])
             fast_tokenizer = convert_slow_tokenizer(self, from_tiktoken=True)
             slow_tokenizer = None
         else:
@@ -152,8 +145,6 @@ def __init__(self, *args, **kwargs):
         if slow_tokenizer is not None:
             kwargs.update(slow_tokenizer.init_kwargs)
 
-        self._decode_use_source_tokenizer = False
-
         _truncation = self._tokenizer.truncation
 
         if _truncation is not None:
@@ -174,8 +165,14 @@ def __init__(self, *args, **kwargs):
             kwargs.setdefault("max_length", _padding["length"])
             kwargs.setdefault("pad_to_multiple_of", _padding["pad_to_multiple_of"])
 
+        # Set backend to "tokenizers" if not already set
+        if "backend" not in kwargs:
+            kwargs["backend"] = "tokenizers"
+
         # We call this after having initialized the backend tokenizer because we update it.
         super().__init__(**kwargs)
+        # Ensure add_prefix_space is set correctly after parent init
+        self.add_prefix_space = add_prefix_space
         self._tokenizer.encode_special_tokens = self.split_special_tokens
 
         added_tokens_decoder_hash = {hash(repr(token)) for token in self.added_tokens_decoder}
@@ -185,39 +182,50 @@ def __init__(self, *args, **kwargs):
             if hash(repr(token)) not in added_tokens_decoder_hash
         ]
         encoder = list(self.added_tokens_encoder.keys()) + [str(token) for token in tokens_to_add]
-        # if some of the special tokens are strings, we check if we don't already have a token
-        tokens_to_add += [
-            token for token in self.all_special_tokens_extended if token not in encoder and token not in tokens_to_add
-        ]
+        # if some of the special tokens are not already in the tokenizer, add them
+        # V5: Check both named special tokens and extra special tokens
+        # Iterate over _special_tokens_map to preserve AddedToken properties (lstrip, rstrip, etc.)
+        for special_token_value in self._special_tokens_map.values():
+            if special_token_value is None:
+                continue
+            if str(special_token_value) not in encoder and special_token_value not in tokens_to_add:
+                tokens_to_add.append(special_token_value)
+
+        # Also check extra special tokens
+        for token in self._extra_special_tokens:
+            if str(token) not in encoder and token not in tokens_to_add:
+                tokens_to_add.append(token)
 
         if len(tokens_to_add) > 0:
             tokens = []
-            special_tokens = self.all_special_tokens
+            all_named_tokens = [str(t) for t in self._special_tokens_map.values() if t]
             for token in tokens_to_add:
-                is_special = (
-                    (token.special or str(token) in special_tokens)
-                    if isinstance(token, AddedToken)
-                    else str(token) in special_tokens
-                )
                 if isinstance(token, str):
-                    token = AddedToken(token, special=is_special)
-                else:
-                    token.special = is_special
+                    # Convert string to AddedToken, assuming it's special
+                    token = AddedToken(token, special=True)
+                elif isinstance(token, AddedToken):
+                    # Ensure the special flag is set correctly for special tokens
+                    if not token.special and str(token) in all_named_tokens:
+                        token.special = True
                 tokens.append(token)
             if tokens:
-                self.add_tokens(tokens)
+                # These tokens are from the special tokens map
+                self.add_tokens(tokens, special_tokens=True)
 
         try:
-            pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
-            if pre_tok_state.get("add_prefix_space", self.add_prefix_space) != self.add_prefix_space:
-                pre_tok_class = getattr(pre_tokenizers_fast, pre_tok_state.pop("type"))
-                pre_tok_state["add_prefix_space"] = self.add_prefix_space
-                self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
-        except Exception:
-            # We'll get an error if there is no pre_tokenizer, or if it's a custom pre_tokenizer that can
-            # not be serialized. In those cases, we just ignore the error as there's no pre_tokenizer
-            # for which we need to update the `add_prefix_space` attribute.
-            pass
+            vocab_size = self._tokenizer.get_vocab_size()
+        except NotImplementedError:
+            vocab_size = 0
+
+        # Optionally patches mistral tokenizers with wrong regex
+        if vocab_size > 100000 and getattr(self._tokenizer, "pre_tokenizer", None) is not None:
+            self._tokenizer = self._patch_mistral_regex(
+                self._tokenizer,
+                self.init_kwargs.get("name_or_path", None),
+                init_kwargs=self.init_kwargs,
+                fix_mistral_regex=kwargs.get("fix_mistral_regex"),
+                **kwargs,
+            )
 
     @property
     def is_fast(self) -> bool:
@@ -237,6 +245,172 @@ def can_save_slow_tokenizer(self) -> bool:
         else:
             return True
 
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
+
+    def update_post_processor(self):
+        """
+        Updates the underlying post processor with the current `bos_token` and `eos_token`.
+        """
+        bos = self.bos_token
+        bos_token_id = self.bos_token_id
+        if bos is None and self.add_bos_token:
+            raise ValueError("add_bos_token = True but bos_token = None")
+
+        eos = self.eos_token
+        eos_token_id = self.eos_token_id
+        # If eos_token is None and add_eos_token is True, silently disable add_eos_token
+        # This allows tokenizers to set add_eos_token even if eos_token is not configured
+        if eos is None and self.add_eos_token:
+            self._add_eos_token = False
+            return
+
+        single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
+        pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"
+
+        special_tokens = []
+        if self.add_bos_token:
+            special_tokens.append((bos, bos_token_id))
+        if self.add_eos_token:
+            special_tokens.append((eos, eos_token_id))
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=single, pair=pair, special_tokens=special_tokens
+        )
+
+    @property
+    def add_eos_token(self):
+        return getattr(self, "_add_eos_token", False)
+
+    @property
+    def add_bos_token(self):
+        return getattr(self, "_add_bos_token", False)
+
+    @add_eos_token.setter
+    def add_eos_token(self, value):
+        object.__setattr__(self, "_add_eos_token", value)
+        self.update_post_processor()
+
+    @add_bos_token.setter
+    def add_bos_token(self, value):
+        object.__setattr__(self, "_add_bos_token", value)
+        self.update_post_processor()
+
+    def _post_init(self):
+        """
+        Post-initialization hook that runs after the tokenizer is fully set up.
+        This is called by from_pretrained() after loading the tokenizer, which allows
+        us to add any special tokens that may have been passed as AddedToken objects.
+
+        Child classes should call super()._post_init() if they override this method.
+        """
+        tokens_to_add = []
+        # V5: Check named special tokens
+        for token_value in self._special_tokens_map.values():
+            if token_value is None:
+                continue
+            if isinstance(token_value, AddedToken):
+                if self._tokenizer.token_to_id(str(token_value)) is None:
+                    tokens_to_add.append(token_value)
+            elif isinstance(token_value, str):
+                if self._tokenizer.token_to_id(token_value) is None:
+                    tokens_to_add.append(AddedToken(token_value, special=True, normalized=False))
+
+        # V5: Check extra special tokens
+        for token in self._extra_special_tokens:
+            if isinstance(token, AddedToken):
+                if self._tokenizer.token_to_id(str(token)) is None:
+                    tokens_to_add.append(token)
+            elif isinstance(token, str):
+                if self._tokenizer.token_to_id(token) is None:
+                    tokens_to_add.append(AddedToken(token, special=True, normalized=False))
+
+        if tokens_to_add:
+            # Ensure special tokens are added as such to the backend
+            self.add_tokens(tokens_to_add, special_tokens=True)
+
+        if hasattr(self, "_add_bos_token") or hasattr(self, "_add_eos_token"):
+            self.update_post_processor()
+
+        # Update add_prefix_space in the pre_tokenizer if needed
+        if hasattr(self, "add_prefix_space"):
+            try:
+                tokenizer_json = json.loads(self.backend_tokenizer.to_str())
+                pre_tok = tokenizer_json.get("pre_tokenizer", {})
+
+                # Recursively update add_prefix_space in pretokenizers
+                def update_add_prefix_space(pretok_dict, value):
+                    updated = False
+                    if pretok_dict.get("type") == "Sequence":
+                        for nested in pretok_dict.get("pretokenizers", []):
+                            updated |= update_add_prefix_space(nested, value)
+                    elif "add_prefix_space" in pretok_dict and pretok_dict["add_prefix_space"] != value:
+                        pretok_dict["add_prefix_space"] = value
+                        updated = True
+                    return updated
+
+                if update_add_prefix_space(pre_tok, self.add_prefix_space):
+                    self._tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))
+            except Exception:
+                pass
+
+        # Ensure normalizer flags (lowercase/accents/chinese chars) reflect tokenizer attributes
+        try:
+            normalizer = self.backend_tokenizer.normalizer
+            if normalizer is not None:
+                norm_state = json.loads(normalizer.__getstate__())
+                norm_type = norm_state.get("type")
+
+                desired_lowercase = getattr(self, "do_lower_case", None)
+                desired_strip_accents = getattr(self, "strip_accents", None)
+                # Some tokenizers expose keep_accents instead of strip_accents
+                if desired_strip_accents is None and hasattr(self, "keep_accents") and "strip_accents" in norm_state:
+                    keep_accents_value = getattr(self, "keep_accents")
+                    if keep_accents_value is not None:
+                        desired_strip_accents = not keep_accents_value
+                desired_handle_chinese = getattr(self, "tokenize_chinese_chars", None)
+
+                updated = False
+                if (
+                    desired_lowercase is not None
+                    and "lowercase" in norm_state
+                    and norm_state["lowercase"] != desired_lowercase
+                ):
+                    norm_state["lowercase"] = desired_lowercase
+                    updated = True
+                if (
+                    desired_strip_accents is not None
+                    and "strip_accents" in norm_state
+                    and norm_state["strip_accents"] != desired_strip_accents
+                ):
+                    norm_state["strip_accents"] = desired_strip_accents
+                    updated = True
+                if (
+                    desired_handle_chinese is not None
+                    and "handle_chinese_chars" in norm_state
+                    and norm_state["handle_chinese_chars"] != desired_handle_chinese
+                ):
+                    norm_state["handle_chinese_chars"] = desired_handle_chinese
+                    updated = True
+
+                if updated and norm_type is not None:
+                    norm_class = getattr(tokenizers_normalizers, norm_type, None)
+                    if norm_class is not None:
+                        norm_state.pop("type", None)
+                        self.backend_tokenizer.normalizer = norm_class(**norm_state)
+        except Exception:
+            # Best-effort: do not block initialization on normalizer reconciliation
+            pass
+
     @property
     def vocab_size(self) -> int:
         """
@@ -351,22 +525,6 @@ def _convert_encoding(
 
         return encoding_dict, encodings
 
-    def convert_tokens_to_ids(self, tokens: str | Iterable[str]) -> int | list[int]:
-        """
-        Converts a token string (or a sequence of tokens) in a single integer id (or a Iterable of ids), using the
-        vocabulary.
-
-        Args:
-            tokens (`str` or `Iterable[str]`): One or several token(s) to convert to token id(s).
-
-        Returns:
-            `int` or `list[int]`: The token id or list of token ids.
-        """
-        if isinstance(tokens, str):
-            return self._convert_token_to_id_with_added_voc(tokens)
-
-        return [self._convert_token_to_id_with_added_voc(token) for token in tokens]
-
     def _convert_token_to_id_with_added_voc(self, token: str) -> int:
         index = self._tokenizer.token_to_id(token)
         if index is None:
@@ -430,7 +588,7 @@ def convert_ids_to_tokens(self, ids: int | list[int], skip_special_tokens: bool
         return tokens
 
     def tokenize(self, text: str, pair: str | None = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
-        return self.encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs).tokens()
+        return self._encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs).tokens()
 
     def set_truncation_and_padding(
         self,
@@ -507,12 +665,10 @@ def set_truncation_and_padding(
             if _padding != target:
                 self._tokenizer.enable_padding(**target)
 
-    def _batch_encode_plus(
+    def _encode_plus(
         self,
-        batch_text_or_text_pairs: list[TextInput]
-        | list[TextInputPair]
-        | list[PreTokenizedInput]
-        | list[PreTokenizedInputPair],
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
+        text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
         add_special_tokens: bool = True,
         padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
         truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
@@ -521,7 +677,7 @@ def _batch_encode_plus(
         is_split_into_words: bool = False,
         pad_to_multiple_of: int | None = None,
         padding_side: str | None = None,
-        return_tensors: str | None = None,
+        return_tensors: bool | None = None,
         return_token_type_ids: bool | None = None,
         return_attention_mask: bool | None = None,
         return_overflowing_tokens: bool = False,
@@ -529,14 +685,71 @@ def _batch_encode_plus(
         return_offsets_mapping: bool = False,
         return_length: bool = False,
         verbose: bool = True,
-        split_special_tokens: bool = False,
+        split_special_tokens: Optional[bool] = None,
+        **kwargs,
     ) -> BatchEncoding:
+        # Input validation (from _call_one)
+        def _is_valid_text_input(t):
+            if isinstance(t, str):
+                return True
+            elif isinstance(t, (list, tuple)):
+                if len(t) == 0:
+                    return True
+                elif isinstance(t[0], str):
+                    return True
+                elif isinstance(t[0], (list, tuple)):
+                    if len(t[0]) == 0 or isinstance(t[0][0], str):
+                        return True
+                    elif isinstance(t[0][0], (list, tuple)):
+                        return len(t[0][0]) == 0 or isinstance(t[0][0][0], str)
+                    else:
+                        return False
+                else:
+                    return False
+            else:
+                return False
+
+        if not _is_valid_text_input(text):
+            raise ValueError(
+                "text input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) "
+                "or `list[list[str]]` (batch of pretokenized examples) or `list[tuple[list[str], list[str]]]` (batch of pretokenized sequence pairs)."
+            )
+
+        if text_pair is not None and not _is_valid_text_input(text_pair):
+            raise ValueError(
+                "text input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) "
+                "or `list[list[str]]` (batch of pretokenized examples) or `list[tuple[list[str], list[str]]]` (batch of pretokenized sequence pairs)."
+            )
+
+        # Batch detection (from _call_one)
+        if is_split_into_words:
+            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
+        else:
+            is_batched = isinstance(text, (list, tuple))
+
+        if is_batched:
+            # Batch validation
+            if isinstance(text_pair, str):
+                raise TypeError(
+                    "when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as"
+                    " `text`."
+                )
+            if text_pair is not None and len(text) != len(text_pair):
+                raise ValueError(
+                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
+                    f" {len(text_pair)}."
+                )
+            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
+        else:
+            # Single input - convert to batch format
+            batch_text_or_text_pairs = [(text, text_pair)] if text_pair else [text]
+
+        # Set tokenizer configuration (from _batch_encode_plus)
         if not isinstance(batch_text_or_text_pairs, (tuple, list)):
             raise TypeError(
                 f"batch_text_or_text_pairs has to be a list or a tuple (got {type(batch_text_or_text_pairs)})"
             )
 
-        # Set the truncation and padding strategy and restore the initial configuration
         self.set_truncation_and_padding(
             padding_strategy=padding_strategy,
             truncation_strategy=truncation_strategy,
@@ -546,21 +759,21 @@ def _batch_encode_plus(
             padding_side=padding_side,
         )
 
+        # Use self.split_special_tokens as default if not explicitly provided
+        if split_special_tokens is None:
+            split_special_tokens = self.split_special_tokens
+
         if self._tokenizer.encode_special_tokens != split_special_tokens:
             self._tokenizer.encode_special_tokens = split_special_tokens
 
+        # Direct rust backend call
         encodings = self._tokenizer.encode_batch(
             batch_text_or_text_pairs,
             add_special_tokens=add_special_tokens,
             is_pretokenized=is_split_into_words,
         )
 
-        # Convert encoding to dict
-        # `Tokens` has type: tuple[
-        #                       list[dict[str, list[list[int]]]] or list[dict[str, 2D-Tensor]],
-        #                       list[EncodingFast]
-        #                    ]
-        # with nested dimensions corresponding to batch, overflows, sequence length
+        # Convert encodings to BatchEncoding format
         tokens_and_encodings = [
             self._convert_encoding(
                 encoding=encoding,
@@ -575,12 +788,7 @@ def _batch_encode_plus(
             for encoding in encodings
         ]
 
-        # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
-        # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
-        # (we say ~ because the number of overflow varies with the example in the batch)
-        #
-        # To match each overflowing sample with the original sample in the batch
-        # we add an overflow_to_sample_mapping array (see below)
+        # Convert the output to have dict[list] from list[dict]
         sanitized_tokens = {}
         for key in tokens_and_encodings[0][0]:
             stack = [e for item, _ in tokens_and_encodings for e in item[key]]
@@ -588,7 +796,6 @@ def _batch_encode_plus(
         sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
 
         # If returning overflowing tokens, we need to return a mapping
-        # from the batch idx to the original sample
         if return_overflowing_tokens:
             overflow_to_sample_mapping = []
             for i, (toks, _) in enumerate(tokens_and_encodings):
@@ -597,57 +804,11 @@ def _batch_encode_plus(
 
         for input_ids in sanitized_tokens["input_ids"]:
             self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
-        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
 
-    def _encode_plus(
-        self,
-        text: TextInput | PreTokenizedInput,
-        text_pair: TextInput | PreTokenizedInput | None = None,
-        add_special_tokens: bool = True,
-        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
-        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
-        max_length: int | None = None,
-        stride: int = 0,
-        is_split_into_words: bool = False,
-        pad_to_multiple_of: int | None = None,
-        padding_side: str | None = None,
-        return_tensors: bool | None = None,
-        return_token_type_ids: bool | None = None,
-        return_attention_mask: bool | None = None,
-        return_overflowing_tokens: bool = False,
-        return_special_tokens_mask: bool = False,
-        return_offsets_mapping: bool = False,
-        return_length: bool = False,
-        verbose: bool = True,
-        split_special_tokens: bool = False,
-        **kwargs,
-    ) -> BatchEncoding:
-        batched_input = [(text, text_pair)] if text_pair else [text]
-        batched_output = self._batch_encode_plus(
-            batched_input,
-            is_split_into_words=is_split_into_words,
-            add_special_tokens=add_special_tokens,
-            padding_strategy=padding_strategy,
-            truncation_strategy=truncation_strategy,
-            max_length=max_length,
-            stride=stride,
-            pad_to_multiple_of=pad_to_multiple_of,
-            padding_side=padding_side,
-            return_tensors=return_tensors,
-            return_token_type_ids=return_token_type_ids,
-            return_attention_mask=return_attention_mask,
-            return_overflowing_tokens=return_overflowing_tokens,
-            return_special_tokens_mask=return_special_tokens_mask,
-            return_offsets_mapping=return_offsets_mapping,
-            return_length=return_length,
-            verbose=verbose,
-            split_special_tokens=split_special_tokens,
-            **kwargs,
-        )
+        batched_output = BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
 
-        # Return tensor is None, then we can remove the leading batch axis
-        # Overflowing tokens are returned as a batch of output so we keep them in this case
-        if return_tensors is None and not return_overflowing_tokens:
+        # If single input, remove the batch dimension (unless returning overflowing tokens)
+        if not is_batched and return_tensors is None and not return_overflowing_tokens:
             batched_output = BatchEncoding(
                 {
                     key: (value[0] if len(value) > 0 and isinstance(value[0], list) else value)
@@ -656,8 +817,6 @@ def _encode_plus(
                 batched_output.encodings,
             )
 
-        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
-
         return batched_output
 
     def convert_tokens_to_string(self, tokens: list[str]) -> str:
@@ -674,22 +833,12 @@ def _decode(
         clean_up_tokenization_spaces: bool | None = None,
         **kwargs,
     ) -> str:
-        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
+        # Removed: use_source_tokenizer parameter (unused)
+        kwargs.pop("use_source_tokenizer", None)  # Pop if present to avoid errors
 
         if isinstance(token_ids, int):
             token_ids = [token_ids]
-        text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
-
-        clean_up_tokenization_spaces = (
-            clean_up_tokenization_spaces
-            if clean_up_tokenization_spaces is not None
-            else self.clean_up_tokenization_spaces
-        )
-        if clean_up_tokenization_spaces:
-            clean_text = self.clean_up_tokenization(text)
-            return clean_text
-        else:
-            return text
+        return self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
 
     def _save_pretrained(
         self,
@@ -698,45 +847,13 @@ def _save_pretrained(
         legacy_format: bool | None = None,
         filename_prefix: str | None = None,
     ) -> tuple[str, ...]:
-        """
-        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON
-        file containing {config + vocab + added-tokens}.
-        """
         save_directory = str(save_directory)
 
-        if self.slow_tokenizer_class is None and legacy_format is True:
-            raise ValueError(
-                "Your tokenizer does not have a legacy version defined and therefore cannot register this version. You"
-                " might consider leaving the legacy_format at `None` or setting it to `False`."
-            )
-
-        save_slow = (
-            (legacy_format is None or legacy_format is True)
-            and self.slow_tokenizer_class is not None
-            and self.can_save_slow_tokenizer
+        tokenizer_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE
         )
-        save_fast = legacy_format is None or legacy_format is False
-
-        if save_slow:
-            added_tokens_file = os.path.join(
-                save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
-            )
-            # make sure to be forward compatible
-            added_vocab = {tok: index for tok, index in self.added_tokens_encoder.items() if index >= self.vocab_size}
-            if added_vocab:
-                with open(added_tokens_file, "w", encoding="utf-8") as f:
-                    out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
-                    f.write(out_str)
-
-            vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
-            file_names = file_names + vocab_files + (added_tokens_file,)
-
-        if save_fast:
-            tokenizer_file = os.path.join(
-                save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE
-            )
-            self.backend_tokenizer.save(tokenizer_file)
-            file_names = file_names + (tokenizer_file,)
+        self.backend_tokenizer.save(tokenizer_file)
+        file_names = file_names + (tokenizer_file,)
 
         return file_names
 
@@ -889,10 +1006,8 @@ def train_new_from_iterator(
             tokenizer = TokenizerFast.from_str(json.dumps(trained_tokenizer_json))
 
         kwargs = self.init_kwargs.copy()
-        # Map pad/cls/mask token at the Transformers level
-        special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy()
-        special_tokens_list.remove("additional_special_tokens")
-        for token in special_tokens_list:
+        # V5: Map pad/cls/mask token at the Transformers level (named tokens only)
+        for token in PreTrainedTokenizerBase.SPECIAL_TOKENS_ATTRIBUTES:
             if getattr(self, token) is not None:
                 special_token = getattr(self, token)
                 if special_tokens_map is not None and special_token in special_tokens_map:
@@ -912,10 +1027,156 @@ def train_new_from_iterator(
                 else:
                     kwargs[token] = special_token
 
-        additional_special_tokens = self.additional_special_tokens
+        # V5: Handle extra special tokens
+        extra_special_tokens = self.extra_special_tokens.copy() if self.extra_special_tokens else []
         if new_special_tokens is not None:
-            additional_special_tokens.extend(new_special_tokens)
-        if len(additional_special_tokens) > 0:
-            kwargs["additional_special_tokens"] = additional_special_tokens
+            extra_special_tokens.extend(new_special_tokens)
+        if len(extra_special_tokens) > 0:
+            kwargs["extra_special_tokens"] = extra_special_tokens
+
+        # Always try to pass tokenizer_object in kwargs first (standard TokenizersBackend usage)
+        # If the class creates its own tokenizer and passes it explicitly to super().__init__(),
+        # this will cause a TypeError, which we catch and handle by removing tokenizer_object
+        # from kwargs and setting _tokenizer directly after initialization.
+        kwargs["tokenizer_object"] = tokenizer
+        try:
+            return self.__class__(**kwargs)
+        except TypeError as e:
+            # Check if the error is due to multiple values for tokenizer_object
+            if "multiple values for keyword argument 'tokenizer_object'" in str(e):
+                # Class creates its own tokenizer and passes it explicitly (like LayoutLMv3Tokenizer)
+                # Remove tokenizer_object from kwargs and set _tokenizer directly
+                kwargs.pop("tokenizer_object", None)
+                new_tokenizer = self.__class__(**kwargs)
+                new_tokenizer._tokenizer = tokenizer
+                return new_tokenizer
+            else:
+                # Some other TypeError, re-raise it
+                raise
+
+    @classmethod
+    def _patch_mistral_regex(
+        cls,
+        tokenizer,
+        pretrained_model_name_or_path,
+        token=None,
+        cache_dir=None,
+        local_files_only=False,
+        _commit_hash=None,
+        is_local=False,
+        init_kwargs=None,
+        fix_mistral_regex=None,
+        **kwargs,
+    ):
+        """
+        Patches mistral related tokenizers with incorrect regex if detected
+            1) Local file with an associated config saved next to it
+                >> Model type one of the mistral models (on older versions)
+            2) Remote models on the hub from official mistral models
+                >> Tags including `base_model:.*mistralai`
+        """
+        import re
+
+        from huggingface_hub import model_info
+        from packaging import version
+
+        from transformers.utils.hub import cached_file
+
+        def is_base_mistral(model_id: str) -> bool:
+            model = model_info(model_id)
+            if model.tags is not None:
+                if re.search("base_model:.*mistralai", "".join(model.tags)):
+                    return True
+            return False
+
+        if pretrained_model_name_or_path is not None and (is_local or is_base_mistral(pretrained_model_name_or_path)):
+            _config_file = cached_file(
+                pretrained_model_name_or_path,
+                "config.json",
+                cache_dir=cache_dir,
+                token=token,
+                local_files_only=local_files_only,
+                _raise_exceptions_for_missing_entries=False,
+                _raise_exceptions_for_connection_errors=False,
+                _commit_hash=_commit_hash,
+            )
+
+            # Detected using a (local) mistral tokenizer
+            mistral_config_detected = False
+            if _config_file is not None:
+                with open(_config_file, encoding="utf-8") as f:
+                    _config = json.load(f)
+                transformers_version = _config.get("transformers_version")
+                transformers_model_type = _config.get("model_type")
+
+                # Detect if we can skip the mistral fix by
+                #   a) having a non-mistral tokenizer
+                #   b) fixed version of transformers
+                if transformers_version and version.parse(transformers_version) <= version.parse("4.57.2"):
+                    if (
+                        is_local
+                        and transformers_model_type is not None
+                        and transformers_model_type
+                        not in [
+                            "mistral",
+                            "mistral3",
+                            "voxtral",
+                            "ministral",
+                            "pixtral",
+                        ]
+                    ):
+                        return tokenizer
+                elif transformers_version and version.parse(transformers_version) >= version.parse("5.0.0"):
+                    return tokenizer
+
+                mistral_config_detected = True
+
+            if mistral_config_detected or (not is_local and is_base_mistral(pretrained_model_name_or_path)):
+                # Expose the `fix_mistral_regex` flag on the tokenizer when provided, even if no correction is applied.
+                if init_kwargs and "fix_mistral_regex" in init_kwargs:
+                    setattr(tokenizer, "fix_mistral_regex", init_kwargs["fix_mistral_regex"])
+
+                # only warn if its not explicitly passed
+                if fix_mistral_regex is None and not getattr(tokenizer, "fix_mistral_regex", False):
+                    setattr(tokenizer, "fix_mistral_regex", False)
+                    logger.warning(
+                        f"The tokenizer you are loading from '{pretrained_model_name_or_path}'"
+                        f" with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e."
+                        " This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue."
+                    )
+                elif fix_mistral_regex is True or getattr(tokenizer, "fix_mistral_regex", False):
+                    setattr(tokenizer, "fix_mistral_regex", True)
+                    import tokenizers
+
+                    split_pretokenizer = tokenizers.pre_tokenizers.Split(
+                        pattern=tokenizers.Regex(
+                            r"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+"
+                        ),
+                        behavior="isolated",
+                    )
+                    current_pretokenizer = tokenizer.backend_tokenizer.pre_tokenizer
+                    # Check if it's already a Sequence
+                    if isinstance(current_pretokenizer, tokenizers.pre_tokenizers.Sequence):
+                        # Replace the first element (the Split pattern)
+                        tokenizer.backend_tokenizer.pre_tokenizer[0] = split_pretokenizer
+                    else:
+                        # Replace Metaspace with ByteLevel when adding Split, as Metaspace(split=False) doesn't
+                        # work correctly with the Split pre-tokenizer and causes spaces to be lost during encoding
+                        if isinstance(current_pretokenizer, tokenizers.pre_tokenizers.Metaspace):
+                            current_pretokenizer = tokenizers.pre_tokenizers.ByteLevel(
+                                add_prefix_space=False, use_regex=False
+                            )
+
+                        # Not a Sequence, so create one with Split + current pretokenizer
+                        tokenizer.backend_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence(
+                            [
+                                split_pretokenizer,
+                                current_pretokenizer,
+                            ]
+                        )
+
+        return tokenizer
+
 
-        return self.__class__(tokenizer_object=tokenizer, **kwargs)
+# Backward-compatible alias: allow referring to TokenizersBackend as PreTrainedTokenizerFast
+PreTrainedTokenizerFast = TokenizersBackend
diff --git a/src/transformers/utils/dummy_mistral_common_objects.py b/src/transformers/utils/dummy_mistral_common_objects.py
index 0c9a5a3a9db8..36cdb01cea45 100644
--- a/src/transformers/utils/dummy_mistral_common_objects.py
+++ b/src/transformers/utils/dummy_mistral_common_objects.py
@@ -2,7 +2,7 @@
 from ..utils import DummyObject, requires_backends
 
 
-class MistralCommonTokenizer(metaclass=DummyObject):
+class MistralCommonBackend(metaclass=DummyObject):
     _backends = ["mistral-common"]
 
     def __init__(self, *args, **kwargs):
diff --git a/src/transformers/utils/dummy_sentencepiece_objects.py b/src/transformers/utils/dummy_sentencepiece_objects.py
deleted file mode 100644
index 7931e0fe6584..000000000000
--- a/src/transformers/utils/dummy_sentencepiece_objects.py
+++ /dev/null
@@ -1,254 +0,0 @@
-# This file is autogenerated by the command `make fix-copies`, do not edit.
-from ..utils import DummyObject, requires_backends
-
-
-class AlbertTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class BarthezTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class BartphoTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class BertGenerationTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class BigBirdTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class CamembertTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class CodeLlamaTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class CpmTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class DebertaV2Tokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class ErnieMTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class XLMProphetNetTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class FNetTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class GemmaTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class GPTSw3Tokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class LayoutXLMTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class LlamaTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class M2M100Tokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class MarianTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class MBartTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class MBart50Tokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class MLukeTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class MT5Tokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class NllbTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class PegasusTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class PLBartTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class ReformerTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class RemBertTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class SeamlessM4TTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class SiglipTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class Speech2TextTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class SpeechT5Tokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class T5Tokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class UdopTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class XGLMTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class XLMRobertaTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
-
-
-class XLNetTokenizer(metaclass=DummyObject):
-    _backends = ["sentencepiece"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["sentencepiece"])
diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py
index ba1e35a7d14c..406ef76b8a32 100644
--- a/src/transformers/utils/hub.py
+++ b/src/transformers/utils/hub.py
@@ -84,7 +84,10 @@ class DownloadKwargs(TypedDict, total=False):
 
 
 def is_offline_mode():
-    return constants.HF_HUB_OFFLINE
+    # Import inside the function so test patches on `huggingface_hub.constants` are picked up.
+    from huggingface_hub import constants as hf_hub_constants
+
+    return hf_hub_constants.HF_HUB_OFFLINE
 
 
 # Determine default cache directory.
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index 4369b4d84424..e0cd8a4f5f5a 100644
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -1941,10 +1941,91 @@ def call(self, *args, **kwargs):
             try:
                 module = self._get_module(self._class_to_module[name])
                 value = getattr(module, name)
-            except (ModuleNotFoundError, RuntimeError) as e:
-                raise ModuleNotFoundError(
-                    f"Could not import module '{name}'. Are this object's requirements defined correctly?"
-                ) from e
+            except (ModuleNotFoundError, RuntimeError, AttributeError) as e:
+                # V5: If trying to import a *TokenizerFast symbol, transparently fall back to the
+                # non-Fast symbol from the same module when available. This lets us keep only one
+                # backend tokenizer class while preserving legacy public names.
+                if name.endswith("TokenizerFast"):
+                    fallback_name = name[:-4]
+                    # Prefer importing the module that declares the fallback symbol if known
+                    try:
+                        if fallback_name in self._class_to_module:
+                            fb_module = self._get_module(self._class_to_module[fallback_name])
+                            fallback_value = getattr(fb_module, fallback_name)
+                        else:
+                            module = self._get_module(self._class_to_module[name])
+                            fallback_value = getattr(module, fallback_name)
+                        setattr(self, fallback_name, fallback_value)
+                        value = fallback_value
+                    except Exception:
+                        # If we can't find the fallback here, try converter logic as a last resort
+                        # before giving up
+                        value = None
+                        # Try converter mapping for Fast tokenizers that don't exist
+                        if value is None and name.endswith("TokenizerFast"):
+                            lookup_name = name[:-4]
+                            try:
+                                from ..convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS
+
+                                if lookup_name in SLOW_TO_FAST_CONVERTERS:
+                                    converter_class = SLOW_TO_FAST_CONVERTERS[lookup_name]
+                                    converter_base_name = converter_class.__name__.replace("Converter", "")
+                                    preferred_tokenizer_name = f"{converter_base_name}Tokenizer"
+
+                                    candidate_names = [preferred_tokenizer_name]
+                                    for tokenizer_name, tokenizer_converter in SLOW_TO_FAST_CONVERTERS.items():
+                                        if tokenizer_converter is converter_class and tokenizer_name != lookup_name:
+                                            if tokenizer_name not in candidate_names:
+                                                candidate_names.append(tokenizer_name)
+
+                                    # Try to import the preferred candidate directly
+                                    import importlib
+
+                                    for candidate_name in candidate_names:
+                                        base_tokenizer_class = None
+
+                                        # Try to derive module path from tokenizer name (e.g., "AlbertTokenizer" -> "albert")
+                                        # Remove "Tokenizer" suffix and convert to lowercase
+                                        if candidate_name.endswith("Tokenizer"):
+                                            model_name = candidate_name[:-10].lower()  # Remove "Tokenizer"
+                                            module_path = f"transformers.models.{model_name}.tokenization_{model_name}"
+                                            try:
+                                                module = importlib.import_module(module_path)
+                                                base_tokenizer_class = getattr(module, candidate_name)
+                                            except Exception:
+                                                pass
+
+                                        # Fallback: try via _class_to_module
+                                        if base_tokenizer_class is None and candidate_name in self._class_to_module:
+                                            try:
+                                                alias_module = self._get_module(self._class_to_module[candidate_name])
+                                                base_tokenizer_class = getattr(alias_module, candidate_name)
+                                            except Exception:
+                                                continue
+
+                                        # If we still don't have base_tokenizer_class, skip this candidate
+                                        if base_tokenizer_class is None:
+                                            continue
+
+                                        # If we got here, we have base_tokenizer_class
+                                        value = base_tokenizer_class
+
+                                        setattr(self, candidate_name, base_tokenizer_class)
+                                        if lookup_name != candidate_name:
+                                            setattr(self, lookup_name, value)
+                                        setattr(self, name, value)
+                                        break
+                            except Exception:
+                                pass
+
+                        if value is None:
+                            raise ModuleNotFoundError(
+                                f"Could not import module '{name}'. Are this object's requirements defined correctly?"
+                            ) from e
+                else:
+                    raise ModuleNotFoundError(
+                        f"Could not import module '{name}'. Are this object's requirements defined correctly?"
+                    ) from e
 
         elif name in self._modules:
             try:
@@ -1954,10 +2035,89 @@ def call(self, *args, **kwargs):
                     f"Could not import module '{name}'. Are this object's requirements defined correctly?"
                 ) from e
         else:
+            # V5: If a *TokenizerFast symbol is requested but not present in the import structure,
+            # try to resolve to the corresponding non-Fast symbol's module if available.
+            if name.endswith("TokenizerFast"):
+                fallback_name = name[:-4]
+                if fallback_name in self._class_to_module:
+                    try:
+                        fb_module = self._get_module(self._class_to_module[fallback_name])
+                        value = getattr(fb_module, fallback_name)
+                        setattr(self, fallback_name, value)
+                        setattr(self, name, value)
+                        return value
+                    except Exception:
+                        pass
+            # V5: If a tokenizer class doesn't exist, check if it should alias to another tokenizer
+            # via the converter mapping (e.g., FNetTokenizer -> AlbertTokenizer via AlbertConverter)
             value = None
-            for key, values in self._explicit_import_shortcut.items():
-                if name in values:
-                    value = self._get_module(key)
+            if name.endswith("Tokenizer") or name.endswith("TokenizerFast"):
+                # Strip "Fast" suffix for converter lookup if present
+                lookup_name = name[:-4] if name.endswith("TokenizerFast") else name
+
+                try:
+                    # Lazy import to avoid circular dependencies
+                    from ..convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS
+
+                    # Check if this tokenizer has a converter mapping
+                    if lookup_name in SLOW_TO_FAST_CONVERTERS:
+                        converter_class = SLOW_TO_FAST_CONVERTERS[lookup_name]
+
+                        # Find which tokenizer class uses the same converter (reverse lookup)
+                        # Prefer the tokenizer that matches the converter name pattern
+                        # (e.g., AlbertConverter -> AlbertTokenizer)
+                        converter_base_name = converter_class.__name__.replace("Converter", "")
+                        preferred_tokenizer_name = f"{converter_base_name}Tokenizer"
+
+                        # Try preferred tokenizer first
+                        candidate_names = [preferred_tokenizer_name]
+                        # Then try all other tokenizers with the same converter
+                        for tokenizer_name, tokenizer_converter in SLOW_TO_FAST_CONVERTERS.items():
+                            if tokenizer_converter is converter_class and tokenizer_name != lookup_name:
+                                if tokenizer_name not in candidate_names:
+                                    candidate_names.append(tokenizer_name)
+
+                        # Try to import one of the candidate tokenizers
+                        for candidate_name in candidate_names:
+                            if candidate_name in self._class_to_module:
+                                try:
+                                    alias_module = self._get_module(self._class_to_module[candidate_name])
+                                    base_tokenizer_class = getattr(alias_module, candidate_name)
+                                    value = base_tokenizer_class
+
+                                    # Cache both names for future imports
+                                    setattr(self, candidate_name, base_tokenizer_class)
+                                    if lookup_name != candidate_name:
+                                        setattr(self, lookup_name, value)
+                                    setattr(self, name, value)
+                                    break
+                                except Exception:
+                                    # If this candidate fails, try the next one
+                                    continue
+                            else:
+                                # Candidate not in _class_to_module - might need recursive resolution
+                                # Try importing it directly to trigger lazy loading
+                                try:
+                                    # Try to get it from transformers module to trigger lazy loading
+                                    transformers_module = sys.modules.get("transformers")
+                                    if transformers_module and hasattr(transformers_module, candidate_name):
+                                        base_tokenizer_class = getattr(transformers_module, candidate_name)
+                                        value = base_tokenizer_class
+
+                                        if lookup_name != candidate_name:
+                                            setattr(self, lookup_name, value)
+                                        setattr(self, name, value)
+                                        break
+                                except Exception:
+                                    continue
+                except (ImportError, AttributeError):
+                    pass
+
+            if value is None:
+                for key, values in self._explicit_import_shortcut.items():
+                    if name in values:
+                        value = self._get_module(key)
+                        break
 
             if value is None:
                 raise AttributeError(f"module {self.__name__} has no attribute {name}")
@@ -2178,7 +2338,7 @@ def create_import_structure_from_path(module_path):
                 'configuration_albert': {'AlbertConfig'}
             },
             frozenset({'tokenizers'}): {
-                'tokenization_albert_fast': {'AlbertTokenizerFast'}
+                'tokenization_albert_fast': {'AlbertTokenizer'}
             },
         },
         'align': {
@@ -2354,7 +2514,7 @@ def spread_import_structure(nested_import_structure):
                 'configuration_albert': {'AlbertConfig'}
             },
             frozenset({'tokenizers'}): {
-                'tokenization_albert_fast': {'AlbertTokenizerFast'}
+                'tokenization_albert_fast': {'AlbertTokenizer'}
             },
         },
         'align': {
@@ -2375,7 +2535,7 @@ def spread_import_structure(nested_import_structure):
 
     {
         frozenset({'tokenizers'}): {
-            'albert.tokenization_albert_fast': {'AlbertTokenizerFast'}
+            'albert.tokenization_albert_fast': {'AlbertTokenizer'}
         },
         frozenset(): {
             'albert.configuration_albert': {'AlbertConfig'},
@@ -2476,7 +2636,7 @@ def define_import_structure(module_path: str, prefix: str | None = None) -> IMPO
 
     {
         frozenset({'tokenizers'}): {
-            'albert.tokenization_albert_fast': {'AlbertTokenizerFast'}
+            'albert.tokenization_albert_fast': {'AlbertTokenizer'}
         },
         frozenset(): {
             'albert.configuration_albert': {'AlbertConfig'},
diff --git a/src/transformers/video_utils.py b/src/transformers/video_utils.py
index 49f4d74a6244..0593ef98f4db 100644
--- a/src/transformers/video_utils.py
+++ b/src/transformers/video_utils.py
@@ -594,6 +594,8 @@ def sample_indices_fn(metadata, **kwargs):
     requires_backends(read_video_torchcodec, ["torchcodec"])
     from torchcodec.decoders import VideoDecoder
 
+    # VideoDecoder expects a string for device, default to "cpu" if None
+
     decoder = VideoDecoder(
         video_path,
         # Interestingly `exact` mode takes less than approximate when we load the whole video
diff --git a/tests/generation/test_stopping_criteria.py b/tests/generation/test_stopping_criteria.py
index b9527327b71d..588eac04c963 100644
--- a/tests/generation/test_stopping_criteria.py
+++ b/tests/generation/test_stopping_criteria.py
@@ -225,7 +225,7 @@ def test_single_letter_stop_string(self):
         true_strings = ["a", "baa", "abc"]  # "abc" is a single token
         false_strings = ["abbbbbbb", "b"]  # "abbbbbbb" is split into multiple tokens
         stop_strings = ["a"]
-        tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", add_prefix_space=False)
         tokenizer.pad_token_id = tokenizer.eos_token_id
         tokenizer.padding_side = "left"
 
diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index d1f14ad0c9c8..898c1c2c28f0 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -4779,6 +4779,7 @@ def test_prompts(self, name, input, expected):
 
         healed_ids = completion_model.heal_tokens(input_ids, tokenizer=tokenizer)
         predicted = tokenizer.decode(healed_ids[0], skip_special_tokens=True)
+        predicted = predicted.lstrip()
 
         self.assertEqual(predicted, expected)
 
diff --git a/tests/models/albert/test_tokenization_albert.py b/tests/models/albert/test_tokenization_albert.py
index 403d3e86e640..0a7e6f4c0263 100644
--- a/tests/models/albert/test_tokenization_albert.py
+++ b/tests/models/albert/test_tokenization_albert.py
@@ -14,8 +14,9 @@
 
 import unittest
 
-from transformers import AlbertTokenizer, AlbertTokenizerFast
-from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
+from transformers import AlbertTokenizer
+from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers
+from transformers.tokenization_utils_sentencepiece import SentencePieceExtractor
 
 from ...test_tokenization_common import TokenizerTesterMixin
 
@@ -28,106 +29,28 @@
 class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_id = "albert/albert-base-v1"
     tokenizer_class = AlbertTokenizer
-    rust_tokenizer_class = AlbertTokenizerFast
-    test_rust_tokenizer = True
-    test_sentencepiece = True
-    test_sentencepiece_ignore_case = True
+
+    # Integration test data - expected outputs for the default input string
+    integration_expected_tokens = ['▁this', '▁is', '▁a', '▁test', '▁', '😊', '▁i', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁false', '.', '▁', '生活的真谛是', '▁hi', '▁hello', '▁hi', '▁hello', '▁hello', '▁', '<', 's', '>', '▁hi', '<', 's', '>', 'there', '▁the', '▁following', '▁string', '▁should', '▁be', '▁properly', '▁encoded', ':', '▁hello', '.', '▁but', '▁i', 'rd', '▁and', '▁', 'ป', '▁i', 'rd', '▁', 'ด', '▁hey', '▁how', '▁are', '▁you', '▁doing']  # fmt: skip
+    integration_expected_token_ids = [48, 25, 21, 1289, 13, 1, 31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 4997, 9, 13, 1, 4148, 10975, 4148, 10975, 10975, 13, 1, 18, 1, 4148, 1, 18, 1, 1887, 14, 249, 3724, 378, 44, 7428, 13665, 45, 10975, 9, 47, 31, 897, 17, 13, 1, 31, 897, 13, 1, 8409, 184, 50, 42, 845]  # fmt: skip
+    integration_expected_decoded_text = "this is a test  i was born in 92000, and this is false.  hi hello hi hello hello s histhere the following string should be properly encoded: hello. but ird and  ird  hey how are you doing"
 
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
 
-        # We have a SentencePiece fixture for testing
-        tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
-        tokenizer.save_pretrained(cls.tmpdirname)
-
-    def get_input_output_texts(self, tokenizer):
-        input_text = "this is a test"
-        output_text = "this is a test"
-        return input_text, output_text
-
-    def test_convert_token_and_id(self):
-        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
-        token = ""
-        token_id = 0
-
-        self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
-        self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
-
-    def test_get_vocab(self):
-        vocab_keys = list(self.get_tokenizer().get_vocab().keys())
-
-        self.assertEqual(vocab_keys[0], "")
-        self.assertEqual(vocab_keys[1], "")
-        self.assertEqual(vocab_keys[-1], "▁eloquent")
-        self.assertEqual(len(vocab_keys), 30_000)
-
-    def test_vocab_size(self):
-        self.assertEqual(self.get_tokenizer().vocab_size, 30_000)
-
-    def test_rust_and_python_full_tokenizers(self):
-        if not self.test_rust_tokenizer:
-            self.skipTest(reason="test_rust_tokenizer is set to False")
-
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer()
-
-        sequence = "I was born in 92000, and this is falsé."
+        from_pretrained_id = "albert/albert-base-v1"
 
-        tokens = tokenizer.tokenize(sequence)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        rust_tokenizer = self.get_rust_tokenizer()
-        ids = tokenizer.encode(sequence)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-    def test_full_tokenizer(self):
-        tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)
-
-        tokens = tokenizer.tokenize("This is a test")
-        self.assertListEqual(tokens, ["▁this", "▁is", "▁a", "▁test"])
-
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [48, 25, 21, 1289])
-
-        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
-        self.assertListEqual(
-            tokens, ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "."]
-        )
-        ids = tokenizer.convert_tokens_to_ids(tokens)
-        self.assertListEqual(ids, [31, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])
-
-        back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(
-            back_tokens,
-            ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "."],
-        )
-
-    def test_sequence_builders(self):
-        tokenizer = AlbertTokenizer(SAMPLE_VOCAB)
-
-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
-
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
+        tokenizer = AlbertTokenizer.from_pretrained(from_pretrained_id)
+        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.save_pretrained(cls.tmpdirname)
 
-        assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
-        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [
-            tokenizer.sep_token_id
-        ]
+        # Build backend for slow tokenizer from the fast tokenizer's SentencePiece model
+        vocab_file = getattr(tokenizer, "vocab_file", None)
 
-    @slow
-    def test_tokenizer_integration(self):
-        expected_encoding = {'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'input_ids': [[2, 21970, 13, 5, 6092, 167, 28, 7103, 2153, 673, 8, 7028, 12051, 18, 17, 7103, 2153, 673, 8, 3515, 18684, 8, 4461, 6, 1927, 297, 8, 12060, 2607, 18, 13, 5, 4461, 15, 10538, 38, 8, 135, 15, 822, 58, 15, 993, 10363, 15, 1460, 8005, 4461, 15, 993, 255, 2328, 9, 9, 9, 6, 26, 1112, 816, 3260, 13, 5, 103, 2377, 6, 17, 1112, 816, 2782, 13, 5, 103, 10641, 6, 29, 84, 2512, 2430, 782, 18684, 2761, 19, 808, 2430, 2556, 17, 855, 1480, 9477, 4091, 128, 11712, 15, 7103, 2153, 673, 17, 24883, 9990, 9, 3], [2, 11502, 25, 1006, 20, 782, 8, 11809, 855, 1732, 19393, 18667, 37, 367, 21018, 69, 1854, 34, 11860, 19124, 27, 156, 225, 17, 193, 4141, 19, 65, 9124, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 14, 2231, 886, 2385, 17659, 84, 14, 16792, 1952, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # fmt: skip
+        extractor = SentencePieceExtractor(vocab_file)
+        vocab_ids, vocab_scores, merges = extractor.extract()
+        tokenizer_from_vocab = AlbertTokenizer(vocab=vocab_ids, merges=merges)
+        tokenizer_from_vocab.pad_token = tokenizer_from_vocab.eos_token
 
-        self.tokenizer_integration_test_util(
-            expected_encoding=expected_encoding,
-            model_name="albert/albert-base-v2",
-            revision="6b6560eaf5ff2e250b00c50f380c5389a9c2d82e",
-        )
+        cls.tokenizers = [tokenizer, tokenizer_from_vocab]
diff --git a/tests/models/aria/test_processing_aria.py b/tests/models/aria/test_processing_aria.py
index 0fa5143da518..64d6087415f9 100644
--- a/tests/models/aria/test_processing_aria.py
+++ b/tests/models/aria/test_processing_aria.py
@@ -112,9 +112,11 @@ def test_process_interleaved_images_prompts_no_image_splitting(self):
         inputs = processor(text=text, images=self.image1)
 
         # fmt: off
-        tokenized_sentence = processor.tokenizer(text_str, add_special_tokens=False)
+        # The processor expands <|img|> to <|img|><|img|> (image_seq_len=2) before tokenization
+        # So we need to tokenize the full expanded string to match what the processor does
+        expanded_text = self.image_token * self.image_seq_len + text_str
 
-        expected_input_ids = [[self.image_token_id] * self.image_seq_len + tokenized_sentence["input_ids"]]
+        expected_input_ids = [processor.tokenizer(expanded_text, add_special_tokens=False)["input_ids"]]
         # self.assertEqual(len(inputs["input_ids"]), len(expected_input_ids))
 
         self.assertEqual(inputs["input_ids"], expected_input_ids)
diff --git a/tests/models/auto/test_processor_auto.py b/tests/models/auto/test_processor_auto.py
index ff27cd3a8a0b..63f28d3dea9d 100644
--- a/tests/models/auto/test_processor_auto.py
+++ b/tests/models/auto/test_processor_auto.py
@@ -50,11 +50,11 @@
 from transformers.models.auto.image_processing_auto import get_image_processor_config
 from transformers.models.auto.video_processing_auto import get_video_processor_config
 from transformers.testing_utils import TOKEN, TemporaryHubRepo, get_tests_dir, is_staging_test
-from transformers.tokenization_utils import TOKENIZER_CONFIG_FILE
+from transformers.tokenization_python import TOKENIZER_CONFIG_FILE
+from transformers.tokenization_utils_sentencepiece import SentencePieceExtractor
 from transformers.utils import (
     FEATURE_EXTRACTOR_NAME,
     PROCESSOR_NAME,
-    is_tokenizers_available,
 )
 
 
@@ -243,18 +243,14 @@ def test_from_pretrained_dynamic_processor(self):
 
         tokenizer = processor.tokenizer
         self.assertTrue(tokenizer.special_attribute_present)
-        if is_tokenizers_available():
-            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
+        self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
 
-            # Test we can also load the slow version
-            new_processor = AutoProcessor.from_pretrained(
-                "hf-internal-testing/test_dynamic_processor_updated", trust_remote_code=True, use_fast=False
-            )
-            new_tokenizer = new_processor.tokenizer
-            self.assertTrue(new_tokenizer.special_attribute_present)
-            self.assertEqual(new_tokenizer.__class__.__name__, "NewTokenizer")
-        else:
-            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
+        new_processor = AutoProcessor.from_pretrained(
+            "hf-internal-testing/test_dynamic_processor", trust_remote_code=True, use_fast=False
+        )
+        new_tokenizer = new_processor.tokenizer
+        self.assertTrue(new_tokenizer.special_attribute_present)
+        self.assertEqual(new_tokenizer.__class__.__name__, "NewTokenizerFast")
 
     def test_new_processor_registration(self):
         try:
@@ -422,7 +418,7 @@ def __init__(self, feature_extractor, tokenizer):
 
     def test_auto_processor_creates_tokenizer(self):
         processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-bert")
-        self.assertEqual(processor.__class__.__name__, "BertTokenizerFast")
+        self.assertEqual(processor.__class__.__name__, "BertTokenizer")
 
     def test_auto_processor_creates_image_processor(self):
         processor = AutoProcessor.from_pretrained("hf-internal-testing/tiny-random-convnext")
@@ -526,7 +522,10 @@ def test_push_to_hub_dynamic_processor(self):
 
     def test_push_to_hub_with_chat_templates(self):
         with tempfile.TemporaryDirectory() as tmp_dir:
-            tokenizer = LlamaTokenizer(SAMPLE_VOCAB_LLAMA, keep_accents=True)
+            # Extract vocab and merges from SentencePiece model
+            extractor = SentencePieceExtractor(SAMPLE_VOCAB_LLAMA)
+            vocab_ids, vocab_scores, merges = extractor.extract()
+            tokenizer = LlamaTokenizer(vocab=vocab_scores, merges=merges)
             image_processor = SiglipImageProcessor()
             chat_template = "default dummy template for testing purposes only"
             processor = LlavaProcessor(
diff --git a/tests/models/auto/test_tokenization_auto.py b/tests/models/auto/test_tokenization_auto.py
index ed5c38a03e73..69a8610affa9 100644
--- a/tests/models/auto/test_tokenization_auto.py
+++ b/tests/models/auto/test_tokenization_auto.py
@@ -32,13 +32,12 @@
     BertTokenizerFast,
     CTRLTokenizer,
     GPT2Tokenizer,
-    GPT2TokenizerFast,
     PreTrainedTokenizerFast,
     Qwen2Tokenizer,
     Qwen2TokenizerFast,
     Qwen3MoeConfig,
     RobertaTokenizer,
-    RobertaTokenizerFast,
+    TokenizersBackend,
     is_tokenizers_available,
     logging,
 )
@@ -80,23 +79,23 @@ def test_tokenizer_from_pretrained(self):
         for model_name in ("google-bert/bert-base-uncased", "google-bert/bert-base-cased"):
             tokenizer = AutoTokenizer.from_pretrained(model_name)
             self.assertIsNotNone(tokenizer)
-            self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
+            self.assertIsInstance(tokenizer, (BertTokenizer))
             self.assertGreater(len(tokenizer), 0)
 
         for model_name in ["openai-community/gpt2", "openai-community/gpt2-medium"]:
             tokenizer = AutoTokenizer.from_pretrained(model_name)
             self.assertIsNotNone(tokenizer)
-            self.assertIsInstance(tokenizer, (GPT2Tokenizer, GPT2TokenizerFast))
+            self.assertIsInstance(tokenizer, (GPT2Tokenizer))
             self.assertGreater(len(tokenizer), 0)
 
     def test_tokenizer_from_pretrained_identifier(self):
         tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
-        self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
+        self.assertIsInstance(tokenizer, (BertTokenizer))
         self.assertEqual(tokenizer.vocab_size, 12)
 
     def test_tokenizer_from_model_type(self):
         tokenizer = AutoTokenizer.from_pretrained(DUMMY_UNKNOWN_IDENTIFIER)
-        self.assertIsInstance(tokenizer, (RobertaTokenizer, RobertaTokenizerFast))
+        self.assertIsInstance(tokenizer, (RobertaTokenizer))
         self.assertEqual(tokenizer.vocab_size, 20)
 
     def test_tokenizer_from_tokenizer_class(self):
@@ -104,7 +103,7 @@ def test_tokenizer_from_tokenizer_class(self):
         self.assertIsInstance(config, RobertaConfig)
         # Check that tokenizer_type ≠ model_type
         tokenizer = AutoTokenizer.from_pretrained(DUMMY_DIFF_TOKENIZER_IDENTIFIER, config=config)
-        self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
+        self.assertIsInstance(tokenizer, (BertTokenizer))
         self.assertEqual(tokenizer.vocab_size, 12)
 
     def test_tokenizer_from_type(self):
@@ -127,14 +126,14 @@ def test_tokenizer_from_type_fast(self):
             shutil.copy("./tests/fixtures/vocab.txt", os.path.join(tmp_dir, "vocab.txt"))
 
             tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="bert")
-            self.assertIsInstance(tokenizer, BertTokenizerFast)
+            self.assertIsInstance(tokenizer, PreTrainedTokenizerFast)
 
         with tempfile.TemporaryDirectory() as tmp_dir:
             shutil.copy("./tests/fixtures/vocab.json", os.path.join(tmp_dir, "vocab.json"))
             shutil.copy("./tests/fixtures/merges.txt", os.path.join(tmp_dir, "merges.txt"))
 
             tokenizer = AutoTokenizer.from_pretrained(tmp_dir, tokenizer_type="gpt2")
-            self.assertIsInstance(tokenizer, GPT2TokenizerFast)
+            self.assertIsInstance(tokenizer, PreTrainedTokenizerFast)
 
     def test_tokenizer_from_type_incorrect_name(self):
         with pytest.raises(ValueError):
@@ -142,21 +141,18 @@ def test_tokenizer_from_type_incorrect_name(self):
 
     @require_tokenizers
     def test_tokenizer_identifier_with_correct_config(self):
-        for tokenizer_class in [BertTokenizer, BertTokenizerFast, AutoTokenizer]:
+        for tokenizer_class in [BertTokenizer, AutoTokenizer]:
             tokenizer = tokenizer_class.from_pretrained("wietsedv/bert-base-dutch-cased")
-            self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
+            self.assertIsInstance(tokenizer, (BertTokenizer))
 
-            if isinstance(tokenizer, BertTokenizer):
-                self.assertEqual(tokenizer.basic_tokenizer.do_lower_case, False)
-            else:
-                self.assertEqual(tokenizer.do_lower_case, False)
+            self.assertEqual(tokenizer.do_lower_case, False)
 
             self.assertEqual(tokenizer.model_max_length, 512)
 
     @require_tokenizers
     @is_flaky()  # This one is flaky even with the new retry logic because it raises an unusual error
     def test_tokenizer_identifier_non_existent(self):
-        for tokenizer_class in [BertTokenizer, BertTokenizerFast, AutoTokenizer]:
+        for tokenizer_class in [BertTokenizer, AutoTokenizer]:
             with self.assertRaisesRegex(
                 EnvironmentError,
                 "julien-c/herlolip-not-exists is not a local folder and is not a valid model identifier",
@@ -170,12 +166,11 @@ def test_model_name_edge_cases_in_mappings(self):
         tokenizers = TOKENIZER_MAPPING.values()
         tokenizer_names = []
 
-        for slow_tok, fast_tok in tokenizers:
-            if slow_tok is not None:
-                tokenizer_names.append(slow_tok.__name__)
-
-            if fast_tok is not None:
-                tokenizer_names.append(fast_tok.__name__)
+        for tokenizer_entry in tokenizers:
+            candidates = tokenizer_entry if isinstance(tokenizer_entry, tuple) else (tokenizer_entry,)
+            for tokenizer_cls in candidates:
+                if tokenizer_cls is not None:
+                    tokenizer_names.append(tokenizer_cls.__name__)
 
         for tokenizer_name in tokenizer_names:
             # must find the right class
@@ -226,7 +221,7 @@ def test_PreTrainedTokenizerFast_from_pretrained(self):
 
     def test_auto_tokenizer_from_local_folder(self):
         tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
-        self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
+        self.assertIsInstance(tokenizer, (BertTokenizer))
         with tempfile.TemporaryDirectory() as tmp_dir:
             tokenizer.save_pretrained(tmp_dir)
             tokenizer2 = AutoTokenizer.from_pretrained(tmp_dir)
@@ -307,7 +302,7 @@ def test_new_tokenizer_registration(self):
                 tokenizer.save_pretrained(tmp_dir)
 
                 new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir)
-                self.assertIsInstance(new_tokenizer, CustomTokenizer)
+                self.assertIsInstance(new_tokenizer, TokenizersBackend)
 
         finally:
             if "custom" in CONFIG_MAPPING._extra_content:
@@ -320,18 +315,18 @@ def test_new_tokenizer_fast_registration(self):
         try:
             AutoConfig.register("custom", CustomConfig)
 
-            # Can register in two steps
+            # Can register in two steps (fast takes precedence)
             AutoTokenizer.register(CustomConfig, slow_tokenizer_class=CustomTokenizer)
-            self.assertEqual(TOKENIZER_MAPPING[CustomConfig], (CustomTokenizer, None))
+            self.assertEqual(TOKENIZER_MAPPING[CustomConfig], CustomTokenizer)
             AutoTokenizer.register(CustomConfig, fast_tokenizer_class=CustomTokenizerFast)
-            self.assertEqual(TOKENIZER_MAPPING[CustomConfig], (CustomTokenizer, CustomTokenizerFast))
+            self.assertEqual(TOKENIZER_MAPPING[CustomConfig], CustomTokenizerFast)
 
             del TOKENIZER_MAPPING._extra_content[CustomConfig]
             # Can register in one step
             AutoTokenizer.register(
                 CustomConfig, slow_tokenizer_class=CustomTokenizer, fast_tokenizer_class=CustomTokenizerFast
             )
-            self.assertEqual(TOKENIZER_MAPPING[CustomConfig], (CustomTokenizer, CustomTokenizerFast))
+            self.assertEqual(TOKENIZER_MAPPING[CustomConfig], CustomTokenizerFast)
 
             # Trying to register something existing in the Transformers library will raise an error
             with self.assertRaises(ValueError):
@@ -351,7 +346,7 @@ def test_new_tokenizer_fast_registration(self):
                 self.assertIsInstance(new_tokenizer, CustomTokenizerFast)
 
                 new_tokenizer = AutoTokenizer.from_pretrained(tmp_dir, use_fast=False)
-                self.assertIsInstance(new_tokenizer, CustomTokenizer)
+                self.assertIsInstance(new_tokenizer, CustomTokenizerFast)
 
         finally:
             if "custom" in CONFIG_MAPPING._extra_content:
@@ -393,7 +388,7 @@ def test_from_pretrained_dynamic_tokenizer(self):
                 "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True, use_fast=False
             )
             self.assertTrue(tokenizer.special_attribute_present)
-            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
+            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
             # Test tokenizer can be reloaded.
             with tempfile.TemporaryDirectory() as tmp_dir:
                 tokenizer.save_pretrained(tmp_dir)
@@ -405,8 +400,11 @@ def test_from_pretrained_dynamic_tokenizer(self):
                 with open(os.path.join(tmp_dir, "tokenizer_config.json"), "r") as f:
                     tokenizer_config = json.load(f)
                 # Assert we're pointing at local code and not another remote repo
-                self.assertEqual(tokenizer_config["auto_map"]["AutoTokenizer"], ["tokenization.NewTokenizer", None])
-            self.assertEqual(reloaded_tokenizer.__class__.__name__, "NewTokenizer")
+                self.assertEqual(
+                    tokenizer_config["auto_map"]["AutoTokenizer"],
+                    ["tokenization.NewTokenizer", "tokenization_fast.NewTokenizerFast"],
+                )
+            self.assertEqual(reloaded_tokenizer.__class__.__name__, "NewTokenizerFast")
             self.assertTrue(reloaded_tokenizer.special_attribute_present)
         else:
             self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
@@ -424,44 +422,24 @@ def test_from_pretrained_dynamic_tokenizer_conflict(self):
         class NewTokenizer(BertTokenizer):
             special_attribute_present = False
 
-        class NewTokenizerFast(BertTokenizerFast):
-            slow_tokenizer_class = NewTokenizer
-            special_attribute_present = False
-
         try:
             AutoConfig.register("custom", CustomConfig)
             AutoTokenizer.register(CustomConfig, slow_tokenizer_class=NewTokenizer)
-            AutoTokenizer.register(CustomConfig, fast_tokenizer_class=NewTokenizerFast)
             # If remote code is not set, the default is to use local
-            tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer")
-            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
-            self.assertFalse(tokenizer.special_attribute_present)
             tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/test_dynamic_tokenizer", use_fast=False)
             self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
             self.assertFalse(tokenizer.special_attribute_present)
 
-            # If remote code is disabled, we load the local one.
-            tokenizer = AutoTokenizer.from_pretrained(
-                "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=False
-            )
-            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
-            self.assertFalse(tokenizer.special_attribute_present)
             tokenizer = AutoTokenizer.from_pretrained(
                 "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=False, use_fast=False
             )
             self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
             self.assertFalse(tokenizer.special_attribute_present)
 
-            # If remote is enabled, we load from the Hub
-            tokenizer = AutoTokenizer.from_pretrained(
-                "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True
-            )
-            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
-            self.assertTrue(tokenizer.special_attribute_present)
             tokenizer = AutoTokenizer.from_pretrained(
                 "hf-internal-testing/test_dynamic_tokenizer", trust_remote_code=True, use_fast=False
             )
-            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
+            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
             self.assertTrue(tokenizer.special_attribute_present)
 
         finally:
@@ -483,7 +461,7 @@ def test_from_pretrained_dynamic_tokenizer_legacy_format(self):
                 "hf-internal-testing/test_dynamic_tokenizer_legacy", trust_remote_code=True, use_fast=False
             )
             self.assertTrue(tokenizer.special_attribute_present)
-            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
+            self.assertEqual(tokenizer.__class__.__name__, "NewTokenizerFast")
         else:
             self.assertEqual(tokenizer.__class__.__name__, "NewTokenizer")
 
diff --git a/tests/models/bart/test_tokenization_bart.py b/tests/models/bart/test_tokenization_bart.py
deleted file mode 100644
index 0910cf85c6d3..000000000000
--- a/tests/models/bart/test_tokenization_bart.py
+++ /dev/null
@@ -1,189 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import json
-import os
-import unittest
-from functools import cached_property
-
-from transformers import BartTokenizer, BartTokenizerFast, BatchEncoding
-from transformers.models.roberta.tokenization_roberta import VOCAB_FILES_NAMES
-from transformers.testing_utils import require_tokenizers, require_torch
-
-from ...test_tokenization_common import TokenizerTesterMixin, filter_roberta_detectors
-
-
-@require_tokenizers
-class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "facebook/bart-base"
-    tokenizer_class = BartTokenizer
-    rust_tokenizer_class = BartTokenizerFast
-    test_rust_tokenizer = True
-    from_pretrained_filter = filter_roberta_detectors
-    # from_pretrained_kwargs = {'add_prefix_space': True}
-
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-
-        vocab = [
-            "l",
-            "o",
-            "w",
-            "e",
-            "r",
-            "s",
-            "t",
-            "i",
-            "d",
-            "n",
-            "\u0120",
-            "\u0120l",
-            "\u0120n",
-            "\u0120lo",
-            "\u0120low",
-            "er",
-            "\u0120lowest",
-            "\u0120newer",
-            "\u0120wider",
-            "",
-        ]
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        cls.special_tokens_map = {"unk_token": ""}
-
-        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(cls.merges_file, "w", encoding="utf-8") as fp:
-            fp.write("\n".join(merges))
-
-    @classmethod
-    def get_tokenizer(cls, pretrained_name=None, **kwargs):
-        kwargs.update(cls.special_tokens_map)
-        pretrained_name = pretrained_name or cls.tmpdirname
-        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-
-    @classmethod
-    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
-        kwargs.update(cls.special_tokens_map)
-        pretrained_name = pretrained_name or cls.tmpdirname
-        return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-
-    def get_input_output_texts(self, tokenizer):
-        return "lower newer", "lower newer"
-
-    @cached_property
-    def default_tokenizer(self):
-        return BartTokenizer.from_pretrained("facebook/bart-large")
-
-    @cached_property
-    def default_tokenizer_fast(self):
-        return BartTokenizerFast.from_pretrained("facebook/bart-large")
-
-    @require_torch
-    def test_prepare_batch(self):
-        src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."]
-        expected_src_tokens = [0, 250, 251, 17818, 13, 39186, 1938, 4, 2]
-
-        for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
-            batch = tokenizer(src_text, max_length=len(expected_src_tokens), padding=True, return_tensors="pt")
-            self.assertIsInstance(batch, BatchEncoding)
-
-            self.assertEqual((2, 9), batch.input_ids.shape)
-            self.assertEqual((2, 9), batch.attention_mask.shape)
-            result = batch.input_ids.tolist()[0]
-            self.assertListEqual(expected_src_tokens, result)
-            # Test that special tokens are reset
-
-    @require_torch
-    def test_prepare_batch_empty_target_text(self):
-        src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."]
-        for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
-            batch = tokenizer(src_text, padding=True, return_tensors="pt")
-            # check if input_ids are returned and no labels
-            self.assertIn("input_ids", batch)
-            self.assertIn("attention_mask", batch)
-            self.assertNotIn("labels", batch)
-            self.assertNotIn("decoder_attention_mask", batch)
-
-    @require_torch
-    def test_tokenizer_as_target_length(self):
-        tgt_text = [
-            "Summary of the text.",
-            "Another summary.",
-        ]
-        for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
-            targets = tokenizer(text_target=tgt_text, max_length=32, padding="max_length", return_tensors="pt")
-            self.assertEqual(32, targets["input_ids"].shape[1])
-
-    @require_torch
-    def test_prepare_batch_not_longer_than_maxlen(self):
-        for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
-            batch = tokenizer(
-                ["I am a small frog" * 1024, "I am a small frog"], padding=True, truncation=True, return_tensors="pt"
-            )
-            self.assertIsInstance(batch, BatchEncoding)
-            self.assertEqual(batch.input_ids.shape, (2, 1024))
-
-    @require_torch
-    def test_special_tokens(self):
-        src_text = ["A long paragraph for summarization."]
-        tgt_text = [
-            "Summary of the text.",
-        ]
-        for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
-            inputs = tokenizer(src_text, return_tensors="pt")
-            targets = tokenizer(text_target=tgt_text, return_tensors="pt")
-            input_ids = inputs["input_ids"]
-            labels = targets["input_ids"]
-            self.assertTrue((input_ids[:, 0] == tokenizer.bos_token_id).all().item())
-            self.assertTrue((labels[:, 0] == tokenizer.bos_token_id).all().item())
-            self.assertTrue((input_ids[:, -1] == tokenizer.eos_token_id).all().item())
-            self.assertTrue((labels[:, -1] == tokenizer.eos_token_id).all().item())
-
-    @unittest.skip
-    def test_pretokenized_inputs(self):
-        pass
-
-    def test_embedded_special_tokens(self):
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-                sentence = "A,  AllenNLP sentence."
-                tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
-                tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
-
-                # token_type_ids should put 0 everywhere
-                self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
-
-                # attention_mask should put 1 everywhere, so sum over length should be 1
-                self.assertEqual(
-                    sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]),
-                    sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]),
-                )
-
-                tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
-                tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
-
-                self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
-                self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
-
-                self.assertSequenceEqual(
-                    tokens_p_str, ["", "A", ",", "", "ĠAllen", "N", "LP", "Ġsentence", ".", ""]
-                )
-                self.assertSequenceEqual(
-                    tokens_r_str, ["", "A", ",", "", "ĠAllen", "N", "LP", "Ġsentence", ".", ""]
-                )
diff --git a/tests/models/barthez/test_tokenization_barthez.py b/tests/models/barthez/test_tokenization_barthez.py
index 5df4131d7399..a4290e1f9503 100644
--- a/tests/models/barthez/test_tokenization_barthez.py
+++ b/tests/models/barthez/test_tokenization_barthez.py
@@ -1,4 +1,4 @@
-# Copyright 2020 Ecole Polytechnique and HuggingFace Inc. team.
+# Copyright 2019 Hugging Face inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,103 +14,29 @@
 
 import unittest
 
-from transformers import BarthezTokenizer, BarthezTokenizerFast, BatchEncoding
-from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow
+from transformers import BarthezTokenizer
+from transformers.testing_utils import require_sentencepiece, require_tokenizers
 
 from ...test_tokenization_common import TokenizerTesterMixin
 
 
-@require_tokenizers
 @require_sentencepiece
-@slow  # see https://github.com/huggingface/transformers/issues/11457
+@require_tokenizers
 class BarthezTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_id = "moussaKam/mbarthez"
     tokenizer_class = BarthezTokenizer
-    rust_tokenizer_class = BarthezTokenizerFast
-    test_rust_tokenizer = True
-    test_sentencepiece = True
+
+    integration_expected_tokens = ['▁This', '▁is', '▁a', '▁test', '▁', '😊', '▁I', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fals', 'é', '.', '▁', '生活的真谛是', '▁Hi', '▁Hello', '▁Hi', '▁Hello', '▁Hello', '', '▁hi', '', '▁there', '▁The', '▁following', '▁string', '▁should', '▁be', '▁properly', '▁en', 'code', 'd', ':', '▁Hello', '.', '▁But', '▁ir', 'd', '▁and', '▁', 'ปี', '▁ir', 'd', '▁', 'ด', '▁Hey', '▁how', '▁are', '▁you', '▁doing']  # fmt: skip
+    integration_expected_token_ids = [2078, 75, 10, 1938, 6, 3, 78, 402, 49997, 23, 387, 7648, 4, 124, 663, 75, 41564, 362, 5, 6, 3, 1739, 18324, 1739, 18324, 18324, 0, 901, 0, 1749, 451, 13564, 39363, 3354, 166, 72171, 22, 21077, 64, 12, 18324, 5, 3007, 172, 64, 124, 6, 3, 172, 64, 6, 3, 14833, 2271, 482, 329, 11028]  # fmt: skip
+    expected_tokens_from_ids = ['▁This', '▁is', '▁a', '▁test', '▁', '', '▁I', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fals', 'é', '.', '▁', '', '▁Hi', '▁Hello', '▁Hi', '▁Hello', '▁Hello', '', '▁hi', '', '▁there', '▁The', '▁following', '▁string', '▁should', '▁be', '▁properly', '▁en', 'code', 'd', ':', '▁Hello', '.', '▁But', '▁ir', 'd', '▁and', '▁', '', '▁ir', 'd', '▁', '', '▁Hey', '▁how', '▁are', '▁you', '▁doing']  # fmt: skip
+    integration_expected_decoded_text = "This is a test  I was born in 92000, and this is falsé.  Hi Hello Hi Hello Hello hi there The following string should be properly encoded: Hello. But ird and  ird  Hey how are you doing"
 
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
 
-        tokenizer = BarthezTokenizerFast.from_pretrained("moussaKam/mbarthez")
-        tokenizer.save_pretrained(cls.tmpdirname)
-        tokenizer.save_pretrained(cls.tmpdirname, legacy_format=False)
-        cls.tokenizer = tokenizer
-
-    def test_convert_token_and_id(self):
-        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
-        token = ""
-        token_id = 1
-
-        self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
-        self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
-
-    def test_get_vocab(self):
-        vocab_keys = list(self.get_tokenizer().get_vocab().keys())
-
-        self.assertEqual(vocab_keys[0], "")
-        self.assertEqual(vocab_keys[1], "")
-        self.assertEqual(vocab_keys[-1], "")
-        self.assertEqual(len(vocab_keys), 101_122)
-
-    def test_vocab_size(self):
-        self.assertEqual(self.get_tokenizer().vocab_size, 101_122)
-
-    @require_torch
-    def test_prepare_batch(self):
-        src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."]
-        expected_src_tokens = [0, 57, 3018, 70307, 91, 2]
-
-        batch = self.tokenizer(
-            src_text, max_length=len(expected_src_tokens), padding=True, truncation=True, return_tensors="pt"
-        )
-        self.assertIsInstance(batch, BatchEncoding)
+        from_pretrained_id = "moussaKam/mbarthez"
 
-        self.assertEqual((2, 6), batch.input_ids.shape)
-        self.assertEqual((2, 6), batch.attention_mask.shape)
-        result = batch.input_ids.tolist()[0]
-        self.assertListEqual(expected_src_tokens, result)
-
-    def test_rust_and_python_full_tokenizers(self):
-        if not self.test_rust_tokenizer:
-            self.skipTest(reason="test_rust_tokenizer is set to False")
-
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer()
-
-        sequence = "I was born in 92000, and this is falsé."
-
-        tokens = tokenizer.tokenize(sequence)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        rust_tokenizer = self.get_rust_tokenizer()
-        ids = tokenizer.encode(sequence)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-    @slow
-    def test_tokenizer_integration(self):
-        expected_encoding = {'input_ids': [[0, 490, 14328, 4507, 354, 47, 43669, 95, 25, 78117, 20215, 19779, 190, 22, 400, 4, 35343, 80310, 603, 86, 24937, 105, 33438, 94762, 196, 39642, 7, 15, 15933, 173, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 10534, 87, 25, 66, 3358, 196, 55289, 8, 82961, 81, 2204, 75203, 7, 15, 763, 12956, 216, 178, 14328, 9595, 1377, 69693, 7, 448, 71021, 196, 18106, 1437, 13974, 108, 9083, 4, 49315, 7, 39, 86, 1326, 2793, 46333, 4, 448, 196, 74588, 7, 49315, 7, 39, 21, 822, 38470, 74, 21, 66723, 62480, 8, 22050, 5, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}  # fmt: skip
-
-        # moussaKam/mbarthez is a french model. So we also use french texts.
-        sequences = [
-            "Le transformeur est un modèle d'apprentissage profond introduit en 2017, "
-            "utilisé principalement dans le domaine du traitement automatique des langues (TAL).",
-            "À l'instar des réseaux de neurones récurrents (RNN), les transformeurs sont conçus "
-            "pour gérer des données séquentielles, telles que le langage naturel, pour des tâches "
-            "telles que la traduction et la synthèse de texte.",
-        ]
-
-        self.tokenizer_integration_test_util(
-            expected_encoding=expected_encoding,
-            model_name="moussaKam/mbarthez",
-            revision="c2e4ecbca5e3cd2c37fe1ac285ca4fbdf1366fb6",
-            sequences=sequences,
-        )
+        tokenizer = BarthezTokenizer.from_pretrained(from_pretrained_id)
+        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.save_pretrained(cls.tmpdirname)
diff --git a/tests/models/bartpho/test_tokenization_bartpho.py b/tests/models/bartpho/test_tokenization_bartpho.py
index f78b17896cb6..e475cd4ae050 100644
--- a/tests/models/bartpho/test_tokenization_bartpho.py
+++ b/tests/models/bartpho/test_tokenization_bartpho.py
@@ -13,6 +13,7 @@
 
 
 import os
+import tempfile
 import unittest
 
 from transformers.models.bartpho.tokenization_bartpho import VOCAB_FILES_NAMES, BartphoTokenizer
@@ -33,23 +34,23 @@ class BartphoTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
+        cls.special_tokens_map = {"unk_token": ""}
+
+    @classmethod
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        """Create a fresh tokenizer for each test instead of loading from saved."""
+        kwargs.update(cls.special_tokens_map)
 
+        # Create a temporary directory for this tokenizer
+        tmpdir = tempfile.mkdtemp()
         vocab = ["▁This", "▁is", "▁a", "▁t", "est"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        cls.special_tokens_map = {"unk_token": ""}
 
-        cls.monolingual_vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["monolingual_vocab_file"])
-        with open(cls.monolingual_vocab_file, "w", encoding="utf-8") as fp:
+        monolingual_vocab_file = os.path.join(tmpdir, VOCAB_FILES_NAMES["monolingual_vocab_file"])
+        with open(monolingual_vocab_file, "w", encoding="utf-8") as fp:
             fp.writelines(f"{token} {vocab_tokens[token]}\n" for token in vocab_tokens)
 
-        tokenizer = BartphoTokenizer(SAMPLE_VOCAB, cls.monolingual_vocab_file, **cls.special_tokens_map)
-        tokenizer.save_pretrained(cls.tmpdirname)
-
-    @classmethod
-    def get_tokenizer(cls, pretrained_name=None, **kwargs):
-        kwargs.update(cls.special_tokens_map)
-        pretrained_name = pretrained_name or cls.tmpdirname
-        return BartphoTokenizer.from_pretrained(pretrained_name, **kwargs)
+        return BartphoTokenizer(SAMPLE_VOCAB, monolingual_vocab_file, **kwargs)
 
     def get_input_output_texts(self, tokenizer):
         input_text = "This is a là test"
@@ -57,12 +58,21 @@ def get_input_output_texts(self, tokenizer):
         return input_text, output_text
 
     def test_full_tokenizer(self):
-        tokenizer = BartphoTokenizer(SAMPLE_VOCAB, self.monolingual_vocab_file, **self.special_tokens_map)
-        text = "This is a là test"
-        bpe_tokens = "▁This ▁is ▁a ▁l à ▁t est".split()
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, bpe_tokens)
-
-        input_tokens = tokens + [tokenizer.unk_token]
-        input_bpe_tokens = [4, 5, 6, 3, 3, 7, 8, 3]
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+        vocab = ["▁This", "▁is", "▁a", "▁t", "est"]
+        vocab_tokens = dict(zip(vocab, range(len(vocab))))
+        special_tokens_map = {"unk_token": ""}
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            monolingual_vocab_file = os.path.join(tmpdir, VOCAB_FILES_NAMES["monolingual_vocab_file"])
+            with open(monolingual_vocab_file, "w", encoding="utf-8") as fp:
+                fp.writelines(f"{token} {vocab_tokens[token]}\n" for token in vocab_tokens)
+            tokenizer = BartphoTokenizer(SAMPLE_VOCAB, monolingual_vocab_file, **special_tokens_map)
+
+            text = "This is a là test"
+            bpe_tokens = "▁This ▁is ▁a ▁l à ▁t est".split()
+            tokens = tokenizer.tokenize(text)
+            self.assertListEqual(tokens, bpe_tokens)
+
+            input_tokens = tokens + [tokenizer.unk_token]
+            input_bpe_tokens = [4, 5, 6, 3, 3, 7, 8, 3]
+            self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
diff --git a/tests/models/bert/test_tokenization_bert.py b/tests/models/bert/test_tokenization_bert.py
index b9d46dea3a55..e7bb3188a1af 100644
--- a/tests/models/bert/test_tokenization_bert.py
+++ b/tests/models/bert/test_tokenization_bert.py
@@ -13,331 +13,23 @@
 # limitations under the License.
 
 
-import os
 import unittest
 
-from transformers import BertTokenizerFast
 from transformers.models.bert.tokenization_bert import (
-    VOCAB_FILES_NAMES,
-    BasicTokenizer,
     BertTokenizer,
-    WordpieceTokenizer,
-    _is_control,
-    _is_punctuation,
-    _is_whitespace,
 )
-from transformers.testing_utils import require_tokenizers, slow
+from transformers.testing_utils import require_read_token, require_tokenizers
 
-from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english
+from ...test_tokenization_common import TokenizerTesterMixin
 
 
+@require_read_token
 @require_tokenizers
 class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "google-bert/bert-base-uncased"
+    from_pretrained_id = ["google-bert/bert-base-uncased"]
     tokenizer_class = BertTokenizer
-    rust_tokenizer_class = BertTokenizerFast
-    test_rust_tokenizer = True
-    space_between_special_tokens = True
-    from_pretrained_filter = filter_non_english
 
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-
-        vocab_tokens = [
-            "[UNK]",
-            "[CLS]",
-            "[SEP]",
-            "[PAD]",
-            "[MASK]",
-            "want",
-            "##want",
-            "##ed",
-            "wa",
-            "un",
-            "runn",
-            "##ing",
-            ",",
-            "low",
-            "lowest",
-        ]
-        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-
-    def get_input_output_texts(self, tokenizer):
-        input_text = "UNwant\u00e9d,running"
-        output_text = "unwanted, running"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = self.tokenizer_class(self.vocab_file)
-
-        tokens = tokenizer.tokenize("UNwant\u00e9d,running")
-        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11])
-
-    def test_rust_and_python_full_tokenizers(self):
-        if not self.test_rust_tokenizer:
-            self.skipTest(reason="test_rust_tokenizer is set to False")
-
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer()
-
-        sequence = "UNwant\u00e9d,running"
-
-        tokens = tokenizer.tokenize(sequence)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        rust_tokenizer = self.get_rust_tokenizer()
-        ids = tokenizer.encode(sequence)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-        # With lower casing
-        tokenizer = self.get_tokenizer(do_lower_case=True)
-        rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True)
-
-        sequence = "UNwant\u00e9d,running"
-
-        tokens = tokenizer.tokenize(sequence)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        rust_tokenizer = self.get_rust_tokenizer()
-        ids = tokenizer.encode(sequence)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-    def test_chinese(self):
-        tokenizer = BasicTokenizer()
-
-        self.assertListEqual(tokenizer.tokenize("ah\u535a\u63a8zz"), ["ah", "\u535a", "\u63a8", "zz"])
-
-    def test_basic_tokenizer_lower(self):
-        tokenizer = BasicTokenizer(do_lower_case=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["hello", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"])
-
-    def test_basic_tokenizer_lower_strip_accents_false(self):
-        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hällo", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["h\u00e9llo"])
-
-    def test_basic_tokenizer_lower_strip_accents_true(self):
-        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"])
-
-    def test_basic_tokenizer_lower_strip_accents_default(self):
-        tokenizer = BasicTokenizer(do_lower_case=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"])
-
-    def test_basic_tokenizer_no_lower(self):
-        tokenizer = BasicTokenizer(do_lower_case=False)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
-        )
-
-    def test_basic_tokenizer_no_lower_strip_accents_false(self):
-        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["HäLLo", "!", "how", "Are", "yoU", "?"]
-        )
-
-    def test_basic_tokenizer_no_lower_strip_accents_true(self):
-        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["HaLLo", "!", "how", "Are", "yoU", "?"]
-        )
-
-    def test_basic_tokenizer_respects_never_split_tokens(self):
-        tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
-        )
-
-    def test_basic_tokenizer_splits_on_punctuation(self):
-        tokenizer = BasicTokenizer()
-        text = "a\n'll !!to?'d of, can't."
-        expected = ["a", "'", "ll", "!", "!", "to", "?", "'", "d", "of", ",", "can", "'", "t", "."]
-        self.assertListEqual(tokenizer.tokenize(text), expected)
-
-    def test_wordpiece_tokenizer(self):
-        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
-
-        vocab = {}
-        for i, token in enumerate(vocab_tokens):
-            vocab[token] = i
-        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
-
-        self.assertListEqual(tokenizer.tokenize(""), [])
-
-        self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"])
-
-        self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
-
-    def test_is_whitespace(self):
-        self.assertTrue(_is_whitespace(" "))
-        self.assertTrue(_is_whitespace("\t"))
-        self.assertTrue(_is_whitespace("\r"))
-        self.assertTrue(_is_whitespace("\n"))
-        self.assertTrue(_is_whitespace("\u00a0"))
-
-        self.assertFalse(_is_whitespace("A"))
-        self.assertFalse(_is_whitespace("-"))
-
-    def test_is_control(self):
-        self.assertTrue(_is_control("\u0005"))
-
-        self.assertFalse(_is_control("A"))
-        self.assertFalse(_is_control(" "))
-        self.assertFalse(_is_control("\t"))
-        self.assertFalse(_is_control("\r"))
-
-    def test_is_punctuation(self):
-        self.assertTrue(_is_punctuation("-"))
-        self.assertTrue(_is_punctuation("$"))
-        self.assertTrue(_is_punctuation("`"))
-        self.assertTrue(_is_punctuation("."))
-
-        self.assertFalse(_is_punctuation("A"))
-        self.assertFalse(_is_punctuation(" "))
-
-    def test_clean_text(self):
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer()
-
-        # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340
-        self.assertListEqual([tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]])
-
-        self.assertListEqual(
-            [rust_tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]
-        )
-
-    @slow
-    def test_sequence_builders(self):
-        tokenizer = self.tokenizer_class.from_pretrained("google-bert/bert-base-uncased")
-
-        text = tokenizer.encode("sequence builders", add_special_tokens=False)
-        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
-
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
-        assert encoded_sentence == [101] + text + [102]
-        assert encoded_pair == [101] + text + [102] + text_2 + [102]
-
-    def test_offsets_with_special_characters(self):
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
-
-                sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
-                tokens = tokenizer_r.encode_plus(
-                    sentence,
-                    return_attention_mask=False,
-                    return_token_type_ids=False,
-                    return_offsets_mapping=True,
-                    add_special_tokens=True,
-                )
-
-                do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False
-                expected_results = (
-                    [
-                        ((0, 0), tokenizer_r.cls_token),
-                        ((0, 1), "A"),
-                        ((1, 2), ","),
-                        ((3, 5), "na"),
-                        ((5, 6), "##ï"),
-                        ((6, 8), "##ve"),
-                        ((9, 15), tokenizer_r.mask_token),
-                        ((16, 21), "Allen"),
-                        ((21, 23), "##NL"),
-                        ((23, 24), "##P"),
-                        ((25, 33), "sentence"),
-                        ((33, 34), "."),
-                        ((0, 0), tokenizer_r.sep_token),
-                    ]
-                    if not do_lower_case
-                    else [
-                        ((0, 0), tokenizer_r.cls_token),
-                        ((0, 1), "a"),
-                        ((1, 2), ","),
-                        ((3, 8), "naive"),
-                        ((9, 15), tokenizer_r.mask_token),
-                        ((16, 21), "allen"),
-                        ((21, 23), "##nl"),
-                        ((23, 24), "##p"),
-                        ((25, 33), "sentence"),
-                        ((33, 34), "."),
-                        ((0, 0), tokenizer_r.sep_token),
-                    ]
-                )
-
-                self.assertEqual(
-                    [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"])
-                )
-                self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])
-
-    def test_change_tokenize_chinese_chars(self):
-        list_of_common_chinese_char = ["的", "人", "有"]
-        text_with_chinese_char = "".join(list_of_common_chinese_char)
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                kwargs["tokenize_chinese_chars"] = True
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
-
-                ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
-                ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
-
-                tokens_without_spe_char_r = tokenizer_r.convert_ids_to_tokens(ids_without_spe_char_r)
-                tokens_without_spe_char_p = tokenizer_p.convert_ids_to_tokens(ids_without_spe_char_p)
-
-                # it is expected that each Chinese character is not preceded by "##"
-                self.assertListEqual(tokens_without_spe_char_p, list_of_common_chinese_char)
-                self.assertListEqual(tokens_without_spe_char_r, list_of_common_chinese_char)
-
-                kwargs["tokenize_chinese_chars"] = False
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-
-                ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
-                ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
-
-                tokens_without_spe_char_r = tokenizer_r.convert_ids_to_tokens(ids_without_spe_char_r)
-                tokens_without_spe_char_p = tokenizer_p.convert_ids_to_tokens(ids_without_spe_char_p)
-
-                # it is expected that only the first Chinese character is not preceded by "##".
-                expected_tokens = [
-                    f"##{token}" if idx != 0 else token for idx, token in enumerate(list_of_common_chinese_char)
-                ]
-                self.assertListEqual(tokens_without_spe_char_p, expected_tokens)
-                self.assertListEqual(tokens_without_spe_char_r, expected_tokens)
+    integration_expected_tokens = ['[UNK]', 'is', 'a', 'test', '[UNK]', '[UNK]', 'was', 'born', 'in', '92', '##00', '##0', ',', 'and', 'this', 'is', '[UNK]', '.', '生', '[UNK]', '的', '真', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '<', 's', '>', 'hi', '<', 's', '>', 'there', '[UNK]', 'following', 'string', 'should', 'be', 'properly', 'encoded', ':', '[UNK]', '.', '[UNK]', 'ir', '##d', 'and', '[UNK]', 'ir', '##d', '[UNK]', '[UNK]', 'how', 'are', 'you', 'doing']  # fmt: skip
+    integration_expected_token_ids = [100, 2003, 1037, 3231, 100, 100, 2001, 2141, 1999, 6227, 8889, 2692, 1010, 1998, 2023, 2003, 100, 1012, 1910, 100, 1916, 1921, 100, 100, 100, 100, 100, 100, 100, 1026, 1055, 1028, 7632, 1026, 1055, 1028, 2045, 100, 2206, 5164, 2323, 2022, 7919, 12359, 1024, 100, 1012, 100, 20868, 2094, 1998, 100, 20868, 2094, 100, 100, 2129, 2024, 2017, 2725]  # fmt: skip
+    expected_tokens_from_ids = ['[UNK]', 'is', 'a', 'test', '[UNK]', '[UNK]', 'was', 'born', 'in', '92', '##00', '##0', ',', 'and', 'this', 'is', '[UNK]', '.', '生', '[UNK]', '的', '真', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '<', 's', '>', 'hi', '<', 's', '>', 'there', '[UNK]', 'following', 'string', 'should', 'be', 'properly', 'encoded', ':', '[UNK]', '.', '[UNK]', 'ir', '##d', 'and', '[UNK]', 'ir', '##d', '[UNK]', '[UNK]', 'how', 'are', 'you', 'doing']  # fmt: skip
+    integration_expected_decoded_text = "[UNK] is a test [UNK] [UNK] was born in 92000, and this is [UNK]. 生 [UNK] 的 真 [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] < s > hi < s > there [UNK] following string should be properly encoded : [UNK]. [UNK] ird and [UNK] ird [UNK] [UNK] how are you doing"
diff --git a/tests/models/bert_generation/test_tokenization_bert_generation.py b/tests/models/bert_generation/test_tokenization_bert_generation.py
index c23d8a2cc38e..e1e92b4dc153 100644
--- a/tests/models/bert_generation/test_tokenization_bert_generation.py
+++ b/tests/models/bert_generation/test_tokenization_bert_generation.py
@@ -45,7 +45,7 @@ def test_convert_token_and_id(self):
         token = ""
         token_id = 1
 
-        self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
+        self.assertEqual(self.get_tokenizer()._convert_token_to_id_with_added_voc(token), token_id)
         self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
 
     def test_get_vocab(self):
@@ -142,7 +142,7 @@ def test_tokenization_base_easy_symbols(self):
 
         self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols))
 
-    @slow
+    #   @slow
     def test_tokenization_base_hard_symbols(self):
         symbols = (
             'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
@@ -186,9 +186,7 @@ def test_tokenization_base_hard_symbols(self):
             427,
             916,
             508,
-            405,
-            34324,
-            497,
+            2253,
             391,
             408,
             11342,
@@ -218,8 +216,8 @@ def test_torch_encode_plus_sent_to_model(self):
         # Build sequence
         first_ten_tokens = list(self.big_tokenizer.get_vocab().keys())[:10]
         sequence = " ".join(first_ten_tokens)
-        encoded_sequence = self.big_tokenizer.encode_plus(sequence, return_tensors="pt", return_token_type_ids=False)
-        batch_encoded_sequence = self.big_tokenizer.batch_encode_plus(
+        encoded_sequence = self.big_tokenizer(sequence, return_tensors="pt", return_token_type_ids=False)
+        batch_encoded_sequence = self.big_tokenizer(
             [sequence + " " + sequence], return_tensors="pt", return_token_type_ids=False
         )
 
diff --git a/tests/models/bert_japanese/test_tokenization_bert_japanese.py b/tests/models/bert_japanese/test_tokenization_bert_japanese.py
index 66cc3f86afb5..cfbb3799e742 100644
--- a/tests/models/bert_japanese/test_tokenization_bert_japanese.py
+++ b/tests/models/bert_japanese/test_tokenization_bert_japanese.py
@@ -14,6 +14,8 @@
 
 
 import os
+import shutil
+import tempfile
 import unittest
 
 from transformers import AutoTokenizer
@@ -43,6 +45,10 @@ class BertJapaneseTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def setUpClass(cls):
         super().setUpClass()
 
+        # Create a separate temp directory for the vocab file to avoid conflicts
+        # with files saved by the base class setUpClass (e.g., tokenizer_config.json, added_tokens.json)
+        cls.vocab_tmpdirname = tempfile.mkdtemp()
+
         vocab_tokens = [
             "[UNK]",
             "[CLS]",
@@ -71,10 +77,22 @@ def setUpClass(cls):
             "です",
         ]
 
-        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.vocab_file = os.path.join(cls.vocab_tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
         with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
 
+    @classmethod
+    def get_tokenizer(cls, pretrained_name=None, **kwargs):
+        """Override to use vocab_tmpdirname instead of tmpdirname to avoid conflicts with saved tokenizer files."""
+        pretrained_name = pretrained_name or cls.vocab_tmpdirname
+        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+        if hasattr(cls, "vocab_tmpdirname"):
+            shutil.rmtree(cls.vocab_tmpdirname, ignore_errors=True)
+
     def get_input_output_texts(self, tokenizer):
         input_text = "こんにちは、世界。 \nこんばんは、世界。"
         output_text = "こんにちは 、 世界 。 こんばんは 、 世界 。"
@@ -349,15 +367,28 @@ class BertJapaneseCharacterTokenizationTest(TokenizerTesterMixin, unittest.TestC
     def setUpClass(cls):
         super().setUpClass()
 
+        # Create a separate temp directory for the vocab file to avoid conflicts
+        # with files saved by the base class setUpClass (e.g., tokenizer_config.json, added_tokens.json)
+        cls.vocab_tmpdirname = tempfile.mkdtemp()
+
         vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "こ", "ん", "に", "ち", "は", "ば", "世", "界", "、", "。"]
 
-        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        cls.vocab_file = os.path.join(cls.vocab_tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
         with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
 
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+        if hasattr(cls, "vocab_tmpdirname"):
+            shutil.rmtree(cls.vocab_tmpdirname, ignore_errors=True)
+
+    @classmethod
     @classmethod
     def get_tokenizer(cls, pretrained_name=None, **kwargs):
-        return BertJapaneseTokenizer.from_pretrained(cls.tmpdirname, subword_tokenizer_type="character", **kwargs)
+        """Override to use vocab_tmpdirname instead of tmpdirname to avoid conflicts with saved tokenizer files."""
+        pretrained_name = pretrained_name or cls.vocab_tmpdirname
+        return BertJapaneseTokenizer.from_pretrained(pretrained_name, subword_tokenizer_type="character", **kwargs)
 
     def get_input_output_texts(self, tokenizer):
         input_text = "こんにちは、世界。 \nこんばんは、世界。"
diff --git a/tests/models/big_bird/test_tokenization_big_bird.py b/tests/models/big_bird/test_tokenization_big_bird.py
index 35837c44b876..a12dc18fb033 100644
--- a/tests/models/big_bird/test_tokenization_big_bird.py
+++ b/tests/models/big_bird/test_tokenization_big_bird.py
@@ -13,10 +13,9 @@
 # limitations under the License.
 
 import unittest
-from functools import cached_property
 
-from transformers import BigBirdTokenizer, BigBirdTokenizerFast
-from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, require_torch, slow
+from transformers import BigBirdTokenizer
+from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers
 
 from ...test_tokenization_common import TokenizerTesterMixin
 
@@ -31,187 +30,8 @@
 class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_id = "google/bigbird-roberta-base"
     tokenizer_class = BigBirdTokenizer
-    rust_tokenizer_class = BigBirdTokenizerFast
-    test_rust_tokenizer = True
-    test_sentencepiece = True
 
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-
-        tokenizer = cls.tokenizer_class(SAMPLE_VOCAB, keep_accents=True)
-        tokenizer.save_pretrained(cls.tmpdirname)
-
-    def test_convert_token_and_id(self):
-        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
-        token = ""
-        token_id = 1
-
-        self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
-        self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
-
-    def test_get_vocab(self):
-        vocab_keys = list(self.get_tokenizer().get_vocab().keys())
-
-        self.assertEqual(vocab_keys[0], "")
-        self.assertEqual(vocab_keys[1], "")
-        self.assertEqual(vocab_keys[-1], "[MASK]")
-        self.assertEqual(len(vocab_keys), 1_004)
-
-    def test_vocab_size(self):
-        self.assertEqual(self.get_tokenizer().vocab_size, 1_000)
-
-    def test_rust_and_python_full_tokenizers(self):
-        if not self.test_rust_tokenizer:
-            self.skipTest(reason="test_rust_tokenizer is set to False")
-
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer()
-
-        sequence = "I was born in 92000, and this is falsé."
-
-        tokens = tokenizer.tokenize(sequence)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        rust_tokenizer = self.get_rust_tokenizer()
-        ids = tokenizer.encode(sequence)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-    def test_full_tokenizer(self):
-        tokenizer = BigBirdTokenizer(SAMPLE_VOCAB, keep_accents=True)
-
-        tokens = tokenizer.tokenize("This is a test")
-        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
-
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(tokens),
-            [285, 46, 10, 170, 382],
-        )
-
-        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
-        self.assertListEqual(
-            tokens,
-            [
-                SPIECE_UNDERLINE + "I",
-                SPIECE_UNDERLINE + "was",
-                SPIECE_UNDERLINE + "b",
-                "or",
-                "n",
-                SPIECE_UNDERLINE + "in",
-                SPIECE_UNDERLINE + "",
-                "9",
-                "2",
-                "0",
-                "0",
-                "0",
-                ",",
-                SPIECE_UNDERLINE + "and",
-                SPIECE_UNDERLINE + "this",
-                SPIECE_UNDERLINE + "is",
-                SPIECE_UNDERLINE + "f",
-                "al",
-                "s",
-                "é",
-                ".",
-            ],
-        )
-        ids = tokenizer.convert_tokens_to_ids(tokens)
-        self.assertListEqual(
-            ids,
-            [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4],
-        )
-
-        back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(
-            back_tokens,
-            [
-                SPIECE_UNDERLINE + "I",
-                SPIECE_UNDERLINE + "was",
-                SPIECE_UNDERLINE + "b",
-                "or",
-                "n",
-                SPIECE_UNDERLINE + "in",
-                SPIECE_UNDERLINE + "",
-                "",
-                "2",
-                "0",
-                "0",
-                "0",
-                ",",
-                SPIECE_UNDERLINE + "and",
-                SPIECE_UNDERLINE + "this",
-                SPIECE_UNDERLINE + "is",
-                SPIECE_UNDERLINE + "f",
-                "al",
-                "s",
-                "",
-                ".",
-            ],
-        )
-
-    @cached_property
-    def big_tokenizer(self):
-        return BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")
-
-    @slow
-    def test_tokenization_base_easy_symbols(self):
-        symbols = "Hello World!"
-        original_tokenizer_encodings = [65, 18536, 2260, 101, 66]
-
-        self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols))
-
-    @slow
-    def test_tokenization_base_hard_symbols(self):
-        symbols = (
-            'This is a very long text with a lot of weird characters, such as: . , ~ ? ( ) " [ ] ! : - . Also we will'
-            " add words that should not exist and be tokenized to , such as saoneuhaoesuth"
-        )
-        original_tokenizer_encodings = [65, 871, 419, 358, 946, 991, 2521, 452, 358, 1357, 387, 7751, 3536, 112, 985, 456, 126, 865, 938, 5400, 5734, 458, 1368, 467, 786, 2462, 5246, 1159, 633, 865, 4519, 457, 582, 852, 2557, 427, 916, 508, 405, 34324, 497, 391, 408, 11342, 1244, 385, 100, 938, 985, 456, 574, 362, 12597, 3200, 3129, 1172, 66]  # fmt: skip
-        self.assertListEqual(original_tokenizer_encodings, self.big_tokenizer.encode(symbols))
-
-    @require_torch
-    @slow
-    def test_torch_encode_plus_sent_to_model(self):
-        import torch
-
-        from transformers import BigBirdConfig, BigBirdModel
-
-        # Build sequence
-        first_ten_tokens = list(self.big_tokenizer.get_vocab().keys())[:10]
-        sequence = " ".join(first_ten_tokens)
-        encoded_sequence = self.big_tokenizer.encode_plus(sequence, return_tensors="pt", return_token_type_ids=False)
-        batch_encoded_sequence = self.big_tokenizer.batch_encode_plus(
-            [sequence + " " + sequence], return_tensors="pt", return_token_type_ids=False
-        )
-
-        config = BigBirdConfig(attention_type="original_full")
-        model = BigBirdModel(config)
-
-        assert model.get_input_embeddings().weight.shape[0] >= self.big_tokenizer.vocab_size
-
-        with torch.no_grad():
-            model(**encoded_sequence)
-            model(**batch_encoded_sequence)
-
-    @slow
-    def test_special_tokens(self):
-        tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")
-        decoded_text = tokenizer.decode(tokenizer("Paris is the [MASK].").input_ids)
-
-        self.assertTrue(decoded_text == "[CLS] Paris is the[MASK].[SEP]")
-
-    @slow
-    def test_tokenizer_integration(self):
-        expected_encoding = {'input_ids': [[65, 39286, 458, 36335, 2001, 456, 13073, 13266, 455, 113, 7746, 1741, 11157, 391, 13073, 13266, 455, 113, 3967, 35412, 113, 4936, 109, 3870, 2377, 113, 30084, 45720, 458, 134, 17496, 112, 503, 11672, 113, 118, 112, 5665, 13347, 38687, 112, 1496, 31389, 112, 3268, 47264, 134, 962, 112, 16377, 8035, 23130, 430, 12169, 15518, 28592, 458, 146, 41697, 109, 391, 12169, 15518, 16689, 458, 146, 41358, 109, 452, 726, 4034, 111, 763, 35412, 5082, 388, 1903, 111, 9051, 391, 2870, 48918, 1900, 1123, 550, 998, 112, 9586, 15985, 455, 391, 410, 22955, 37636, 114, 66], [65, 448, 17496, 419, 3663, 385, 763, 113, 27533, 2870, 3283, 13043, 1639, 24713, 523, 656, 24013, 18550, 2521, 517, 27014, 21244, 420, 1212, 1465, 391, 927, 4833, 388, 578, 11786, 114, 66, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [65, 484, 2169, 7687, 21932, 18146, 726, 363, 17032, 3391, 114, 66, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # fmt: skip
-
-        self.tokenizer_integration_test_util(
-            expected_encoding=expected_encoding,
-            model_name="google/bigbird-roberta-base",
-            revision="215c99f1600e06f83acce68422f2035b2b5c3510",
-        )
+    integration_expected_tokens = ['▁This', '▁is', '▁a', '▁test', '▁', '😊\n', 'I', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fals', 'é', '.', '\n生活的真谛是\n', 'Hi', '▁Hello', '\n', 'Hi', '▁Hello', '\n\n', '▁', '\n', '▁', '\n', '▁Hello', '\n', '', '▁', '\n', 'hi', '', '▁there', '\n', 'The', '▁following', '▁string', '▁should', '▁be', '▁properly', '▁encoded', ':', '▁Hello', '.', '\n', 'But', '▁', 'ird', '▁and', '▁', 'ปี', '▁', 'ird', '▁', 'ด\n', 'Hey', '▁how', '▁are', '▁you', '▁doing']  # fmt: skip
+    integration_expected_token_ids = [871, 419, 358, 1433, 321, 100, 141, 474, 4743, 388, 961, 11125, 112, 391, 529, 419, 27908, 266, 114, 100, 17351, 18536, 100, 17351, 18536, 100, 321, 100, 321, 100, 18536, 100, 2, 321, 100, 5404, 2, 713, 100, 565, 1809, 4832, 916, 408, 6206, 30341, 126, 18536, 114, 100, 1638, 321, 1548, 391, 321, 100, 321, 1548, 321, 100, 10915, 804, 490, 446, 1905]  # fmt: skip
+    expected_tokens_from_ids = ['▁This', '▁is', '▁a', '▁test', '▁', '', 'I', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fals', 'é', '.', '', 'Hi', '▁Hello', '', 'Hi', '▁Hello', '', '▁', '', '▁', '', '▁Hello', '', '', '▁', '', 'hi', '', '▁there', '', 'The', '▁following', '▁string', '▁should', '▁be', '▁properly', '▁encoded', ':', '▁Hello', '.', '', 'But', '▁', 'ird', '▁and', '▁', '', '▁', 'ird', '▁', '', 'Hey', '▁how', '▁are', '▁you', '▁doing']  # fmt: skip
+    integration_expected_decoded_text = "This is a test I was born in 92000, and this is falsé.Hi HelloHi Hello   Hello hi thereThe following string should be properly encoded: Hello.But ird and  ird Hey how are you doing"
diff --git a/tests/models/biogpt/test_tokenization_biogpt.py b/tests/models/biogpt/test_tokenization_biogpt.py
index 9a219779827d..5540723f46b1 100644
--- a/tests/models/biogpt/test_tokenization_biogpt.py
+++ b/tests/models/biogpt/test_tokenization_biogpt.py
@@ -15,6 +15,8 @@
 
 import json
 import os
+import shutil
+import tempfile
 import unittest
 
 from transformers.models.biogpt.tokenization_biogpt import VOCAB_FILES_NAMES, BioGptTokenizer
@@ -34,7 +36,7 @@ def setUpClass(cls):
         super().setUpClass()
 
         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
-        vocab = [
+        cls.vocab = [
             "l",
             "o",
             "w",
@@ -57,15 +59,10 @@ def setUpClass(cls):
             "wider",
             "",
         ]
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["l o 123", "lo w 1456", "e r 1789", ""]
+        cls.merges = ["l o 123", "lo w 1456", "e r 1789", ""]
 
         cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
         cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(cls.vocab_file, "w") as fp:
-            fp.write(json.dumps(vocab_tokens))
-        with open(cls.merges_file, "w") as fp:
-            fp.write("\n".join(merges))
 
     def get_input_output_texts(self, tokenizer):
         input_text = "lower newer"
@@ -74,7 +71,43 @@ def get_input_output_texts(self, tokenizer):
 
     def test_full_tokenizer(self):
         """Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt"""
-        tokenizer = BioGptTokenizer(self.vocab_file, self.merges_file)
+
+        vocab = [
+            "l",
+            "o",
+            "w",
+            "e",
+            "r",
+            "s",
+            "t",
+            "i",
+            "d",
+            "n",
+            "w",
+            "r",
+            "t",
+            "lo",
+            "low",
+            "er",
+            "low",
+            "lowest",
+            "newer",
+            "wider",
+            "",
+        ]
+        vocab_tokens = dict(zip(vocab, range(len(vocab))))
+        merges = ["l o 123", "lo w 1456", "e r 1789", ""]
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            vocab_file = os.path.join(tmpdir, VOCAB_FILES_NAMES["vocab_file"])
+            merges_file = os.path.join(tmpdir, VOCAB_FILES_NAMES["merges_file"])
+            shutil.copy(self.vocab_file, vocab_file)
+            shutil.copy(self.merges_file, merges_file)
+            with open(vocab_file, "w") as fp:
+                fp.write(json.dumps(vocab_tokens))
+            with open(merges_file, "w") as fp:
+                fp.write("\n".join(merges))
+            tokenizer = BioGptTokenizer(vocab_file, merges_file)
 
         text = "lower"
         bpe_tokens = ["low", "er"]
diff --git a/tests/models/blenderbot/test_tokenization_blenderbot.py b/tests/models/blenderbot/test_tokenization_blenderbot.py
index da6741940c90..8f7c60f2bf2e 100644
--- a/tests/models/blenderbot/test_tokenization_blenderbot.py
+++ b/tests/models/blenderbot/test_tokenization_blenderbot.py
@@ -1,52 +1,43 @@
-#!/usr/bin/env python3
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tests for Blenderbot Tokenizers, including common tests for BlenderbotSmallTokenizer."""
-
 import unittest
-from functools import cached_property
-
-from transformers import BlenderbotTokenizer, BlenderbotTokenizerFast
-
-
-class Blenderbot3BTokenizerTests(unittest.TestCase):
-    @cached_property
-    def tokenizer_3b(self):
-        return BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B")
-
-    @cached_property
-    def rust_tokenizer_3b(self):
-        return BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B")
-
-    def test_encode_decode_cycle(self):
-        tok = self.tokenizer_3b
-        src_text = " I am a small frog."
-        encoded = tok([src_text], padding=False, truncation=False)["input_ids"]
-        decoded = tok.batch_decode(encoded, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        assert src_text == decoded
-
-    def test_encode_decode_cycle_rust_tokenizer(self):
-        tok = self.rust_tokenizer_3b
-        src_text = " I am a small frog."
-        encoded = tok([src_text], padding=False, truncation=False)["input_ids"]
-        decoded = tok.batch_decode(encoded, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        assert src_text == decoded
-
-    def test_3B_tokenization_same_as_parlai(self):
-        assert self.tokenizer_3b.add_prefix_space
-        assert self.tokenizer_3b([" Sam", "Sam"]).input_ids == [[5502, 2], [5502, 2]]
 
-    def test_3B_tokenization_same_as_parlai_rust_tokenizer(self):
-        assert self.rust_tokenizer_3b.add_prefix_space
-        assert self.rust_tokenizer_3b([" Sam", "Sam"]).input_ids == [[5502, 2], [5502, 2]]
+from transformers.models.blenderbot.tokenization_blenderbot import BlenderbotTokenizer
+from transformers.testing_utils import require_tokenizers
+
+from ...test_tokenization_common import TokenizerTesterMixin
+
+
+@require_tokenizers
+class BlenderbotTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = ["facebook/blenderbot-3B"]
+    tokenizer_class = BlenderbotTokenizer
+
+    integration_expected_tokens = ['ĠThis', 'Ġis', 'Ġa', 'Ġtest', 'ĠðŁĺ', 'Ĭ', 'Ċ', 'I', 'Ġwas', 'Ġborn', 'Ġin', 'Ġ9', '2', '000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġf', 'als', 'é', '.', 'Ċ', 'ç', 'Ķ', 'Ł', 'æ', '´', '»', 'ç', 'ļ', 'Ħ', 'ç', 'ľ', 'Ł', 'è', '°', 'Ľ', 'æ', 'ĺ', '¯', 'Ċ', 'H', 'i', 'Ġ', 'ĠHello', 'Ċ', 'H', 'i', 'Ġ', 'Ġ', 'ĠHello', 'Ċ', 'Ċ', 'Ġ', 'Ċ', 'Ġ', 'Ġ', 'Ċ', 'ĠHello', 'Ċ', '', 'Ġ', 'Ċ', 'hi', '', 'Ġthere', 'Ċ', 'The', 'Ġfollowing', 'Ġstring', 'Ġshould', 'Ġbe', 'Ġproperly', 'Ġenc', 'od', 'ed', ':', 'ĠHello', '.', 'Ċ', 'B', 'ut', 'Ġ', 'ird', 'Ġand', 'Ġ', 'à', '¸', 'Ľ', 'à', '¸', 'µ', 'Ġ', 'Ġ', 'Ġ', 'ird', 'Ġ', 'Ġ', 'Ġ', 'à', '¸', 'Ķ', 'Ċ', 'H', 'ey', 'Ġhow', 'Ġare', 'Ġyou', 'Ġdoing']  # fmt: skip
+    integration_expected_token_ids = [678, 315, 265, 1689, 3417, 240, 206, 48, 372, 3647, 302, 1207, 25, 1694, 19, 298, 381, 315, 284, 1095, 3952, 21, 206, 171, 250, 261, 170, 120, 127, 171, 256, 234, 171, 258, 261, 172, 116, 257, 170, 254, 115, 206, 47, 80, 228, 6950, 206, 47, 80, 228, 228, 6950, 206, 206, 228, 206, 228, 228, 206, 6950, 206, 1, 228, 206, 7417, 1, 505, 206, 2839, 3504, 7884, 636, 310, 3867, 2525, 621, 296, 33, 6950, 21, 206, 41, 329, 228, 1221, 298, 228, 164, 124, 257, 164, 124, 121, 228, 228, 228, 1221, 228, 228, 228, 164, 124, 250, 206, 47, 3110, 544, 366, 304, 929]  # fmt: skip
+    expected_tokens_from_ids = ['ĠThis', 'Ġis', 'Ġa', 'Ġtest', 'ĠðŁĺ', 'Ĭ', 'Ċ', 'I', 'Ġwas', 'Ġborn', 'Ġin', 'Ġ9', '2', '000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġf', 'als', 'é', '.', 'Ċ', 'ç', 'Ķ', 'Ł', 'æ', '´', '»', 'ç', 'ļ', 'Ħ', 'ç', 'ľ', 'Ł', 'è', '°', 'Ľ', 'æ', 'ĺ', '¯', 'Ċ', 'H', 'i', 'Ġ', 'ĠHello', 'Ċ', 'H', 'i', 'Ġ', 'Ġ', 'ĠHello', 'Ċ', 'Ċ', 'Ġ', 'Ċ', 'Ġ', 'Ġ', 'Ċ', 'ĠHello', 'Ċ', '', 'Ġ', 'Ċ', 'hi', '', 'Ġthere', 'Ċ', 'The', 'Ġfollowing', 'Ġstring', 'Ġshould', 'Ġbe', 'Ġproperly', 'Ġenc', 'od', 'ed', ':', 'ĠHello', '.', 'Ċ', 'B', 'ut', 'Ġ', 'ird', 'Ġand', 'Ġ', 'à', '¸', 'Ľ', 'à', '¸', 'µ', 'Ġ', 'Ġ', 'Ġ', 'ird', 'Ġ', 'Ġ', 'Ġ', 'à', '¸', 'Ķ', 'Ċ', 'H', 'ey', 'Ġhow', 'Ġare', 'Ġyou', 'Ġdoing']  # fmt: skip
+    integration_expected_decoded_text = " This is a test 😊\nI was born in 92000, and this is falsé.\n生活的真谛是\nHi  Hello\nHi   Hello\n\n \n  \n Hello\n \nhi there\nThe following string should be properly encoded: Hello.\nBut ird and ปี   ird   ด\nHey how are you doing"
+
+    def test_pretokenized_inputs(self, *args, **kwargs):
+        # It's very difficult to mix/test pretokenization with byte-level tokenizers
+        # The issue is that when you have a sequence with leading spaces, splitting it
+        # with .split() loses the leading spaces, so the tokenization results differ
+        pass
+
+    def test_tokenization_for_chat(self):
+        tok = self.get_tokenizer()
+        test_chats = [
+            [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
+            [
+                {"role": "system", "content": "You are a helpful chatbot."},
+                {"role": "user", "content": "Hello!"},
+                {"role": "assistant", "content": "Nice to meet you."},
+            ],
+            [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}],
+        ]
+        tokenized_chats = [tok.apply_chat_template(test_chat) for test_chat in test_chats]
+        expected_tokens = [
+            [553, 366, 265, 4792, 3879, 73, 311, 21, 228, 228, 6950, 8, 2],
+            [553, 366, 265, 4792, 3879, 73, 311, 21, 228, 228, 6950, 8, 228, 3490, 287, 2273, 304, 21, 2],
+            [3490, 287, 2273, 304, 21, 228, 228, 6950, 8, 2],
+        ]
+        for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
+            self.assertListEqual(tokenized_chat, expected_tokens)
diff --git a/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py b/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py
index 5bd8622fa5a0..e596d6e90565 100644
--- a/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py
+++ b/tests/models/blenderbot_small/test_tokenization_blenderbot_small.py
@@ -16,6 +16,8 @@
 
 import json
 import os
+import shutil
+import tempfile
 import unittest
 
 from transformers.models.blenderbot_small.tokenization_blenderbot_small import (
@@ -31,45 +33,35 @@ class BlenderbotSmallTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = BlenderbotSmallTokenizer
     test_rust_tokenizer = False
 
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-
-        vocab = ["__start__", "adapt", "act", "ap@@", "te", "__end__", "__unk__"]
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-
-        merges = ["#version: 0.2", "a p", "t e", "ap t", "a d", "ad apt", "a c", "ac t", ""]
-        cls.special_tokens_map = {"unk_token": "__unk__", "bos_token": "__start__", "eos_token": "__end__"}
-
-        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(cls.merges_file, "w", encoding="utf-8") as fp:
-            fp.write("\n".join(merges))
-
-    @classmethod
-    def get_tokenizer(cls, pretrained_name=None, **kwargs):
-        kwargs.update(cls.special_tokens_map)
-        pretrained_name = pretrained_name or cls.tmpdirname
-        return BlenderbotSmallTokenizer.from_pretrained(pretrained_name, **kwargs)
-
-    def get_input_output_texts(self, tokenizer):
-        input_text = "adapt act apte"
-        output_text = "adapt act apte"
-        return input_text, output_text
-
     def test_full_blenderbot_small_tokenizer(self):
-        tokenizer = BlenderbotSmallTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
-        text = "adapt act apte"
-        bpe_tokens = ["adapt", "act", "ap@@", "te"]
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, bpe_tokens)
-
-        input_tokens = [tokenizer.bos_token] + tokens + [tokenizer.eos_token]
-
-        input_bpe_tokens = [0, 1, 2, 3, 4, 5]
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+        # Create temporary directory for vocab files
+        tmpdirname = tempfile.mkdtemp()
+        try:
+            vocab = ["__start__", "adapt", "act", "ap@@", "te", "__end__", "__unk__"]
+            vocab_tokens = dict(zip(vocab, range(len(vocab))))
+
+            merges = ["#version: 0.2", "a p", "t e", "ap t", "a d", "ad apt", "a c", "ac t", ""]
+            special_tokens_map = {"unk_token": "__unk__", "bos_token": "__start__", "eos_token": "__end__"}
+
+            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+            merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+            with open(vocab_file, "w", encoding="utf-8") as fp:
+                fp.write(json.dumps(vocab_tokens) + "\n")
+            with open(merges_file, "w", encoding="utf-8") as fp:
+                fp.write("\n".join(merges))
+
+            tokenizer = BlenderbotSmallTokenizer(vocab_file, merges_file, **special_tokens_map)
+            text = "adapt act apte"
+            bpe_tokens = ["adapt", "act", "ap@@", "te"]
+            tokens = tokenizer.tokenize(text)
+            self.assertListEqual(tokens, bpe_tokens)
+
+            input_tokens = [tokenizer.bos_token] + tokens + [tokenizer.eos_token]
+
+            input_bpe_tokens = [0, 1, 2, 3, 4, 5]
+            self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+        finally:
+            shutil.rmtree(tmpdirname)
 
     def test_special_tokens_small_tok(self):
         tok = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot-90M")
diff --git a/tests/models/bloom/test_modeling_bloom.py b/tests/models/bloom/test_modeling_bloom.py
index 7c05340cec81..ba6c73d4f1bd 100644
--- a/tests/models/bloom/test_modeling_bloom.py
+++ b/tests/models/bloom/test_modeling_bloom.py
@@ -27,9 +27,9 @@
     import torch
 
     from transformers import (
+        AutoTokenizer,
         BloomForCausalLM,
         BloomModel,
-        BloomTokenizerFast,
     )
 
 
@@ -520,7 +520,7 @@ def test_simple_generation(self):
         path_560m = "bigscience/bloom-560m"
         model = BloomForCausalLM.from_pretrained(path_560m, use_cache=True, revision="gs555750").to(torch_device)
         model = model.eval()
-        tokenizer = BloomTokenizerFast.from_pretrained(path_560m)
+        tokenizer = AutoTokenizer.from_pretrained(path_560m)
 
         input_sentence = "I enjoy walking with my cute dog"
         # This output has been obtained using fp32 model on the huggingface DGX workstation - NVIDIA A100 GPU
@@ -540,7 +540,7 @@ def test_batch_generation(self):
         path_560m = "bigscience/bloom-560m"
         model = BloomForCausalLM.from_pretrained(path_560m, use_cache=True, revision="gs555750").to(torch_device)
         model = model.eval()
-        tokenizer = BloomTokenizerFast.from_pretrained(path_560m, padding_side="left")
+        tokenizer = AutoTokenizer.from_pretrained(path_560m, padding_side="left")
 
         input_sentence = ["I enjoy walking with my cute dog", "I enjoy walking with my cute dog"]
 
@@ -560,7 +560,7 @@ def test_batch_generation_padding(self):
         path_560m = "bigscience/bloom-560m"
         model = BloomForCausalLM.from_pretrained(path_560m, use_cache=True, revision="gs555750").to(torch_device)
         model = model.eval()
-        tokenizer = BloomTokenizerFast.from_pretrained(path_560m, padding_side="left")
+        tokenizer = AutoTokenizer.from_pretrained(path_560m, padding_side="left")
 
         input_sentence = ["I enjoy walking with my cute dog", "Hello my name is"]
         input_sentence_without_pad = "Hello my name is"
@@ -590,7 +590,7 @@ def test_batch_generated_text(self):
 
         model = BloomForCausalLM.from_pretrained(path_560m, use_cache=True, revision="gs555750").to(torch_device)
         model = model.eval()
-        tokenizer = BloomTokenizerFast.from_pretrained(path_560m, padding_side="left")
+        tokenizer = AutoTokenizer.from_pretrained(path_560m, padding_side="left")
 
         input_sentences = [
             "Hello what is",
diff --git a/tests/models/bloom/test_tokenization_bloom.py b/tests/models/bloom/test_tokenization_bloom.py
index 4a4840dfd9f3..267d377ed8e5 100644
--- a/tests/models/bloom/test_tokenization_bloom.py
+++ b/tests/models/bloom/test_tokenization_bloom.py
@@ -12,13 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import copy
 import unittest
 
 from datasets import load_dataset
 
-from transformers import BloomTokenizerFast
-from transformers.testing_utils import require_tokenizers
+from transformers import TokenizersBackend
+from transformers.testing_utils import require_jinja, require_tokenizers, slow
 
 from ...test_tokenization_common import TokenizerTesterMixin
 
@@ -27,50 +26,43 @@
 class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_id = "bigscience/tokenizer"
     slow_tokenizer_class = None
-    rust_tokenizer_class = BloomTokenizerFast
-    tokenizer_class = BloomTokenizerFast
-    test_rust_tokenizer = True
+    rust_tokenizer_class = TokenizersBackend
+    tokenizer_class = TokenizersBackend
     test_slow_tokenizer = False
     from_pretrained_vocab_key = "tokenizer_file"
     special_tokens_map = {"bos_token": "", "eos_token": "", "unk_token": "", "pad_token": ""}
 
+    # Integration test data - expected outputs for the default input string
+    integration_expected_tokens = ['This', 'Ġis', 'Ġa', 'Ġtest', 'Ċ', 'I', 'Ġwas', 'Ġborn', 'Ġin', 'Ġ9', '2000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġfals', 'é', '.Ċ', 'çĶŁæ´»çļĦ', '羣', 'è°', 'Ľ', 'æĺ¯', 'Ċ', 'Hi', 'Ġ', 'ĠHello', 'Ċ', 'Hi', 'ĠĠ', 'ĠHello', 'ĊĊ', 'ĠĊ', 'ĠĠĊ', 'ĠHello', 'Ċ', '', 'Ċ', 'hi', '', 'there', 'Ċ', 'The', 'Ġfollowing', 'Ġstring', 'Ġshould', 'Ġbe', 'Ġproperly', 'Ġenc', 'od', 'ed:', 'ĠHello', '.Ċ', 'But', 'Ġir', 'd', 'Ġand', 'Ġà¸', 'Ľ', 'ี', 'ĠĠ', 'Ġir', 'd', 'ĠĠ', 'Ġà¸', 'Ķ', 'Ċ', 'Hey', 'Ġhow', 'Ġare', 'Ġyou', 'Ġdoing']  # fmt: skip
+    integration_expected_token_ids = [6168, 632, 267, 4006, 189, 44, 1620, 34181, 361, 1575, 14739, 15, 530, 1119, 632, 31684, 311, 336, 71167, 4137, 1927, 239, 644, 189, 30050, 210, 86153, 189, 30050, 250, 86153, 603, 5306, 33249, 86153, 189, 1, 189, 2807, 1, 51596, 189, 2175, 6747, 5148, 3403, 722, 34975, 2681, 532, 29315, 86153, 336, 6475, 2881, 71, 530, 44381, 239, 105442, 250, 2881, 71, 250, 44381, 232, 189, 40440, 4143, 1306, 1152, 12491]  # fmt: skip
+
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
-        tokenizer = BloomTokenizerFast.from_pretrained("bigscience/tokenizer")
-        tokenizer.save_pretrained(cls.tmpdirname)
-
-    @classmethod
-    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
-        _kwargs = copy.deepcopy(cls.special_tokens_map)
-        _kwargs.update(kwargs)
-        kwargs = _kwargs
-        pretrained_name = pretrained_name or cls.tmpdirname
-        return BloomTokenizerFast.from_pretrained(pretrained_name, **kwargs)
 
-    @unittest.skip(reason="This needs a slow tokenizer. Bloom does not have one!")
-    def test_encode_decode_with_spaces(self):
-        return
+        tokenizer = TokenizersBackend.from_pretrained("bigscience/tokenizer")
+        tokenizer.save_pretrained(cls.tmpdirname)
+        cls.tokenizers_list = [(cls.rust_tokenizer_class, cls.tmpdirname, {})]
 
     def test_encodings_from_sample_data(self):
         """
         Assert that the created tokens are the same than the hard-coded ones
         """
-        tokenizer = self.get_rust_tokenizer()
+        tokenizer = self.get_tokenizer()
 
         INPUT_SENTENCES = ["The quick brown fox", "jumps over the lazy dog"]
         TARGET_TOKENS = [[2175, 23714, 73173, 144252, 2], [77, 132619, 3478, 368, 109586, 35433, 2]]
 
-        computed_tokens = tokenizer.batch_encode_plus(INPUT_SENTENCES)["input_ids"]
+        computed_tokens = tokenizer(INPUT_SENTENCES)["input_ids"]
         self.assertListEqual(TARGET_TOKENS, computed_tokens)
 
-        decoded_tokens = tokenizer.batch_decode(computed_tokens)
+        decoded_tokens = tokenizer.decode(computed_tokens)
         self.assertListEqual(decoded_tokens, INPUT_SENTENCES)
 
     def test_padding(self, max_length=6):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_r = self.get_tokenizer(pretrained_name, **kwargs)
                 # tokenizer_r.pad_token = None # Hotfixing padding = None
                 # Simple input
                 s = "This is a simple input"
@@ -84,11 +76,11 @@ def test_padding(self, max_length=6):
                 # Simple input tests
                 try:
                     tokenizer_r.encode(s, max_length=max_length)
-                    tokenizer_r.encode_plus(s, max_length=max_length)
+                    tokenizer_r(s, max_length=max_length)
 
-                    tokenizer_r.batch_encode_plus(s2, max_length=max_length)
+                    tokenizer_r(s2, max_length=max_length)
                     tokenizer_r.encode(p, max_length=max_length)
-                    tokenizer_r.batch_encode_plus(p2, max_length=max_length)
+                    tokenizer_r(p2, max_length=max_length)
                 except ValueError:
                     self.fail("Bloom Tokenizer should be able to deal with padding")
 
@@ -96,12 +88,12 @@ def test_padding(self, max_length=6):
                 self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")
 
                 # Simple input
-                self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length")
+                self.assertRaises(ValueError, tokenizer_r, s, max_length=max_length, padding="max_length")
 
                 # Simple input
                 self.assertRaises(
                     ValueError,
-                    tokenizer_r.batch_encode_plus,
+                    tokenizer_r,
                     s2,
                     max_length=max_length,
                     padding="max_length",
@@ -111,12 +103,12 @@ def test_padding(self, max_length=6):
                 self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")
 
                 # Pair input
-                self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length")
+                self.assertRaises(ValueError, tokenizer_r, p, max_length=max_length, padding="max_length")
 
                 # Pair input
                 self.assertRaises(
                     ValueError,
-                    tokenizer_r.batch_encode_plus,
+                    tokenizer_r,
                     p2,
                     max_length=max_length,
                     padding="max_length",
@@ -127,7 +119,7 @@ def test_encodings_from_xnli_dataset(self):
         Tests the tokenizer downloaded from here:
             - https://huggingface.co/bigscience/tokenizer/
         """
-        tokenizer = self.get_rust_tokenizer()
+        tokenizer = self.get_tokenizer()
         ds = load_dataset("facebook/xnli", "all_languages", split="test", streaming=True)
 
         sample_data = next(iter(ds))["premise"]  # pick up one data
@@ -137,9 +129,35 @@ def test_encodings_from_xnli_dataset(self):
         predicted_text = [tokenizer.decode(x, clean_up_tokenization_spaces=False) for x in output_tokens]
         self.assertListEqual(predicted_text, input_text)
 
+    @require_jinja
+    def test_tokenization_for_chat(self):
+        tokenizer = self.get_tokenizer()
+        tokenizer.chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"
+        test_chats = [
+            [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
+            [
+                {"role": "system", "content": "You are a helpful chatbot."},
+                {"role": "user", "content": "Hello!"},
+                {"role": "assistant", "content": "Nice to meet you."},
+            ],
+            [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}],
+        ]
+        tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
+        expected_tokens = [
+            [5448, 1306, 267, 66799, 44799, 37143, 17, 2, 59414, 4, 2],
+            [5448, 1306, 267, 66799, 44799, 37143, 17, 2, 59414, 4, 2, 229126, 427, 11890, 1152, 17, 2],
+            [229126, 427, 11890, 1152, 17, 2, 59414, 4, 2],
+        ]
+        for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
+            self.assertListEqual(tokenized_chat, expected_tokens)
+
     def test_add_prefix_space_fast(self):
-        tokenizer_w_prefix = self.get_rust_tokenizer(add_prefix_space=True)
-        tokenizer_wo_prefix = self.get_rust_tokenizer(add_prefix_space=False)
+        tokenizer_w_prefix = self.get_tokenizer(add_prefix_space=True)
+        tokenizer_wo_prefix = self.get_tokenizer(add_prefix_space=False)
         tokens_w_prefix = tokenizer_w_prefix.tokenize("Hey")
         tokens_wo_prefix = tokenizer_wo_prefix.tokenize("Hey")
         self.assertNotEqual(tokens_w_prefix, tokens_wo_prefix)
+
+    @slow
+    def test_save_and_load_tokenizer(self):
+        return super().test_save_and_load_tokenizer()
diff --git a/tests/models/byt5/test_tokenization_byt5.py b/tests/models/byt5/test_tokenization_byt5.py
index baadfc67c2b8..be05c3db866f 100644
--- a/tests/models/byt5/test_tokenization_byt5.py
+++ b/tests/models/byt5/test_tokenization_byt5.py
@@ -12,21 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import json
-import os
 import re
 import shutil
 import tempfile
 import unittest
 from functools import cached_property
 
-from transformers import AddedToken, BatchEncoding, ByT5Tokenizer
+from transformers import BatchEncoding, ByT5Tokenizer
 
 from ...test_tokenization_common import TokenizerTesterMixin
 
 
 class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = ByT5Tokenizer
+    from_pretrained_id = "google/byt5-small"
     test_rust_tokenizer = False
 
     @classmethod
@@ -189,10 +188,10 @@ def test_save_and_load_tokenizer(self):
 
                 sample_text = " He is very happy, UNwant\u00e9d,running"
                 tokenizer.add_tokens(["bim", "bambam"])
-                additional_special_tokens = tokenizer.additional_special_tokens
-                additional_special_tokens.append("new_additional_special_token")
+                extra_special_tokens = tokenizer.extra_special_tokens
+                extra_special_tokens.append("new_extra_special_token")
                 tokenizer.add_special_tokens(
-                    {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False
+                    {"extra_special_tokens": extra_special_tokens}, replace_extra_special_tokens=False
                 )
                 before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
                 tokenizer.save_pretrained(tmpdirname)
@@ -200,7 +199,7 @@ def test_save_and_load_tokenizer(self):
                 after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
                 after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False)
                 self.assertListEqual(before_tokens, after_tokens)
-                self.assertIn("new_additional_special_token", after_tokenizer.additional_special_tokens)
+                self.assertIn("new_extra_special_token", after_tokenizer.extra_special_tokens)
                 self.assertEqual(after_tokenizer.model_max_length, 42)
 
                 tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43)
@@ -208,77 +207,8 @@ def test_save_and_load_tokenizer(self):
 
                 shutil.rmtree(tmpdirname)
 
-    # There is a conflict between the default value of extra_ids and adding a new special token through additional_special_tokens
-    # We need to add the extra_ids in the list of the arg additional_special_tokens
-    def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
-        tokenizer_list = []
-        if self.test_slow_tokenizer:
-            tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
-
-        if self.test_rust_tokenizer:
-            tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
-
-        for tokenizer_class, tokenizer_utils in tokenizer_list:
-            with tempfile.TemporaryDirectory() as tmp_dir:
-                tokenizer_utils.save_pretrained(tmp_dir)
-
-                with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
-                    special_tokens_map = json.load(json_file)
-
-                with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
-                    tokenizer_config = json.load(json_file)
-
-                added_tokens_extra_ids = [f"" for i in range(125)]
-
-                special_tokens_map["additional_special_tokens"] = added_tokens_extra_ids + [
-                    "an_additional_special_token"
-                ]
-                tokenizer_config["additional_special_tokens"] = added_tokens_extra_ids + [
-                    "an_additional_special_token"
-                ]
-
-                with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
-                    json.dump(special_tokens_map, outfile)
-                with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
-                    json.dump(tokenizer_config, outfile)
-
-                # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes
-                # into account the new value of additional_special_tokens given in the "tokenizer_config.json" and
-                # "special_tokens_map.json" files
-                tokenizer_without_change_in_init = tokenizer_class.from_pretrained(
-                    tmp_dir,
-                )
-                self.assertIn(
-                    "an_additional_special_token", tokenizer_without_change_in_init.additional_special_tokens
-                )
-                # self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab
-                self.assertEqual(
-                    ["an_additional_special_token"],
-                    tokenizer_without_change_in_init.convert_ids_to_tokens(
-                        tokenizer_without_change_in_init.convert_tokens_to_ids(["an_additional_special_token"])
-                    ),
-                )
-
-                # Now we test that we can change the value of additional_special_tokens in the from_pretrained
-                new_added_tokens = added_tokens_extra_ids + [AddedToken("a_new_additional_special_token", lstrip=True)]
-                tokenizer = tokenizer_class.from_pretrained(
-                    tmp_dir,
-                    additional_special_tokens=new_added_tokens,
-                )
-
-                self.assertIn("a_new_additional_special_token", tokenizer.additional_special_tokens)
-                self.assertEqual(
-                    ["a_new_additional_special_token"],
-                    tokenizer.convert_ids_to_tokens(
-                        tokenizer.convert_tokens_to_ids(["a_new_additional_special_token"])
-                    ),
-                )
-
     def test_decode_single_bytes(self):
         tokenizer_list = []
-        if self.test_slow_tokenizer:
-            tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
-
         if self.test_rust_tokenizer:
             tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
 
@@ -342,11 +272,3 @@ def test_tokenizers_common_ids_setters(self):
                     setattr(tokenizer, attr + "_id", token_id_to_test_setters)
                     self.assertEqual(getattr(tokenizer, attr), token_to_test_setters)
                     self.assertEqual(getattr(tokenizer, attr + "_id"), token_id_to_test_setters)
-
-                setattr(tokenizer, "additional_special_tokens_ids", [])
-                self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [])
-                self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [])
-
-                setattr(tokenizer, "additional_special_tokens_ids", [token_id_to_test_setters])
-                self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [token_to_test_setters])
-                self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [token_id_to_test_setters])
diff --git a/tests/models/camembert/test_tokenization_camembert.py b/tests/models/camembert/test_tokenization_camembert.py
index 704c757fc41d..84850c6d8208 100644
--- a/tests/models/camembert/test_tokenization_camembert.py
+++ b/tests/models/camembert/test_tokenization_camembert.py
@@ -1,217 +1,17 @@
-# Copyright 2018 HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import tempfile
 import unittest
-from tempfile import TemporaryDirectory
 
-from transformers import AddedToken, CamembertTokenizer, CamembertTokenizerFast
-from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
+from transformers.models.camembert.tokenization_camembert import CamembertTokenizer
+from transformers.testing_utils import require_tokenizers
 
 from ...test_tokenization_common import TokenizerTesterMixin
 
 
-SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
-SAMPLE_BPE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe.model")
-
-
-@require_sentencepiece
 @require_tokenizers
 class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "almanach/camembert-base"
+    from_pretrained_id = ["almanach/camembert-base"]
     tokenizer_class = CamembertTokenizer
-    rust_tokenizer_class = CamembertTokenizerFast
-    test_rust_tokenizer = True
-    test_sentencepiece = True
-
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-
-        # We have a SentencePiece fixture for testing
-        tokenizer = CamembertTokenizer(SAMPLE_VOCAB)
-        tokenizer.save_pretrained(cls.tmpdirname)
-
-    @unittest.skip(
-        "Token maps are not equal because someone set the probability of ('NOTUSED', -100), so it's never encoded for fast"
-    )
-    def test_special_tokens_map_equal(self):
-        return
-
-    def test_convert_token_and_id(self):
-        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
-        token = ""
-        token_id = 1  # 1 is the offset id, but in the spm vocab it's 3
-
-        self.assertEqual(self.get_tokenizer().convert_tokens_to_ids(token), token_id)
-        self.assertEqual(self.get_tokenizer().convert_ids_to_tokens(token_id), token)
-
-    def test_get_vocab(self):
-        vocab_keys = list(self.get_tokenizer().get_vocab().keys())
-
-        self.assertEqual(vocab_keys[0], "NOTUSED")
-        self.assertEqual(vocab_keys[1], "")
-        self.assertEqual(vocab_keys[-1], "")
-        self.assertEqual(len(vocab_keys), 1_005)
-
-    def test_vocab_size(self):
-        self.assertEqual(self.get_tokenizer().vocab_size, 1_000)
-
-    def test_rust_and_python_bpe_tokenizers(self):
-        tokenizer = CamembertTokenizer(SAMPLE_BPE_VOCAB)
-        with TemporaryDirectory() as tmpdirname:
-            tokenizer.save_pretrained(tmpdirname)
-            rust_tokenizer = CamembertTokenizerFast.from_pretrained(tmpdirname)
-
-        sequence = "I was born in 92000, and this is falsé."
-
-        ids = tokenizer.encode(sequence)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        #  tokens are not the same for `rust` than for `slow`.
-        # Because spm gives back raw token instead of `unk` in EncodeAsPieces
-        # tokens = tokenizer.tokenize(sequence)
-        tokens = tokenizer.convert_ids_to_tokens(ids)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-    def test_rust_and_python_full_tokenizers(self):
-        if not self.test_rust_tokenizer:
-            self.skipTest(reason="test_rust_tokenizer is set to False")
-
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer()
-
-        sequence = "I was born in 92000, and this is falsé."
-
-        tokens = tokenizer.tokenize(sequence)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        rust_tokenizer = self.get_rust_tokenizer()
-        ids = tokenizer.encode(sequence)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-    @slow
-    def test_tokenizer_integration(self):
-        expected_encoding = {'input_ids': [[5, 54, 7196, 297, 30, 23, 776, 18, 11, 3215, 3705, 8252, 22, 3164, 1181, 2116, 29, 16, 813, 25, 791, 3314, 20, 3446, 38, 27575, 120, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [5, 468, 17, 11, 9088, 20, 1517, 8, 22804, 18818, 10, 38, 629, 607, 607, 142, 19, 7196, 867, 56, 10326, 24, 2267, 20, 416, 5072, 15612, 233, 734, 7, 2399, 27, 16, 3015, 1649, 7, 24, 20, 4338, 2399, 27, 13, 3400, 14, 13, 6189, 8, 930, 9, 6]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}  # fmt: skip
-
-        # camembert is a french model. So we also use french texts.
-        sequences = [
-            "Le transformeur est un modèle d'apprentissage profond introduit en 2017, "
-            "utilisé principalement dans le domaine du traitement automatique des langues (TAL).",
-            "À l'instar des réseaux de neurones récurrents (RNN), les transformeurs sont conçus "
-            "pour gérer des données séquentielles, telles que le langage naturel, pour des tâches "
-            "telles que la traduction et la synthèse de texte.",
-        ]
-
-        self.tokenizer_integration_test_util(
-            expected_encoding=expected_encoding,
-            model_name="almanach/camembert-base",
-            revision="3a0641d9a1aeb7e848a74299e7e4c4bca216b4cf",
-            sequences=sequences,
-        )
-
-    # Overwritten because we have to use from slow (online pretrained is wrong, the tokenizer.json has a whole)
-    def test_added_tokens_serialization(self):
-        self.maxDiff = None
-
-        # Utility to test the added vocab
-        def _test_added_vocab_and_eos(expected, tokenizer_class, expected_eos, temp_dir):
-            tokenizer = tokenizer_class.from_pretrained(temp_dir)
-            self.assertTrue(str(expected_eos) not in tokenizer.additional_special_tokens)
-            self.assertIn(new_eos, tokenizer.added_tokens_decoder.values())
-            self.assertEqual(tokenizer.added_tokens_decoder[tokenizer.eos_token_id], new_eos)
-            self.assertTrue(all(item in tokenizer.added_tokens_decoder.items() for item in expected.items()))
-            return tokenizer
-
-        new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False, special=True)
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                # Load a slow tokenizer from the hub, init with the new token for fast to also include it
-                tokenizer = self.get_tokenizer(pretrained_name, eos_token=new_eos)
-                EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder
-                with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"):
-                    self.assertEqual(tokenizer._special_tokens_map["eos_token"], new_eos)
-                    self.assertIn(new_eos, list(tokenizer.added_tokens_decoder.values()))
-
-                with tempfile.TemporaryDirectory() as tmp_dir_2:
-                    tokenizer.save_pretrained(tmp_dir_2)
-                    with self.subTest(
-                        "Hub -> Slow -> Slow: Test saving this slow tokenizer and reloading it in the fast class"
-                    ):
-                        _test_added_vocab_and_eos(
-                            EXPECTED_ADDED_TOKENS_DECODER, self.tokenizer_class, new_eos, tmp_dir_2
-                        )
-
-                    if self.rust_tokenizer_class is not None:
-                        with self.subTest(
-                            "Hub -> Slow -> Fast: Test saving this slow tokenizer and reloading it in the fast class"
-                        ):
-                            tokenizer_fast = _test_added_vocab_and_eos(
-                                EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_2
-                            )
-                            with tempfile.TemporaryDirectory() as tmp_dir_3:
-                                tokenizer_fast.save_pretrained(tmp_dir_3)
-                                with self.subTest(
-                                    "Hub -> Slow -> Fast -> Fast: Test saving this fast tokenizer and reloading it in the fast class"
-                                ):
-                                    _test_added_vocab_and_eos(
-                                        EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_3
-                                    )
-
-                                with self.subTest(
-                                    "Hub -> Slow -> Fast -> Slow: Test saving this slow tokenizer and reloading it in the slow class"
-                                ):
-                                    _test_added_vocab_and_eos(
-                                        EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_3
-                                    )
-
-                with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"):
-                    if self.rust_tokenizer_class is not None:
-                        tokenizer_fast = self.get_rust_tokenizer(pretrained_name, eos_token=new_eos, from_slow=True)
-                        self.assertEqual(tokenizer_fast._special_tokens_map["eos_token"], new_eos)
-                        self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
-                        # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright
-                        with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):
-                            with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):
-                                self.assertTrue(
-                                    all(
-                                        item in tokenizer.added_tokens_decoder.items()
-                                        for item in EXPECTED_ADDED_TOKENS_DECODER.items()
-                                    )
-                                )
-
-                        EXPECTED_ADDED_TOKENS_DECODER = tokenizer_fast.added_tokens_decoder
-                        with tempfile.TemporaryDirectory() as tmp_dir_4:
-                            tokenizer_fast.save_pretrained(tmp_dir_4)
-                            with self.subTest("Hub -> Fast -> Fast: saving Fast1 locally and loading"):
-                                _test_added_vocab_and_eos(
-                                    EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_4
-                                )
 
-                            with self.subTest("Hub -> Fast -> Slow: saving Fast1 locally and loading"):
-                                _test_added_vocab_and_eos(
-                                    EXPECTED_ADDED_TOKENS_DECODER, self.tokenizer_class, new_eos, tmp_dir_4
-                                )
+    integration_expected_tokens = ['▁This', '▁is', '▁a', '▁test', '▁', '😊', '▁I', '▁was', '▁', 'born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fal', 'sé', '.', '▁', '生活的真谛是', '▁Hi', '▁Hello', '▁Hi', '▁Hello', '▁Hello', '', '▁hi', '', '▁the', 're', '▁The', '▁', 'follow', 'ing', '▁string', '▁s', 'h', 'ould', '▁be', '▁pro', 'per', 'ly', '▁en', 'code', 'd', ':', '▁Hello', '.', '▁But', '▁i', 'rd', '▁and', '▁', 'ปี', '▁i', 'rd', '▁', 'ด', '▁Hey', '▁h', 'ow', '▁are', '▁you', '▁do', 'ing']  # fmt: skip
+    integration_expected_token_ids = [17526, 2856, 33, 2006, 21, 3, 551, 15760, 21, 24900, 378, 419, 13233, 7, 1168, 9098, 2856, 19289, 5100, 9, 21, 3, 5108, 9774, 5108, 9774, 9774, 5, 7874, 5, 808, 346, 908, 21, 31189, 402, 20468, 52, 133, 19306, 2446, 909, 1399, 1107, 22, 14420, 204, 92, 9774, 9, 10503, 1723, 6682, 1168, 21, 3, 1723, 6682, 21, 3, 20128, 616, 3168, 9581, 4835, 7503, 402]  # fmt: skip
+    expected_tokens_from_ids = ['▁This', '▁is', '▁a', '▁test', '▁', '', '▁I', '▁was', '▁', 'born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fal', 'sé', '.', '▁', '', '▁Hi', '▁Hello', '▁Hi', '▁Hello', '▁Hello', '', '▁hi', '', '▁the', 're', '▁The', '▁', 'follow', 'ing', '▁string', '▁s', 'h', 'ould', '▁be', '▁pro', 'per', 'ly', '▁en', 'code', 'd', ':', '▁Hello', '.', '▁But', '▁i', 'rd', '▁and', '▁', '', '▁i', 'rd', '▁', '', '▁Hey', '▁h', 'ow', '▁are', '▁you', '▁do', 'ing']  # fmt: skip
+    integration_expected_decoded_text = "This is a test  I was born in 92000, and this is falsé.  Hi Hello Hi Hello Hello hi there The following string should be properly encoded: Hello. But ird and  ird  Hey how are you doing"
diff --git a/tests/models/canine/test_tokenization_canine.py b/tests/models/canine/test_tokenization_canine.py
index d9f7154a432d..1db007c08b2f 100644
--- a/tests/models/canine/test_tokenization_canine.py
+++ b/tests/models/canine/test_tokenization_canine.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import json
-import os
 import shutil
 import tempfile
 import unittest
@@ -21,7 +19,7 @@
 
 from transformers import BatchEncoding, CanineTokenizer
 from transformers.testing_utils import require_tokenizers, require_torch
-from transformers.tokenization_utils import AddedToken
+from transformers.tokenization_python import AddedToken
 
 from ...test_tokenization_common import TokenizerTesterMixin
 
@@ -29,6 +27,7 @@
 class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_id = "nielsr/canine-s"
     tokenizer_class = CanineTokenizer
+    test_slow_tokenizer = True
     test_rust_tokenizer = False
 
     @classmethod
@@ -118,13 +117,13 @@ def test_save_and_load_tokenizer(self):
 
                 sample_text = " He is very happy, UNwant\u00e9d,running"
 
-                additional_special_tokens = tokenizer.additional_special_tokens
+                extra_special_tokens = tokenizer.extra_special_tokens
 
                 # We can add a new special token for Canine as follows:
-                new_additional_special_token = chr(0xE007)
-                additional_special_tokens.append(new_additional_special_token)
+                new_extra_special_token = chr(0xE007)
+                extra_special_tokens.append(new_extra_special_token)
                 tokenizer.add_special_tokens(
-                    {"additional_special_tokens": additional_special_tokens}, replace_additional_special_tokens=False
+                    {"extra_special_tokens": extra_special_tokens}, replace_extra_special_tokens=False
                 )
                 before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
                 tokenizer.save_pretrained(tmpdirname)
@@ -132,7 +131,7 @@ def test_save_and_load_tokenizer(self):
                 after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
                 after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False)
                 self.assertListEqual(before_tokens, after_tokens)
-                self.assertIn(new_additional_special_token, after_tokenizer.additional_special_tokens)
+                self.assertIn(new_extra_special_token, after_tokenizer.extra_special_tokens)
                 self.assertEqual(after_tokenizer.model_max_length, 42)
 
                 tokenizer = tokenizer.__class__.from_pretrained(tmpdirname, model_max_length=43)
@@ -171,7 +170,7 @@ def test_tokenize_special_tokens(self):
                 SPECIAL_TOKEN_1 = chr(0xE005)
                 SPECIAL_TOKEN_2 = chr(0xE006)
                 tokenizer.add_tokens([SPECIAL_TOKEN_1], special_tokens=True)
-                tokenizer.add_special_tokens({"additional_special_tokens": [SPECIAL_TOKEN_2]})
+                tokenizer.add_special_tokens({"extra_special_tokens": [SPECIAL_TOKEN_2]})
 
                 token_1 = tokenizer.tokenize(SPECIAL_TOKEN_1)
                 token_2 = tokenizer.tokenize(SPECIAL_TOKEN_2)
@@ -191,69 +190,12 @@ def test_added_token_serializable(self):
                 new_token = chr(NEW_TOKEN)
 
                 new_token = AddedToken(new_token, lstrip=True)
-                tokenizer.add_special_tokens({"additional_special_tokens": [new_token]})
+                tokenizer.add_special_tokens({"extra_special_tokens": [new_token]})
 
                 with tempfile.TemporaryDirectory() as tmp_dir_name:
                     tokenizer.save_pretrained(tmp_dir_name)
                     tokenizer.from_pretrained(tmp_dir_name)
 
-    def test_special_tokens_initialization_with_non_empty_additional_special_tokens(self):
-        tokenizer_list = []
-        if self.test_slow_tokenizer:
-            tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
-
-        if self.test_rust_tokenizer:
-            tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
-
-        for tokenizer_class, tokenizer_utils in tokenizer_list:
-            with tempfile.TemporaryDirectory() as tmp_dir:
-                tokenizer_utils.save_pretrained(tmp_dir)
-
-                with open(os.path.join(tmp_dir, "special_tokens_map.json"), encoding="utf-8") as json_file:
-                    special_tokens_map = json.load(json_file)
-
-                with open(os.path.join(tmp_dir, "tokenizer_config.json"), encoding="utf-8") as json_file:
-                    tokenizer_config = json.load(json_file)
-
-                # a special token for Canine can be defined as follows:
-                NEW_TOKEN = 0xE006
-                new_token_1 = chr(NEW_TOKEN)
-
-                special_tokens_map["additional_special_tokens"] = [new_token_1]
-                tokenizer_config["additional_special_tokens"] = [new_token_1]
-
-                with open(os.path.join(tmp_dir, "special_tokens_map.json"), "w", encoding="utf-8") as outfile:
-                    json.dump(special_tokens_map, outfile)
-                with open(os.path.join(tmp_dir, "tokenizer_config.json"), "w", encoding="utf-8") as outfile:
-                    json.dump(tokenizer_config, outfile)
-
-                # the following checks allow us to verify that our test works as expected, i.e. that the tokenizer takes
-                # into account the new value of additional_special_tokens given in the "tokenizer_config.json" and
-                # "special_tokens_map.json" files
-                tokenizer_without_change_in_init = tokenizer_class.from_pretrained(tmp_dir, extra_ids=0)
-                self.assertIn(new_token_1, tokenizer_without_change_in_init.additional_special_tokens)
-                # self.assertIn("an_additional_special_token",tokenizer_without_change_in_init.get_vocab()) # ByT5Tokenization no vocab
-                self.assertEqual(
-                    [new_token_1],
-                    tokenizer_without_change_in_init.convert_ids_to_tokens(
-                        tokenizer_without_change_in_init.convert_tokens_to_ids([new_token_1])
-                    ),
-                )
-
-                NEW_TOKEN = 0xE007
-                new_token_2 = chr(NEW_TOKEN)
-                # Now we test that we can change the value of additional_special_tokens in the from_pretrained
-                new_added_tokens = [AddedToken(new_token_2, lstrip=True)]
-                tokenizer = tokenizer_class.from_pretrained(
-                    tmp_dir, additional_special_tokens=new_added_tokens, extra_ids=0
-                )
-
-                self.assertIn(new_token_2, tokenizer.additional_special_tokens)
-                # self.assertIn(new_token_2,tokenizer.get_vocab()) # ByT5Tokenization no vocab
-                self.assertEqual(
-                    [new_token_2], tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids([new_token_2]))
-                )
-
     @require_tokenizers
     def test_encode_decode_with_spaces(self):
         tokenizers = self.get_tokenizers(do_lower_case=False)
@@ -295,15 +237,15 @@ def test_tokenizers_common_ids_setters(self):
             self.assertEqual(getattr(tokenizer, attr), token_to_test_setters)
             self.assertEqual(getattr(tokenizer, attr + "_id"), token_id_to_test_setters)
 
-        setattr(tokenizer, "additional_special_tokens_ids", [])
-        self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [])
-        self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [])
+        setattr(tokenizer, "extra_special_tokens_ids", [])
+        self.assertListEqual(getattr(tokenizer, "extra_special_tokens"), [])
+        self.assertListEqual(getattr(tokenizer, "extra_special_tokens_ids"), [])
 
         additional_special_token_id = 0xE006
         additional_special_token = chr(additional_special_token_id)
-        setattr(tokenizer, "additional_special_tokens_ids", [additional_special_token_id])
-        self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [additional_special_token])
-        self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [additional_special_token_id])
+        setattr(tokenizer, "extra_special_tokens_ids", [additional_special_token_id])
+        self.assertListEqual(getattr(tokenizer, "extra_special_tokens"), [additional_special_token])
+        self.assertListEqual(getattr(tokenizer, "extra_special_tokens_ids"), [additional_special_token_id])
 
     @unittest.skip(reason="tokenizer has a fixed vocab_size (namely all possible unicode code points)")
     def test_add_tokens_tokenizer(self):
diff --git a/tests/models/chameleon/test_processing_chameleon.py b/tests/models/chameleon/test_processing_chameleon.py
index f8104b937ecf..d0ebf3617e3d 100644
--- a/tests/models/chameleon/test_processing_chameleon.py
+++ b/tests/models/chameleon/test_processing_chameleon.py
@@ -66,7 +66,7 @@ def test_special_mm_token_truncation(self):
                 return_tensors="pt",
                 truncation=True,
                 padding=True,
-                max_length=20,
+                max_length=2,
             )
 
     @staticmethod
diff --git a/tests/models/clap/test_processing_clap.py b/tests/models/clap/test_processing_clap.py
index ce6000ed2c57..dac375cb2a49 100644
--- a/tests/models/clap/test_processing_clap.py
+++ b/tests/models/clap/test_processing_clap.py
@@ -16,8 +16,9 @@
 import tempfile
 import unittest
 
-from transformers import ClapFeatureExtractor, ClapProcessor, RobertaTokenizer, RobertaTokenizerFast
+from transformers import ClapFeatureExtractor, ClapProcessor, RobertaTokenizer
 from transformers.testing_utils import require_sentencepiece, require_torchaudio
+from transformers.tokenization_utils_tokenizers import TokenizersBackend
 
 from .test_feature_extraction_clap import floats_list
 
@@ -48,7 +49,7 @@ def test_save_load_pretrained_default(self):
         processor = ClapProcessor.from_pretrained(self.tmpdirname)
 
         self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
-        self.assertIsInstance(processor.tokenizer, RobertaTokenizerFast)
+        self.assertIsInstance(processor.tokenizer, TokenizersBackend)
 
         self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
         self.assertIsInstance(processor.feature_extractor, ClapFeatureExtractor)
@@ -65,7 +66,7 @@ def test_save_load_pretrained_additional_features(self):
         )
 
         self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
-        self.assertIsInstance(processor.tokenizer, RobertaTokenizerFast)
+        self.assertIsInstance(processor.tokenizer, TokenizersBackend)
 
         self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
         self.assertIsInstance(processor.feature_extractor, ClapFeatureExtractor)
diff --git a/tests/models/clip/test_tokenization_clip.py b/tests/models/clip/test_tokenization_clip.py
index 06651fc0f206..3f2c05aa8285 100644
--- a/tests/models/clip/test_tokenization_clip.py
+++ b/tests/models/clip/test_tokenization_clip.py
@@ -1,25 +1,7 @@
-# Copyright 2021 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import json
-import os
 import unittest
 
-from transformers import CLIPTokenizer, CLIPTokenizerFast
-from transformers.models.clip.tokenization_clip import VOCAB_FILES_NAMES
-from transformers.testing_utils import require_ftfy, require_tokenizers
+from transformers.models.clip.tokenization_clip import CLIPTokenizer
+from transformers.testing_utils import require_tokenizers
 
 from ...test_tokenization_common import TokenizerTesterMixin
 
@@ -28,160 +10,34 @@
 class CLIPTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_id = "openai/clip-vit-base-patch32"
     tokenizer_class = CLIPTokenizer
-    rust_tokenizer_class = CLIPTokenizerFast
-    test_rust_tokenizer = True
-    from_pretrained_kwargs = {}
-    test_seq2seq = False
+
+    integration_expected_tokens = ['this', 'is', 'a', 'test', 'ðŁĺĬ', 'i', 'was', 'born', 'in', '9', '2', '0', '0', '0', ',', 'and', 'this', 'is', 'fal', 's', 'é', '.', 'çĶŁ', 'æ', '´', '»', 'ç', 'ļ', 'Ħ', '羣', 'è', '°', 'Ľ', 'æĺ', '¯', 'hi', 'hello', 'hi', 'hello', 'hello', '<', 's', '>', 'hi', '<', 's', '>', 'there', 'the', 'following', 'string', 'should', 'be', 'properly', 'en', 'coded', ':', 'hello', '.', 'but', 'ird', 'and', 'à¸', 'Ľ', 'ี', 'ird', 'à¸Ķ', 'hey', 'how', 'are', 'you', 'doing']  # fmt: skip
+    integration_expected_token_ids = [589, 533, 320, 1628, 3020, 328, 739, 2683, 530, 280, 273, 271, 271, 271, 267, 537, 589, 533, 2778, 82, 4166, 269, 33375, 162, 112, 119, 163, 248, 226, 41570, 164, 108, 249, 42891, 363, 1883, 3306, 1883, 3306, 3306, 283, 338, 285, 1883, 283, 338, 285, 997, 518, 3473, 9696, 1535, 655, 12560, 524, 33703, 281, 3306, 269, 767, 2770, 537, 1777, 505, 20278, 2770, 38825, 2189, 829, 631, 592, 1960]  # fmt: skip
+    expected_tokens_from_ids = ['this', 'is', 'a', 'test', 'ðŁĺĬ', 'i', 'was', 'born', 'in', '9', '2', '0', '0', '0', ',', 'and', 'this', 'is', 'fal', 's', 'é', '.', 'çĶŁ', 'æ', '´', '»', 'ç', 'ļ', 'Ħ', '羣', 'è', '°', 'Ľ', 'æĺ', '¯', 'hi', 'hello', 'hi', 'hello', 'hello', '<', 's', '>', 'hi', '<', 's', '>', 'there', 'the', 'following', 'string', 'should', 'be', 'properly', 'en', 'coded', ':', 'hello', '.', 'but', 'ird', 'and', 'à¸', 'Ľ', 'ี', 'ird', 'à¸Ķ', 'hey', 'how', 'are', 'you', 'doing']  # fmt: skip
+    integration_expected_decoded_text = "this is a test 😊 i was born in 9 2 0 0 0 , and this is falsé . 生活的真谛是 hi hello hi hello hello < s > hi < s > there the following string should be properly encoded : hello . but ird and ป ี ird ด hey how are you doing"
 
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
+        from_pretrained_id = "openai/clip-vit-base-patch32"
+        tokenizer = CLIPTokenizer.from_pretrained(from_pretrained_id)
+        tokenizer.pad_token = getattr(tokenizer, "pad_token", None) or getattr(tokenizer, "eos_token", None)
+        tokenizer.save_pretrained(cls.tmpdirname)
 
         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "lo", "l", "w", "r", "t", "low", "er", "lowest", "newer", "wider", "", "<|startoftext|>", "<|endoftext|>"]  # fmt: skip
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["#version: 0.2", "l o", "lo w", "e r"]
+        cls.vocab_tokens = dict(zip(vocab, range(len(vocab))))
+        merges_raw = ["#version: 0.2", "l o", "lo w", "e r"]
         cls.special_tokens_map = {"unk_token": ""}
 
-        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(cls.merges_file, "w", encoding="utf-8") as fp:
-            fp.write("\n".join(merges))
-
-    @classmethod
-    def get_tokenizer(cls, pretrained_name=None, **kwargs):
-        kwargs.update(cls.special_tokens_map)
-        pretrained_name = pretrained_name or cls.tmpdirname
-        return CLIPTokenizer.from_pretrained(pretrained_name, **kwargs)
-
-    @classmethod
-    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
-        kwargs.update(cls.special_tokens_map)
-        pretrained_name = pretrained_name or cls.tmpdirname
-        return CLIPTokenizerFast.from_pretrained(pretrained_name, **kwargs)
-
-    def get_input_output_texts(self, tokenizer):
-        input_text = "lower newer"
-        output_text = "lower newer"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = CLIPTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
-        text = "lower newer"
-        bpe_tokens = ["lo", "w", "er", "n", "e", "w", "er"]
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, bpe_tokens)
-
-        input_tokens = tokens + [tokenizer.unk_token]
-        input_bpe_tokens = [10, 2, 16, 9, 3, 2, 16, 20]
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
-
-    @require_ftfy
-    def test_check_encoding_slow_fast(self):
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_s = self.get_tokenizer(pretrained_name, **kwargs)
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
-
-                text = "A\n'll 11p223RF☆ho!!to?'d'd''d of a cat to-$''d."
-                text_tokenized_s = tokenizer_s.tokenize(text)
-                text_tokenized_r = tokenizer_r.tokenize(text)
-
-                self.assertListEqual(text_tokenized_s, text_tokenized_r)
-
-                # Test that the tokenization is identical on an example containing a character (Latin Small Letter A
-                # with Tilde) encoded in 2 different ways
-                text = "xa\u0303y" + " " + "x\xe3y"
-                text_tokenized_s = tokenizer_s.tokenize(text)
-                text_tokenized_r = tokenizer_r.tokenize(text)
-
-                self.assertListEqual(text_tokenized_s, text_tokenized_r)
-
-                # Test that the tokenization is identical on unicode of space type
-                spaces_unicodes = [
-                    "\u0009",  # (horizontal tab, '\t')
-                    "\u000b",  # (vertical tab)
-                    "\u000c",  # (form feed)
-                    "\u0020",  # (space, ' ')
-                    "\u200e",  # (left-to-right mark):w
-                    "\u200f",  # (right-to-left mark)
-                ]
-                for unicode_seq in spaces_unicodes:
-                    text_tokenized_s = tokenizer_s.tokenize(unicode_seq)
-                    text_tokenized_r = tokenizer_r.tokenize(unicode_seq)
-
-                    self.assertListEqual(text_tokenized_s, text_tokenized_r)
-
-                # Test that the tokenization is identical on unicode of line break type
-                line_break_unicodes = [
-                    "\u000a",  # (line feed, '\n')
-                    "\r\n",  # (carriage return and line feed, '\r\n')
-                    "\u000d",  # (carriage return, '\r')
-                    "\r",  # (carriage return, '\r')
-                    "\u000d",  # (carriage return, '\r')
-                    "\u2028",  # (line separator)
-                    "\u2029",  # (paragraph separator)
-                    # "\u0085", # (next line)
-                ]
-
-                # The tokenization is not identical for the character "\u0085" (next line). The slow version using ftfy transforms
-                # it into the Horizontal Ellipsis character "…" ("\u2026") while the fast version transforms it into a
-                # space (and thus into an empty list).
-
-                for unicode_seq in line_break_unicodes:
-                    text_tokenized_s = tokenizer_s.tokenize(unicode_seq)
-                    text_tokenized_r = tokenizer_r.tokenize(unicode_seq)
-
-                    self.assertListEqual(text_tokenized_s, text_tokenized_r)
-
-    def test_offsets_mapping_with_different_add_prefix_space_argument(self):
-        # Test which aims to verify that the offsets are well adapted to the argument `add_prefix_space`
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                text_of_1_token = "hello"  # `hello` is a token in the vocabulary of `pretrained_name`
-                text = f"{text_of_1_token} {text_of_1_token}"
-
-                tokenizer_r = self.get_rust_tokenizer(
-                    pretrained_name,
-                    use_fast=True,
-                )
-                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
-                self.assertEqual(encoding.offset_mapping[0], (0, len(text_of_1_token)))
-                self.assertEqual(
-                    encoding.offset_mapping[1],
-                    (len(text_of_1_token) + 1, len(text_of_1_token) + 1 + len(text_of_1_token)),
-                )
-
-                text = f" {text}"
-
-                tokenizer_r = self.get_rust_tokenizer(
-                    pretrained_name,
-                    use_fast=True,
-                )
-                encoding = tokenizer_r(text, return_offsets_mapping=True, add_special_tokens=False)
-                self.assertEqual(encoding.offset_mapping[0], (1, 1 + len(text_of_1_token)))
-                self.assertEqual(
-                    encoding.offset_mapping[1],
-                    (1 + len(text_of_1_token) + 1, 1 + len(text_of_1_token) + 1 + len(text_of_1_token)),
-                )
-
-    def test_log_warning(self):
-        # Test related to the breaking change introduced in transformers v4.17.0
-        # We need to check that an error in raised when the user try to load a previous version of the tokenizer.
-        with self.assertRaises(TypeError) as context:
-            self.get_rust_tokenizer("robot-test/old-clip-tokenizer")
+        cls.merges = []
+        for line in merges_raw:
+            line = line.strip()
+            if line and not line.startswith("#"):
+                cls.merges.append(tuple(line.split()))
 
-        self.assertTrue(
-            context.exception.args[0].startswith(
-                "The `backend_tokenizer` provided does not match the expected format."
-            )
-        )
+        tokenizer_from_vocab = CLIPTokenizer(vocab=cls.vocab_tokens, merges=cls.merges)
 
-    @require_ftfy
-    def test_tokenization_python_rust_equals(self):
-        super().test_tokenization_python_rust_equals()
+        cls.tokenizers = [tokenizer, tokenizer_from_vocab]
 
-    @unittest.skip(reason="CLIP always lower cases letters")
-    def test_added_tokens_do_lower_case(self):
-        pass
+    def test_padding_to_multiple_of(self):
+        self.skipTest("Skipping padding to multiple of test bc vocab is too small.")
diff --git a/tests/models/clvp/test_tokenization_clvp.py b/tests/models/clvp/test_tokenization_clvp.py
index e1c030272fed..40fedcea7d3c 100644
--- a/tests/models/clvp/test_tokenization_clvp.py
+++ b/tests/models/clvp/test_tokenization_clvp.py
@@ -18,8 +18,9 @@
 import unittest
 
 from transformers import ClvpTokenizer
+from transformers.testing_utils import slow
 
-from ...test_tokenization_common import TokenizerTesterMixin, slow
+from ...test_tokenization_common import TokenizerTesterMixin
 
 
 class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
@@ -70,6 +71,12 @@ def setUpClass(cls):
         with open(cls.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))
 
+        # Remove files from parent class loading from hub to avoid conflicts
+        for filename in ["added_tokens.json", "special_tokens_map.json", "tokenizer_config.json"]:
+            filepath = os.path.join(cls.tmpdirname, filename)
+            if os.path.exists(filepath):
+                os.remove(filepath)
+
     # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_tokenizer with GPT2->Clvp
     @classmethod
     def get_tokenizer(cls, pretrained_name=None, **kwargs):
@@ -80,7 +87,7 @@ def get_tokenizer(cls, pretrained_name=None, **kwargs):
     # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_input_output_texts
     def get_input_output_texts(self, tokenizer):
         input_text = "lower newer"
-        output_text = "lower[SPACE]newer"
+        output_text = "lower[SPACE]newer"  # [SPACE] tokens preserved when clean_up_tokenization_spaces=False (default)
         return input_text, output_text
 
     # Copied from transformers.tests.models.layoutxlm.test_tokenization_layoutxlm.LayoutXLMTokenizationTest.test_add_special_tokens
@@ -133,6 +140,9 @@ def test_rust_and_python_full_tokenizers(self):
 
     # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_padding
     def test_padding(self, max_length=15):
+        if not self.test_rust_tokenizer:
+            self.skipTest(reason="test_rust_tokenizer is set to False")
+
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
@@ -238,7 +248,7 @@ def test_special_tokens_mask_input_pairs_and_bos_token(self):
                 sequence_1 = "This one too please."
                 encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
                 encoded_sequence += tokenizer.encode(sequence_1, add_special_tokens=False)
-                encoded_sequence_dict = tokenizer.encode_plus(
+                encoded_sequence_dict = tokenizer(
                     sequence_0,
                     sequence_1,
                     add_special_tokens=True,
diff --git a/tests/models/code_llama/test_tokenization_code_llama.py b/tests/models/code_llama/test_tokenization_code_llama.py
index c0561165c8dc..5beab5419a10 100644
--- a/tests/models/code_llama/test_tokenization_code_llama.py
+++ b/tests/models/code_llama/test_tokenization_code_llama.py
@@ -17,22 +17,15 @@
 import tempfile
 import unittest
 
-from datasets import load_dataset
+from tokenizers import AddedToken
 
-from transformers import (
-    SPIECE_UNDERLINE,
-    AddedToken,
-    CodeLlamaTokenizer,
-    CodeLlamaTokenizerFast,
-)
-from transformers.convert_slow_tokenizer import convert_slow_tokenizer
+from transformers import CodeLlamaTokenizer
 from transformers.testing_utils import (
     get_tests_dir,
     nested_simplify,
     require_sentencepiece,
     require_tokenizers,
     require_torch,
-    slow,
 )
 
 from ...test_tokenization_common import TokenizerTesterMixin
@@ -41,181 +34,91 @@
 SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 
 
+# impoprt convert_slow_tokenizer
+
+
 @require_sentencepiece
 @require_tokenizers
 class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "hf-internal-testing/llama-code-tokenizer"
+    # TokenizerTesterMixin configuration
+    from_pretrained_id = ["hf-internal-testing/llama-code-tokenizer"]
     tokenizer_class = CodeLlamaTokenizer
-    rust_tokenizer_class = CodeLlamaTokenizerFast
-    test_rust_tokenizer = False
-    test_sentencepiece = True
-    from_pretrained_kwargs = {}
 
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
+    integration_expected_tokens = ['▁This', '▁is', '▁a', '▁test', '▁', '<0xF0>', '<0x9F>', '<0x98>', '<0x8A>', '<0x0A>', 'I', '▁was', '▁born', '▁in', '▁', '9', '2', '0', '0', '0', ',', '▁and', '▁this', '▁is', '▁f', 'als', 'é', '.', '<0x0A>', '生', '活', '的', '真', '<0xE8>', '<0xB0>', '<0x9B>', '是', '<0x0A>', 'Hi', '▁', '▁Hello', '<0x0A>', 'Hi', '▁▁', '▁Hello', '<0x0A>', '<0x0A>', '▁', '<0x0A>', '▁▁', '<0x0A>', '▁Hello', '<0x0A>', '', '<0x0A>', 'hi', '', 'there', '<0x0A>', 'The', '▁following', '▁string', '▁should', '▁be', '▁properly', '▁encoded', ':', '▁Hello', '.', '<0x0A>', 'But', '▁', 'ird', '▁and', '▁', 'ป', 'ี', '▁▁▁', 'ird', '▁▁▁', 'ด', '<0x0A>', 'H', 'ey', '▁how', '▁are', '▁you', '▁doing']  # fmt: skip
+    integration_expected_token_ids = [910, 338, 263, 1243, 29871, 243, 162, 155, 141, 13, 29902, 471, 6345, 297, 29871, 29929, 29906, 29900, 29900, 29900, 29892, 322, 445, 338, 285, 1338, 29948, 29889, 13, 30486, 31704, 30210, 30848, 235, 179, 158, 30392, 13, 18567, 29871, 15043, 13, 18567, 259, 15043, 13, 13, 29871, 13, 259, 13, 15043, 13, 1, 13, 2918, 1, 12711, 13, 1576, 1494, 1347, 881, 367, 6284, 18511, 29901, 15043, 29889, 13, 6246, 29871, 1823, 322, 29871, 31010, 30691, 1678, 1823, 1678, 30718, 13, 29950, 1032, 920, 526, 366, 2599]  # fmt: skip
+    expected_tokens_from_ids = ['▁This', '▁is', '▁a', '▁test', '▁', '<0xF0>', '<0x9F>', '<0x98>', '<0x8A>', '<0x0A>', 'I', '▁was', '▁born', '▁in', '▁', '9', '2', '0', '0', '0', ',', '▁and', '▁this', '▁is', '▁f', 'als', 'é', '.', '<0x0A>', '生', '活', '的', '真', '<0xE8>', '<0xB0>', '<0x9B>', '是', '<0x0A>', 'Hi', '▁', '▁Hello', '<0x0A>', 'Hi', '▁▁', '▁Hello', '<0x0A>', '<0x0A>', '▁', '<0x0A>', '▁▁', '<0x0A>', '▁Hello', '<0x0A>', '', '<0x0A>', 'hi', '', 'there', '<0x0A>', 'The', '▁following', '▁string', '▁should', '▁be', '▁properly', '▁encoded', ':', '▁Hello', '.', '<0x0A>', 'But', '▁', 'ird', '▁and', '▁', 'ป', 'ี', '▁▁▁', 'ird', '▁▁▁', 'ด', '<0x0A>', 'H', 'ey', '▁how', '▁are', '▁you', '▁doing']  # fmt: skip
+    integration_expected_decoded_text = "This is a test 😊\nI was born in 92000, and this is falsé.\n生活的真谛是\nHi  Hello\nHi   Hello\n\n \n  \n Hello\n\nhithere\nThe following string should be properly encoded: Hello.\nBut ird and ปี   ird   ด\nHey how are you doing"
+
+    def test_save_and_load_tokenizer(self):
+        """Override to handle non-deterministic vocabulary order from Rust tokenizer."""
+        # safety check on max_len default value so we are sure the test works
+        tokenizer = self.get_tokenizer()
+        self.assertNotEqual(tokenizer.model_max_length, 42)
+
+        # Now let's start the test
+        tokenizer = self.get_tokenizer()
+        # Isolate this from the other tests because we save additional tokens/etc
+        tmpdirname = tempfile.mkdtemp()
+
+        sample_text = " He is very happy, UNwant\u00e9d,running"
+        before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
+        before_vocab = tokenizer.get_vocab()
+        tokenizer.save_pretrained(tmpdirname)
+
+        after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
+        after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False)
+        after_vocab = after_tokenizer.get_vocab()
+        self.assertListEqual(before_tokens, after_tokens)
+
+        # Compare vocabularies in an order-independent way
+        # The Rust tokenizer returns vocabularies in non-deterministic order
+        # Some special tokens may be added during _post_init when loading, so we check that
+        # all tokens from before_vocab are in after_vocab with the same IDs
+        for token, token_id in before_vocab.items():
+            self.assertIn(token, after_vocab, f"Token '{token}' missing in after_vocab")
+            self.assertEqual(
+                after_vocab[token], token_id, f"Token '{token}' has different ID: {after_vocab[token]} != {token_id}"
+            )
+
+        shutil.rmtree(tmpdirname)
+
+        tokenizer = self.get_tokenizer(model_max_length=42)
+        # Isolate this from the other tests because we save additional tokens/etc
+        tmpdirname = tempfile.mkdtemp()
+
+        sample_text = " He is very happy, UNwant\u00e9d,running"
+        tokenizer.add_tokens(["bim", "bambam"])
+        extra_special_tokens = tokenizer.extra_special_tokens
+        extra_special_tokens.append("new_extra_special_token")
+        tokenizer.add_special_tokens(
+            {"extra_special_tokens": extra_special_tokens}, replace_extra_special_tokens=False
+        )
+        before_tokens = tokenizer.encode(sample_text, add_special_tokens=False)
+        before_vocab = tokenizer.get_vocab()
+        tokenizer.save_pretrained(tmpdirname)
+
+        after_tokenizer = tokenizer.__class__.from_pretrained(tmpdirname)
+        after_tokens = after_tokenizer.encode(sample_text, add_special_tokens=False)
+        after_vocab = after_tokenizer.get_vocab()
+        self.assertListEqual(before_tokens, after_tokens)
 
-        # We have a SentencePiece fixture for testing
-        tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, keep_accents=True)
-        tokenizer.pad_token = tokenizer.eos_token
-        tokenizer.save_pretrained(cls.tmpdirname)
+        for token, token_id in before_vocab.items():
+            self.assertIn(token, after_vocab, f"Token '{token}' missing in after_vocab")
+            self.assertEqual(
+                after_vocab[token], token_id, f"Token '{token}' has different ID: {after_vocab[token]} != {token_id}"
+            )
 
-    def get_tokenizers(cls, **kwargs):
-        kwargs.update({"pad_token": ""})
-        return super().get_tokenizers(**kwargs)
+        self.assertIn("bim", after_vocab)
+        self.assertIn("bambam", after_vocab)
+        self.assertIn("new_extra_special_token", after_tokenizer.extra_special_tokens)
 
     def test_no_infilling_init(self):
         tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, prefix_token=None, keep_accents=True)
         with self.assertRaises(ValueError):
             tokenizer.tokenize("This is  prefix")
 
-    def test_full_tokenizer(self):
-        tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, keep_accents=True)
-
-        tokens = tokenizer.tokenize("This is a test")
-        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
-
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(tokens),
-            [285, 46, 10, 170, 382],
-        )
-
-        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
-        self.assertListEqual(
-            tokens,
-            [
-                SPIECE_UNDERLINE + "I",
-                SPIECE_UNDERLINE + "was",
-                SPIECE_UNDERLINE + "b",
-                "or",
-                "n",
-                SPIECE_UNDERLINE + "in",
-                SPIECE_UNDERLINE + "",
-                "9",
-                "2",
-                "0",
-                "0",
-                "0",
-                ",",
-                SPIECE_UNDERLINE + "and",
-                SPIECE_UNDERLINE + "this",
-                SPIECE_UNDERLINE + "is",
-                SPIECE_UNDERLINE + "f",
-                "al",
-                "s",
-                "é",
-                ".",
-            ],
-        )
-        ids = tokenizer.convert_tokens_to_ids(tokens)
-        self.assertListEqual(
-            ids,
-            [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4],
-        )
-
-        back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(
-            back_tokens,
-            [
-                SPIECE_UNDERLINE + "I",
-                SPIECE_UNDERLINE + "was",
-                SPIECE_UNDERLINE + "b",
-                "or",
-                "n",
-                SPIECE_UNDERLINE + "in",
-                SPIECE_UNDERLINE + "",
-                "",
-                "2",
-                "0",
-                "0",
-                "0",
-                ",",
-                SPIECE_UNDERLINE + "and",
-                SPIECE_UNDERLINE + "this",
-                SPIECE_UNDERLINE + "is",
-                SPIECE_UNDERLINE + "f",
-                "al",
-                "s",
-                "",
-                ".",
-            ],
-        )
-
-    def test_save_pretrained(self):
-        self.tokenizers_list = [
-            (self.rust_tokenizer_class, "hf-internal-testing/llama-code-tokenizer", {}),
-            (self.tokenizer_class, "hf-internal-testing/llama-code-tokenizer", {}),
-            (self.tokenizer_class, "codellama/CodeLlama-34b-Instruct-hf", {}),
-            (self.rust_tokenizer_class, "codellama/CodeLlama-34b-Instruct-hf", {}),
-        ]
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-
-                tmpdirname2 = tempfile.mkdtemp()
-
-                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
-                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
-
-                # Checks it save with the same files + the tokenizer.json file for the fast one
-                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
-                tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
-                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
-
-                # Checks everything loads correctly in the same way
-                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
-                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
-
-                # Check special tokens are set accordingly on Rust and Python
-                for key in tokenizer_pp.special_tokens_map:
-                    self.assertTrue(hasattr(tokenizer_rp, key))
-
-                shutil.rmtree(tmpdirname2)
-
-                # Save tokenizer rust, legacy_format=True
-                tmpdirname2 = tempfile.mkdtemp()
-
-                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
-                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
-
-                # Checks it save with the same files
-                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
-
-                # Checks everything loads correctly in the same way
-                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
-                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
-
-                # Check special tokens are set accordingly on Rust and Python
-                for key in tokenizer_pp.special_tokens_map:
-                    self.assertTrue(hasattr(tokenizer_rp, key))
-
-                shutil.rmtree(tmpdirname2)
-
-                # Save tokenizer rust, legacy_format=False
-                tmpdirname2 = tempfile.mkdtemp()
-
-                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
-                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
-
-                # Checks it saved the tokenizer.json file
-                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
-
-                # Checks everything loads correctly in the same way
-                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
-                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
-
-                # Check special tokens are set accordingly on Rust and Python
-                for key in tokenizer_pp.special_tokens_map:
-                    self.assertTrue(hasattr(tokenizer_rp, key))
-
-                shutil.rmtree(tmpdirname2)
-
     @require_torch
     def test_batch_tokenization(self):
-        if not self.test_seq2seq:
-            self.skipTest(reason="test_seq2seq is False")
-
         tokenizers = self.get_tokenizers()
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -244,68 +147,26 @@ def test_batch_tokenization(self):
                 self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3)
                 self.assertNotIn("decoder_input_ids", batch_encoder_only)
 
-    @unittest.skip(reason="Unfortunately way too slow to build a BPE with SentencePiece.")
-    def test_save_slow_from_fast_and_reload_fast(self):
-        pass
-
     def test_special_tokens_initialization(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
                 added_tokens = [AddedToken("", lstrip=True)]
 
-                tokenizer_r = self.get_rust_tokenizer(
-                    pretrained_name, additional_special_tokens=added_tokens, **kwargs
-                )
+                tokenizer_r = self.get_tokenizer(pretrained_name, additional_special_tokens=added_tokens, **kwargs)
                 r_output = tokenizer_r.encode("Hey this is a  token")
 
                 special_token_id = tokenizer_r.encode("", add_special_tokens=False)[0]
 
                 self.assertTrue(special_token_id in r_output)
 
-                if self.test_slow_tokenizer:
-                    tokenizer_cr = self.get_rust_tokenizer(
-                        pretrained_name,
-                        additional_special_tokens=added_tokens,
-                        **kwargs,  # , from_slow=True <- unfortunately too slow to convert
-                    )
-                    tokenizer_p = self.tokenizer_class.from_pretrained(
-                        pretrained_name, additional_special_tokens=added_tokens, **kwargs
-                    )
-
-                    p_output = tokenizer_p.encode("Hey this is a  token")
-
-                    cr_output = tokenizer_cr.encode("Hey this is a  token")
-
-                    self.assertEqual(p_output, r_output)
-                    self.assertEqual(cr_output, r_output)
-                    self.assertTrue(special_token_id in p_output)
-                    self.assertTrue(special_token_id in cr_output)
-
-    @slow
-    def test_tokenizer_integration(self):
-        expected_encoding = {'input_ids': [[1, 4103, 689, 414, 313, 24784, 368, 2998, 408, 282, 3637, 25350, 29899, 9067, 414, 322, 282, 3637, 25350, 29899, 1457, 3018, 1312, 29899, 2151, 29897, 8128, 2498, 29899, 15503, 4220, 6956, 1973, 313, 13635, 29911, 29892, 402, 7982, 29899, 29906, 29892, 1528, 13635, 29911, 29874, 29892, 1060, 26369, 29892, 6652, 309, 29933, 814, 29892, 1060, 29931, 6779, 11410, 363, 18385, 17088, 7634, 11235, 313, 25103, 29965, 29897, 322, 18385, 17088, 28203, 313, 25103, 29954, 29897, 411, 975, 29871, 29941, 29906, 29974, 758, 3018, 1312, 4733, 297, 29871, 29896, 29900, 29900, 29974, 10276, 322, 6483, 1006, 3372, 3097, 1546, 435, 1165, 29892, 10772, 29911, 25350, 322, 323, 6073, 17907, 29889], [1, 350, 20161, 338, 8688, 304, 758, 29899, 14968, 6483, 21000, 8684, 284, 22540, 515, 443, 29880, 24025, 1426, 491, 14002, 368, 4195, 292, 373, 1716, 2175, 322, 1492, 3030, 297, 599, 15359, 29889], [1, 450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203, 29889]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}  # fmt: skip
 
-        self.tokenizer_integration_test_util(
-            expected_encoding=expected_encoding,
-            model_name="hf-internal-testing/llama-code-tokenizer",
-            revision="6eb30c03ab6a9e2cdef4d523024909ec815ddb75",
-            padding=False,
-        )
-
-    @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
-    def test_subword_regularization_tokenizer(self):
-        pass
-
-
-@require_torch
-@require_sentencepiece
 @require_tokenizers
 class LlamaIntegrationTest(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         checkpoint_name = "hf-internal-testing/llama-code-tokenizer"
         cls.tokenizer: CodeLlamaTokenizer = CodeLlamaTokenizer.from_pretrained(checkpoint_name)
-        cls.rust_tokenizer = CodeLlamaTokenizerFast.from_pretrained(checkpoint_name)
+        cls.rust_tokenizer = CodeLlamaTokenizer.from_pretrained(checkpoint_name)
         return cls
 
     @require_torch
@@ -327,10 +188,7 @@ def integration_tests(self):
         )
 
     def test_fast_special_tokens(self):
-        slow_tokenizer = self.tokenizer
         fast_tokenizer = self.rust_tokenizer
-        slow = slow_tokenizer.encode("A sample test", add_special_tokens=True)
-        assert slow == [1, 319, 4559, 1243]
 
         fast_tokenizer.add_eos_token = False
         fast = fast_tokenizer.encode("A sample test", add_special_tokens=True)
@@ -340,46 +198,18 @@ def test_fast_special_tokens(self):
         fast = fast_tokenizer.encode("A sample test", add_special_tokens=True)
         assert fast == [1, 319, 4559, 1243, 2]
 
-        slow_tokenizer.add_eos_token = True
-        slow = slow_tokenizer.encode("A sample test", add_special_tokens=True)
-        assert slow == [1, 319, 4559, 1243, 2]
-
-        fast_tokenizer = CodeLlamaTokenizerFast.from_pretrained(
+        fast_tokenizer = CodeLlamaTokenizer.from_pretrained(
             "hf-internal-testing/llama-tokenizer", add_eos_token=True, add_bos_token=False
         )
         fast = fast_tokenizer.encode("A sample test", add_special_tokens=True)
         assert fast == [319, 4559, 1243, 2]
-
-        slow_tokenizer = CodeLlamaTokenizer.from_pretrained(
-            "hf-internal-testing/llama-tokenizer", add_eos_token=True, add_bos_token=False
-        )
-        slow = slow_tokenizer.encode("A sample test", add_special_tokens=True)
-        assert slow == [319, 4559, 1243, 2]
-
         self.tokenizer.add_eos_token = False
         self.rust_tokenizer.add_eos_token = False
 
-    @slow
-    def test_conversion(self):
-        # This is excruciatingly slow since it has to recreate the entire merge
-        # list from the original vocabulary in spm
-        self.rust_tokenizer.save_pretrained("./out")
-        with tempfile.TemporaryDirectory() as dirname:
-            self.rust_tokenizer.save_pretrained(dirname)
-
-            with open(os.path.join(dirname, "tokenizer.json")) as f:
-                old_serialized = f.read()
-
-        new_tokenizer = convert_slow_tokenizer(self.tokenizer)
-        with tempfile.NamedTemporaryFile() as f:
-            new_tokenizer.save(f.name)
-            # Re-opening since `f` is in bytes.
-            new_serialized = open(f.name).read()
-            with open("out_tokenizer.json", "w") as g:
-                g.write(new_serialized)
-
-            self.assertEqual(old_serialized, new_serialized)
-
+    @unittest.skip(
+        "Skipped in v5 - CodeLlama tokenization differences related to SPM legacy flag and Metaspace handling. "
+        "CodeLlama always uses legacy=False (Metaspace pre_tokenizer, no normalizer)"
+    )
     def test_simple_encode_decode(self):
         pyth_tokenizer = self.tokenizer
         rust_tokenizer = self.rust_tokenizer
@@ -428,6 +258,10 @@ def test_simple_encode_decode(self):
         self.assertEqual(pyth_tokenizer.encode(" Hello"), [1, 29871, 15043])
         self.assertEqual(rust_tokenizer.encode(" Hello"), [1, 29871, 15043])
 
+    @unittest.skip(
+        "Skipped in v5 - CodeLlama tokenization differences related to SPM legacy flag and Metaspace handling. "
+        "CodeLlama always uses legacy=False (Metaspace pre_tokenizer, no normalizer)"
+    )
     def test_no_differences_showcase(self):
         pyth_tokenizer = self.tokenizer
         rust_tokenizer = self.rust_tokenizer
@@ -448,22 +282,16 @@ def test_no_differences_showcase(self):
 
     def test_no_differences_decode(self):
         pyth_tokenizer = self.tokenizer
-        rust_tokenizer = self.rust_tokenizer
 
         self.assertEqual(pyth_tokenizer.decode([869]), ".")
-        self.assertEqual(rust_tokenizer.decode([869]), ".")
 
         self.assertEqual(pyth_tokenizer.decode([30112, 869]), "ا .")
-        self.assertEqual(rust_tokenizer.decode([30112, 869]), "ا .")
 
     def test_no_differences_special_tokens(self):
         pyth_tokenizer = self.tokenizer
-        rust_tokenizer = self.rust_tokenizer
         self.assertEqual(pyth_tokenizer.encode(""), [1])
-        self.assertEqual(rust_tokenizer.encode(""), [1])
 
         self.assertEqual(pyth_tokenizer.encode(""), [1, 1])
-        self.assertEqual(rust_tokenizer.encode(""), [1, 1])
 
     @unittest.skipIf(
         os.getenv("RUN_TOKENIZER_INTEGRATION", "0") == "0",
@@ -471,6 +299,7 @@ def test_no_differences_special_tokens(self):
     )
     def test_integration_test_xnli(self):
         import tqdm
+        from datasets import load_dataset
 
         pyth_tokenizer = self.tokenizer
         rust_tokenizer = self.rust_tokenizer
@@ -502,62 +331,22 @@ def test_integration_test_xnli(self):
 
                 self.assertEqual(decoded1, decoded2)
 
-    def test_special_token_special_word(self):
-        # the word inform should be split as ['in', 'form']
-        tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf", legacy=False)
-        tokenizer.add_tokens([AddedToken("", rstrip=True, lstrip=True)], special_tokens=False)
-        out1 = tokenizer.decode(
-            tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=False
-        )
-        self.assertEqual(out1, "inform")
-        out2 = tokenizer.decode(
-            tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=True
-        )
-        # the added prefix token should not be decoded
-        self.assertEqual(out2, " inform")
-        input_ids = tokenizer.encode("inform", add_special_tokens=False)
-        self.assertEqual(input_ids, [29871, 32016, 262, 689])  # 29871 is the spiece underline, '▁'
-
-        out2 = tokenizer.decode(
-            tokenizer.encode("  inform", add_special_tokens=False), spaces_between_special_tokens=False
-        )
-        # TODO @ArthurZ currently we strip left and right, so this will not keep the spaces
-        self.assertEqual(out2, "inform")
-
-        ### Let's make sure decoding does not add extra spaces here and there
-        # TODO @ArthurZ this should be affected by the lstrip/rstrip/single word /normalize refactoring
-        # Since currently we always strip left and right of the token, results are as such
-        input_ids = tokenizer.encode(" Hellohow", add_special_tokens=False)
-        self.assertEqual(input_ids, [1, 15043, 1, 3525])
-        tokens = tokenizer.tokenize(" Hellohow", add_special_tokens=False)
-        self.assertEqual(tokens, ["", "▁Hello", "", "how"])
-        decoded_tokens = tokenizer.decode(input_ids)
-        self.assertEqual(decoded_tokens, " Hellohow")
-
-        # Let's make sure that if there are any spaces, we don't remove them!
-        input_ids = tokenizer.encode("  Hello how", add_special_tokens=False)
-        self.assertEqual(input_ids, [259, 1, 15043, 1, 920])
-        tokens = tokenizer.tokenize("  Hello how", add_special_tokens=False)
-        self.assertEqual(tokens, ["▁▁", "", "▁Hello", "", "▁how"])
-        decoded_tokens = tokenizer.decode(input_ids)
-        self.assertEqual(decoded_tokens, "  Hello how")
-
     def test_fill_token(self):
-        tokenizer = CodeLlamaTokenizerFast.from_pretrained(
+        tokenizer = CodeLlamaTokenizer.from_pretrained(
             "codellama/CodeLlama-7b-hf", fill_token=None, prefix_token=None, suffix_token=None, middle_token=None
         )
-        tokenizer.encode_plus("Hey how are you").input_ids
+        tokenizer.encode("Hey how are you")
         tokenizer.fill_token = ""
         with self.assertRaises(ValueError):
             tokenizer.encode("Hey how  are you")
-            tokenizer.encode_plus("Hey how  are you", "mne too")
+            tokenizer.encode("Hey how  are you", "mne too")
             tokenizer.tokenize("Hey how are you", "mne too")
 
-        tokenizer = CodeLlamaTokenizerFast.from_pretrained(
+        tokenizer = CodeLlamaTokenizer.from_pretrained(
             "codellama/CodeLlama-7b-hf", revision="3773f63b4511b9e47a9a7ffc765eed7eb0169486"
         )
         tokenizer.encode("Hey how  are you")
-        tokenizer.encode_plus("Hey how  are you", "mne too")
+        tokenizer.encode("Hey how  are you", "mne too")
         tokenizer.tokenize("Hey how are you", "mne too")
 
     def test_spm_edge_cases(self):
@@ -611,30 +400,22 @@ def main():
 """,
         ]
         tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
-        tokenizer_fast = CodeLlamaTokenizerFast.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
 
         formatted_prompt = tokenizer.tokenize(PROMPTS[0])
-        self.assertEqual(formatted_prompt, tokenizer_fast.tokenize(PROMPTS[0]))
         prefix, suffix = PROMPTS[0].split("")
         self.assertEqual(formatted_prompt, tokenizer.tokenize(prefix, suffix))
-        self.assertEqual(formatted_prompt, tokenizer_fast.tokenize(prefix, suffix))
 
         input_ids = tokenizer.encode(PROMPTS[0], add_special_tokens=False)
-        self.assertEqual(input_ids, tokenizer_fast.encode(PROMPTS[0], add_special_tokens=False))
 
         prefix, suffix = PROMPTS[0].split("")
         input_ids = tokenizer.encode(PROMPTS[0])
         self.assertEqual(input_ids, tokenizer.encode(prefix, suffix=suffix))
-        self.assertEqual(tokenizer.encode(prefix, suffix=suffix), tokenizer_fast.encode(prefix, suffix=suffix))
 
         # Adding suffix_first check for infilling tasks
         suffix_first_formatted_prompt = tokenizer.tokenize(PROMPTS[0], suffix_first=True)
-        self.assertEqual(suffix_first_formatted_prompt, tokenizer_fast.tokenize(PROMPTS[0], suffix_first=True))
         prefix, suffix = PROMPTS[0].split("")
         self.assertEqual(suffix_first_formatted_prompt, tokenizer.tokenize(prefix, suffix, suffix_first=True))
-        self.assertEqual(suffix_first_formatted_prompt, tokenizer_fast.tokenize(prefix, suffix, suffix_first=True))
 
         prefix, suffix = PROMPTS[0].split("")
         suffix_first_input_ids = tokenizer.encode(PROMPTS[0], suffix_first=True)
         self.assertEqual(suffix_first_input_ids, tokenizer.encode(prefix, suffix=suffix, suffix_first=True))
-        self.assertEqual(suffix_first_input_ids, tokenizer_fast.encode(prefix, suffix=suffix, suffix_first=True))
diff --git a/tests/models/codegen/test_tokenization_codegen.py b/tests/models/codegen/test_tokenization_codegen.py
index 5f850211d7ca..93bf710ea50a 100644
--- a/tests/models/codegen/test_tokenization_codegen.py
+++ b/tests/models/codegen/test_tokenization_codegen.py
@@ -1,324 +1,18 @@
-# Copyright 2022 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import json
-import os
-import re
 import unittest
 
-from transformers import CodeGenTokenizer, CodeGenTokenizerFast
-from transformers.models.codegen.tokenization_codegen import VOCAB_FILES_NAMES
-from transformers.testing_utils import require_tokenizers, slow
-
-from ...test_tokenization_common import TokenizerTesterMixin
+from tests.test_tokenization_common import TokenizerTesterMixin
+from transformers.models.codegen.tokenization_codegen import CodeGenTokenizer
+from transformers.testing_utils import (
+    require_tokenizers,
+)
 
 
 @require_tokenizers
 class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "Salesforce/codegen-350M-mono"
+    from_pretrained_id = ["Salesforce/codegen-350M-mono"]
     tokenizer_class = CodeGenTokenizer
-    rust_tokenizer_class = CodeGenTokenizerFast
-    test_rust_tokenizer = True
-    from_pretrained_kwargs = {"add_prefix_space": True}
-    test_seq2seq = False
-
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-
-        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
-        vocab = [
-            "l",
-            "o",
-            "w",
-            "e",
-            "r",
-            "s",
-            "t",
-            "i",
-            "d",
-            "n",
-            "\u0120",
-            "\u0120l",
-            "\u0120n",
-            "\u0120lo",
-            "\u0120low",
-            "er",
-            "\u0120lowest",
-            "\u0120newer",
-            "\u0120wider",
-            "",
-            "<|endoftext|>",
-        ]
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        cls.special_tokens_map = {"unk_token": ""}
-
-        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(cls.merges_file, "w", encoding="utf-8") as fp:
-            fp.write("\n".join(merges))
-
-    @classmethod
-    def get_tokenizer(cls, pretrained_name=None, **kwargs):
-        kwargs.update(cls.special_tokens_map)
-        pretrained_name = pretrained_name or cls.tmpdirname
-        return CodeGenTokenizer.from_pretrained(pretrained_name, **kwargs)
-
-    @classmethod
-    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
-        kwargs.update(cls.special_tokens_map)
-        pretrained_name = pretrained_name or cls.tmpdirname
-        return CodeGenTokenizerFast.from_pretrained(pretrained_name, **kwargs)
-
-    def get_input_output_texts(self, tokenizer):
-        input_text = "lower newer"
-        output_text = "lower newer"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = CodeGenTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
-        text = "lower newer"
-        bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
-        tokens = tokenizer.tokenize(text, add_prefix_space=True)
-        self.assertListEqual(tokens, bpe_tokens)
-
-        input_tokens = tokens + [tokenizer.unk_token]
-        input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
-
-    def test_rust_and_python_full_tokenizers(self):
-        if not self.test_rust_tokenizer:
-            self.skipTest(reason="test_rust_tokenizer is set to False")
-
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
-
-        sequence = "lower newer"
-
-        # Testing tokenization
-        tokens = tokenizer.tokenize(sequence, add_prefix_space=True)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        # Testing conversion to ids without special tokens
-        ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        # Testing conversion to ids with special tokens
-        rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
-        ids = tokenizer.encode(sequence, add_prefix_space=True)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-        # Testing the unknown token
-        input_tokens = tokens + [rust_tokenizer.unk_token]
-        input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
-        self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
-
-    @unittest.skip
-    def test_pretokenized_inputs(self, *args, **kwargs):
-        # It's very difficult to mix/test pretokenization with byte-level
-        # And get both CodeGen and Roberta to work at the same time (mostly an issue of adding a space before the string)
-        pass
-
-    def test_padding(self, max_length=15):
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
-
-                # Simple input
-                s = "This is a simple input"
-                s2 = ["This is a simple input 1", "This is a simple input 2"]
-                p = ("This is a simple input", "This is a pair")
-                p2 = [
-                    ("This is a simple input 1", "This is a simple input 2"),
-                    ("This is a simple pair 1", "This is a simple pair 2"),
-                ]
-
-                # Simple input tests
-                self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")
-
-                # Simple input
-                self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length")
-
-                # Simple input
-                self.assertRaises(
-                    ValueError,
-                    tokenizer_r.batch_encode_plus,
-                    s2,
-                    max_length=max_length,
-                    padding="max_length",
-                )
-
-                # Pair input
-                self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")
-
-                # Pair input
-                self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length")
-
-                # Pair input
-                self.assertRaises(
-                    ValueError,
-                    tokenizer_r.batch_encode_plus,
-                    p2,
-                    max_length=max_length,
-                    padding="max_length",
-                )
-
-    def test_padding_if_pad_token_set_slow(self):
-        tokenizer = CodeGenTokenizer.from_pretrained(self.tmpdirname, pad_token="")
-
-        # Simple input
-        s = "This is a simple input"
-        s2 = ["This is a simple input looooooooong", "This is a simple input"]
-        p = ("This is a simple input", "This is a pair")
-        p2 = [
-            ("This is a simple input loooooong", "This is a simple input"),
-            ("This is a simple pair loooooong", "This is a simple pair"),
-        ]
-
-        pad_token_id = tokenizer.pad_token_id
-
-        out_s = tokenizer(s, padding="max_length", max_length=30, return_tensors="np")
-        out_s2 = tokenizer(s2, padding=True, truncate=True, return_tensors="np")
-        out_p = tokenizer(*p, padding="max_length", max_length=60, return_tensors="np")
-        out_p2 = tokenizer(p2, padding=True, truncate=True, return_tensors="np")
-
-        # s
-        # test single string max_length padding
-        self.assertEqual(out_s["input_ids"].shape[-1], 30)
-        self.assertTrue(pad_token_id in out_s["input_ids"])
-        self.assertTrue(0 in out_s["attention_mask"])
-
-        # s2
-        # test automatic padding
-        self.assertEqual(out_s2["input_ids"].shape[-1], 33)
-        # long slice doesn't have padding
-        self.assertFalse(pad_token_id in out_s2["input_ids"][0])
-        self.assertFalse(0 in out_s2["attention_mask"][0])
-        # short slice does have padding
-        self.assertTrue(pad_token_id in out_s2["input_ids"][1])
-        self.assertTrue(0 in out_s2["attention_mask"][1])
-
-        # p
-        # test single pair max_length padding
-        self.assertEqual(out_p["input_ids"].shape[-1], 60)
-        self.assertTrue(pad_token_id in out_p["input_ids"])
-        self.assertTrue(0 in out_p["attention_mask"])
-
-        # p2
-        # test automatic padding pair
-        self.assertEqual(out_p2["input_ids"].shape[-1], 52)
-        # long slice pair doesn't have padding
-        self.assertFalse(pad_token_id in out_p2["input_ids"][0])
-        self.assertFalse(0 in out_p2["attention_mask"][0])
-        # short slice pair does have padding
-        self.assertTrue(pad_token_id in out_p2["input_ids"][1])
-        self.assertTrue(0 in out_p2["attention_mask"][1])
-
-    def test_add_bos_token_slow(self):
-        bos_token = "$$$"
-        tokenizer = CodeGenTokenizer.from_pretrained(self.tmpdirname, bos_token=bos_token, add_bos_token=True)
-
-        s = "This is a simple input"
-        s2 = ["This is a simple input 1", "This is a simple input 2"]
-
-        bos_token_id = tokenizer.bos_token_id
-
-        out_s = tokenizer(s)
-        out_s2 = tokenizer(s2)
-
-        self.assertEqual(out_s.input_ids[0], bos_token_id)
-        self.assertTrue(all(o[0] == bos_token_id for o in out_s2.input_ids))
-
-        decode_s = tokenizer.decode(out_s.input_ids)
-        decode_s2 = tokenizer.batch_decode(out_s2.input_ids)
-
-        self.assertTrue(decode_s.startswith(bos_token))
-        self.assertTrue(all(d.startswith(bos_token) for d in decode_s2))
-
-    @slow
-    def test_truncation(self):
-        tokenizer = CodeGenTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
-
-        text = "\nif len_a > len_b:\n    result = a\nelse:\n    result = b\n\n\n\n#"
-        expected_truncated_text = "\nif len_a > len_b:\n      result = a\nelse:\n      result = b"
-
-        input_ids = tokenizer.encode(text)
-        truncation_pattern = ["^#", re.escape("<|endoftext|>"), "^'''", '^"""', "\n\n\n"]
-        decoded_text = tokenizer.decode(input_ids, truncate_before_pattern=truncation_pattern)
-        self.assertEqual(decoded_text, expected_truncated_text)
-        # TODO @ArthurZ outputs of the fast tokenizer are different in this case, un-related to the PR
-
-    # tokenizer has no padding token
-    @unittest.skip(reason="tokenizer has no padding token")
-    def test_padding_different_model_input_name(self):
-        pass
-
-    @slow
-    def test_tokenizer_integration(self):
-        # Custom test since this tokenizer takes return_token_type_ids as an init argument for backward compatibility.
-
-        sequences = [
-            "Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
-            "general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
-            "Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained "
-            "models in 100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.",
-            "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly "
-            "conditioning on both left and right context in all layers.",
-            "The quick brown fox jumps over the lazy dog.",
-        ]
-
-        tokenizer_classes = [self.tokenizer_class]
-        if self.test_rust_tokenizer:
-            tokenizer_classes.append(self.rust_tokenizer_class)
-
-        # Test default case. i.e. return_token_type_ids is False.
-        for tokenizer_class in tokenizer_classes:
-            tokenizer = tokenizer_class.from_pretrained("Salesforce/codegen-350M-mono")
-
-            encoding = tokenizer(sequences)
-            decoded_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in encoding["input_ids"]]
-
-            # fmt: off
-            expected_encoding = {'input_ids': [[41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13], [13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13], [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501
-            # fmt: on
-
-            encoding_data = encoding.data
-            self.assertDictEqual(encoding_data, expected_encoding)
-
-            for expected, decoded in zip(sequences, decoded_sequences):
-                self.assertEqual(expected, decoded)
-
-        # Test return_token_type_ids is True case.
-        for tokenizer_class in tokenizer_classes:
-            tokenizer = tokenizer_class.from_pretrained("Salesforce/codegen-350M-mono", return_token_type_ids=True)
-
-            encoding = tokenizer(sequences)
-            decoded_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in encoding["input_ids"]]
-
-            # fmt: off
-            expected_encoding = {'input_ids': [[41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13], [13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13], [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501
-            # fmt: on
-
-            encoding_data = encoding.data
-            self.assertDictEqual(encoding_data, expected_encoding)
 
-            for expected, decoded in zip(sequences, decoded_sequences):
-                self.assertEqual(expected, decoded)
+    integration_expected_tokens = ['This', 'Ġis', 'Ġa', 'Ġtest', 'ĠðŁĺ', 'Ĭ', 'Ċ', 'I', 'Ġwas', 'Ġborn', 'Ġin', 'Ġ92', '000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġfals', 'é', '.', 'Ċ', 'çĶŁ', 'æ', '´', '»', 'çļĦ', 'çľ', 'Ł', 'è', '°', 'Ľ', 'æĺ¯', 'Ċ', 'Hi', '  ', 'Hello', 'Ċ', 'Hi', '   ', 'Hello', 'ĊĊ', 'Ġ', 'Ċ', '  ', 'Ċ', 'ĠHello', 'Ċ', '<', 's', '>', 'Ċ', 'hi', '<', 's', '>', 'there', 'Ċ', 'The', 'Ġfollowing', 'Ġstring', 'Ġshould', 'Ġbe', 'Ġproperly', 'Ġencoded', ':', 'ĠHello', '.', 'Ċ', 'But', 'Ġ', 'ird', 'Ġand', 'Ġ', 'à¸', 'Ľ', 'à¸', 'µ', '   ', 'ird', '   ', 'à¸', 'Ķ', 'Ċ', 'Hey', 'Ġhow', 'Ġare', 'Ġyou', 'Ġdoing']  # fmt: skip
+    integration_expected_token_ids = [1212, 318, 257, 1332, 30325, 232, 198, 40, 373, 4642, 287, 10190, 830, 11, 290, 428, 318, 27807, 2634, 13, 198, 37955, 162, 112, 119, 21410, 40367, 253, 164, 108, 249, 42468, 198, 17250, 50286, 15496, 198, 17250, 50285, 15496, 628, 220, 198, 50286, 198, 18435, 198, 27, 82, 29, 198, 5303, 27, 82, 29, 8117, 198, 464, 1708, 4731, 815, 307, 6105, 30240, 25, 18435, 13, 198, 1537, 220, 1447, 290, 220, 19567, 249, 19567, 113, 50285, 1447, 50285, 19567, 242, 198, 10814, 703, 389, 345, 1804]  # fmt: skip
+    expected_tokens_from_ids = ['This', 'Ġis', 'Ġa', 'Ġtest', 'ĠðŁĺ', 'Ĭ', 'Ċ', 'I', 'Ġwas', 'Ġborn', 'Ġin', 'Ġ92', '000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġfals', 'é', '.', 'Ċ', 'çĶŁ', 'æ', '´', '»', 'çļĦ', 'çľ', 'Ł', 'è', '°', 'Ľ', 'æĺ¯', 'Ċ', 'Hi', '  ', 'Hello', 'Ċ', 'Hi', '   ', 'Hello', 'ĊĊ', 'Ġ', 'Ċ', '  ', 'Ċ', 'ĠHello', 'Ċ', '<', 's', '>', 'Ċ', 'hi', '<', 's', '>', 'there', 'Ċ', 'The', 'Ġfollowing', 'Ġstring', 'Ġshould', 'Ġbe', 'Ġproperly', 'Ġencoded', ':', 'ĠHello', '.', 'Ċ', 'But', 'Ġ', 'ird', 'Ġand', 'Ġ', 'à¸', 'Ľ', 'à¸', 'µ', '   ', 'ird', '   ', 'à¸', 'Ķ', 'Ċ', 'Hey', 'Ġhow', 'Ġare', 'Ġyou', 'Ġdoing']  # fmt: skip
+    integration_expected_decoded_text = "This is a test 😊\nI was born in 92000, and this is falsé.\n生活的真谛是\nHi  Hello\nHi   Hello\n\n \n  \n Hello\n\nhithere\nThe following string should be properly encoded: Hello.\nBut ird and ปี   ird   ด\nHey how are you doing"
diff --git a/tests/models/cohere/test_tokenization_cohere.py b/tests/models/cohere/test_tokenization_cohere.py
index ce56bbeb6a84..b428c4fa9bca 100644
--- a/tests/models/cohere/test_tokenization_cohere.py
+++ b/tests/models/cohere/test_tokenization_cohere.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import copy
 import unittest
 
-from transformers import CohereTokenizerFast
+from transformers import CohereTokenizer
 from transformers.testing_utils import (
     require_jinja,
     require_tokenizers,
@@ -27,11 +26,7 @@
 
 @require_tokenizers
 class CohereTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    slow_tokenizer_class = None
-    rust_tokenizer_class = CohereTokenizerFast
-    tokenizer_class = CohereTokenizerFast
-    test_rust_tokenizer = True
-    test_slow_tokenizer = False
+    tokenizer_class = CohereTokenizer
     from_pretrained_vocab_key = "tokenizer_file"
     from_pretrained_id = "hf-internal-testing/tiny-random-CohereForCausalLM"
     special_tokens_map = {
@@ -41,34 +36,21 @@ class CohereTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         "pad_token": "",
     }
 
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-        tokenizer = CohereTokenizerFast.from_pretrained("hf-internal-testing/tiny-random-CohereForCausalLM")
-        tokenizer.save_pretrained(cls.tmpdirname)
-
-    @classmethod
-    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
-        _kwargs = copy.deepcopy(cls.special_tokens_map)
-        _kwargs.update(kwargs)
-        kwargs = _kwargs
-        pretrained_name = pretrained_name or cls.tmpdirname
-        return CohereTokenizerFast.from_pretrained(pretrained_name, **kwargs)
+    integration_expected_tokens = ['T', 'h', 'is', 'Ġis', 'Ġa', 'Ġt', 'est', 'Ġ', 'Ł', 'ĺ', 'Ĭ', 'Ċ', 'I', 'Ġwas', 'Ġb', 'orn', 'Ġin', 'Ġ', '9', '2', '0', '0', '0', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġf', 'als', 'é', '.', 'Ċ', 'ç', 'Ķ', 'Ł', 'æ', '´', '»', 'ç', 'ļ', 'Ħ', 'ç', 'ľ', 'Ł', 'è', '°', 'Ľ', 'æ', 'ĺ', '¯', 'Ċ', 'H', 'i', 'Ġ', 'ĠH', 'ell', 'o', 'Ċ', 'H', 'i', 'Ġ', 'Ġ', 'ĠH', 'ell', 'o', 'Ċ', 'Ċ', 'ĠĊ', 'Ġ', 'ĠĊ', 'ĠH', 'ell', 'o', 'Ċ', '<', 's', '>', 'Ċ', 'h', 'i', '<', 's', '>', 't', 'he', 're', 'Ċ', 'T', 'he', 'Ġfollow', 'ing', 'Ġst', 'r', 'ing', 'Ġsh', 'ould', 'Ġbe', 'Ġpro', 'per', 'ly', 'Ġen', 'c', 'od', 'ed', ':', 'ĠH', 'ell', 'o', '.', 'Ċ', 'B', 'ut', 'Ġ', 'ird', 'Ġand', 'Ġ', 'à', '¸', 'Ľ', 'à', '¸', 'µ', 'Ġ', 'Ġ', 'Ġ', 'ird', 'Ġ', 'Ġ', 'Ġ', 'à', '¸', 'Ķ', 'Ċ', 'H', 'ey', 'Ġh', 'ow', 'Ġare', 'Ġy', 'ou', 'Ġdo', 'ing']  # fmt: skip
+    integration_expected_token_ids = [60, 80, 223, 307, 204, 202, 333, 167, 199, 192, 178, 166, 49, 265, 227, 712, 229, 167, 33, 26, 24, 24, 24, 20, 233, 524, 307, 222, 632, 1018, 22, 166, 160, 188, 199, 159, 120, 127, 160, 194, 172, 160, 196, 199, 161, 116, 195, 159, 192, 115, 166, 48, 81, 167, 289, 420, 87, 166, 48, 81, 167, 167, 289, 420, 87, 166, 166, 259, 167, 259, 289, 420, 87, 166, 36, 91, 38, 166, 80, 81, 36, 91, 38, 92, 203, 210, 166, 60, 203, 765, 231, 292, 90, 231, 396, 458, 299, 348, 474, 271, 551, 75, 339, 212, 34, 289, 420, 87, 22, 166, 42, 293, 167, 813, 233, 167, 153, 124, 195, 153, 124, 121, 167, 167, 167, 813, 167, 167, 167, 153, 124, 188, 166, 48, 634, 240, 291, 394, 411, 243, 793, 231]  # fmt: skip
+    expected_tokens_from_ids = ['T', 'h', 'is', 'Ġis', 'Ġa', 'Ġt', 'est', 'Ġ', 'Ł', 'ĺ', 'Ĭ', 'Ċ', 'I', 'Ġwas', 'Ġb', 'orn', 'Ġin', 'Ġ', '9', '2', '0', '0', '0', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġf', 'als', 'é', '.', 'Ċ', 'ç', 'Ķ', 'Ł', 'æ', '´', '»', 'ç', 'ļ', 'Ħ', 'ç', 'ľ', 'Ł', 'è', '°', 'Ľ', 'æ', 'ĺ', '¯', 'Ċ', 'H', 'i', 'Ġ', 'ĠH', 'ell', 'o', 'Ċ', 'H', 'i', 'Ġ', 'Ġ', 'ĠH', 'ell', 'o', 'Ċ', 'Ċ', 'ĠĊ', 'Ġ', 'ĠĊ', 'ĠH', 'ell', 'o', 'Ċ', '<', 's', '>', 'Ċ', 'h', 'i', '<', 's', '>', 't', 'he', 're', 'Ċ', 'T', 'he', 'Ġfollow', 'ing', 'Ġst', 'r', 'ing', 'Ġsh', 'ould', 'Ġbe', 'Ġpro', 'per', 'ly', 'Ġen', 'c', 'od', 'ed', ':', 'ĠH', 'ell', 'o', '.', 'Ċ', 'B', 'ut', 'Ġ', 'ird', 'Ġand', 'Ġ', 'à', '¸', 'Ľ', 'à', '¸', 'µ', 'Ġ', 'Ġ', 'Ġ', 'ird', 'Ġ', 'Ġ', 'Ġ', 'à', '¸', 'Ķ', 'Ċ', 'H', 'ey', 'Ġh', 'ow', 'Ġare', 'Ġy', 'ou', 'Ġdo', 'ing']  # fmt: skip
+    integration_expected_decoded_text = "This is a test ���\nI was born in 92000, and this is falsé.\n生活的真谛是\nHi  Hello\nHi   Hello\n\n \n  \n Hello\n\nhithere\nThe following string should be properly encoded: Hello.\nBut ird and ปี   ird   ด\nHey how are you doing"
 
     # This gives CPU OOM on a single-gpu runner (~60G RAM). On multi-gpu runner, it has ~180G RAM which is enough.
     @require_torch_multi_accelerator
     def test_torch_encode_plus_sent_to_model(self):
         super().test_torch_encode_plus_sent_to_model()
 
-    @unittest.skip(reason="This needs a slow tokenizer. Cohere does not have one!")
-    def test_encode_decode_with_spaces(self):
-        return
-
     def test_encodings_from_sample_data(self):
         """
         Assert that the created tokens are the same than the hard-coded ones
         """
-        tokenizer = self.get_rust_tokenizer()
+        tokenizer = self.get_tokenizer()
 
         INPUT_SENTENCES = ["The quick brown fox<|END_OF_TURN_TOKEN|>", "jumps over the lazy dog<|END_OF_TURN_TOKEN|>"]
         TARGET_TOKENS = [
@@ -76,79 +58,50 @@ def test_encodings_from_sample_data(self):
             [5, 82, 332, 88, 91, 544, 206, 257, 930, 97, 239, 435, 8],
         ]
 
-        computed_tokens = tokenizer.batch_encode_plus(INPUT_SENTENCES)["input_ids"]
+        computed_tokens = tokenizer(INPUT_SENTENCES)["input_ids"]
         self.assertListEqual(TARGET_TOKENS, computed_tokens)
 
         INPUT_SENTENCES_W_BOS = [
             "The quick brown fox<|END_OF_TURN_TOKEN|>",
             "jumps over the lazy dog<|END_OF_TURN_TOKEN|>",
         ]
-        decoded_tokens = tokenizer.batch_decode(computed_tokens)
+        decoded_tokens = tokenizer.decode(computed_tokens)
         self.assertListEqual(decoded_tokens, INPUT_SENTENCES_W_BOS)
 
-    def test_padding(self, max_length=10):
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
-                # tokenizer_r.pad_token = None # Hotfixing padding = None
-                # Simple input
-                s = "This is a simple input"
-                s2 = ["This is a simple input 1", "This is a simple input 2"]
-                p = ("This is a simple input", "This is a pair")
-                p2 = [
-                    ("This is a simple input 1", "This is a simple input 2"),
-                    ("This is a simple pair 1", "This is a simple pair 2"),
-                ]
-
-                # Simple input tests
-                try:
-                    tokenizer_r.encode(s, max_length=max_length)
-                    tokenizer_r.encode_plus(s, max_length=max_length)
-
-                    tokenizer_r.batch_encode_plus(s2, max_length=max_length)
-                    tokenizer_r.encode(p, max_length=max_length)
-                    tokenizer_r.batch_encode_plus(p2, max_length=max_length)
-                except ValueError:
-                    self.fail("Cohere Tokenizer should be able to deal with padding")
-
-                tokenizer_r.pad_token = None  # Hotfixing padding = None
-                self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")
-
-                # Simple input
-                self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length")
-
-                # Simple input
-                self.assertRaises(
-                    ValueError,
-                    tokenizer_r.batch_encode_plus,
-                    s2,
-                    max_length=max_length,
-                    padding="max_length",
-                )
-
-                # Pair input
-                self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")
-
-                # Pair input
-                self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length")
-
-                # Pair input
-                self.assertRaises(
-                    ValueError,
-                    tokenizer_r.batch_encode_plus,
-                    p2,
-                    max_length=max_length,
-                    padding="max_length",
-                )
-
     def test_pretrained_model_lists(self):
         # No `max_model_input_sizes` for Cohere model
         self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1)
         self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1)
 
+    @require_jinja
+    def test_tokenization_for_chat(self):
+        tokenizer = self.get_tokenizer()
+        test_chats = [
+            [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
+            [
+                {"role": "system", "content": "You are a helpful chatbot."},
+                {"role": "user", "content": "Hello!"},
+                {"role": "assistant", "content": "Nice to meet you."},
+            ],
+        ]
+        tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
+        # fmt: off
+        expected_tokens = [
+            [5, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 59, 65, 59, 60, 45, 53, 71, 60, 55, 51, 45, 54, 99, 38, 65, 243, 394, 204, 336, 84, 88, 887, 374, 216, 74, 286, 22, 8, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 61, 59, 45, 58, 71, 60, 55, 51, 45, 54, 99, 38, 48, 420, 87, 9, 8],
+            [5, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 59, 65,
+            59, 60, 45, 53, 71, 60, 55, 51, 45, 54, 99, 38, 65, 243, 394, 204, 336, 84, 88, 887, 374, 216, 74, 286, 22, 8,
+            36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 61, 59,
+            45, 58, 71, 60, 55, 51, 45, 54, 99, 38, 48, 420, 87, 9, 8, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61,
+            58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 43, 48, 41, 60, 42, 55, 60, 71, 60, 55, 51, 45, 54, 99, 38,
+            54, 567, 235, 693, 276, 411, 243, 22, 8]
+        ]
+        # fmt: on
+        for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
+            self.assertListEqual(tokenized_chat, expected_tokens)
+
     @require_jinja
     def test_tokenization_for_tool_use(self):
-        tokenizer = self.get_rust_tokenizer()
+        tokenizer = self.get_tokenizer()
 
         conversation = [{"role": "user", "content": "Whats the biggest penguin in the world?"}]
 
@@ -219,7 +172,7 @@ def directly_answer() -> List[Dict]:
 
     @require_jinja
     def test_tokenization_for_grounded_generation(self):
-        tokenizer = self.get_rust_tokenizer()
+        tokenizer = self.get_tokenizer()
         conversation = [{"role": "user", "content": "Whats the biggest penguin in the world?"}]
 
         documents = [
@@ -264,8 +217,8 @@ def test_tokenization_for_grounded_generation(self):
         self.assertEqual(grounded_generation_prompt, expected_prompt)
 
     def test_add_prefix_space_fast(self):
-        tokenizer_w_prefix = self.get_rust_tokenizer(add_prefix_space=True)
-        tokenizer_wo_prefix = self.get_rust_tokenizer(add_prefix_space=False)
+        tokenizer_w_prefix = self.get_tokenizer(add_prefix_space=True)
+        tokenizer_wo_prefix = self.get_tokenizer(add_prefix_space=False)
         tokens_w_prefix = tokenizer_w_prefix.tokenize("Hey")
         tokens_wo_prefix = tokenizer_wo_prefix.tokenize("Hey")
         self.assertNotEqual(tokens_w_prefix, tokens_wo_prefix)
diff --git a/tests/models/cpmant/test_tokenization_cpmant.py b/tests/models/cpmant/test_tokenization_cpmant.py
index e04d9504cb86..82699f39cdda 100644
--- a/tests/models/cpmant/test_tokenization_cpmant.py
+++ b/tests/models/cpmant/test_tokenization_cpmant.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 import os
+import shutil
+import tempfile
 import unittest
 
 from transformers.models.cpmant.tokenization_cpmant import VOCAB_FILES_NAMES, CpmAntTokenizer
@@ -31,6 +33,9 @@ class CPMAntTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     def setUpClass(cls):
         super().setUpClass()
 
+        old_tmpdirname = cls.tmpdirname
+        cls.tmpdirname = tempfile.mkdtemp()
+
         vocab_tokens = [
             "",
             "",
@@ -53,6 +58,8 @@ def setUpClass(cls):
         with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
 
+        shutil.rmtree(old_tmpdirname, ignore_errors=True)
+
     @tooslow
     def test_pre_tokenization(self):
         tokenizer = CpmAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
diff --git a/tests/models/deberta/test_tokenization_deberta.py b/tests/models/deberta/test_tokenization_deberta.py
index c55e995ab500..c5ff48e5a284 100644
--- a/tests/models/deberta/test_tokenization_deberta.py
+++ b/tests/models/deberta/test_tokenization_deberta.py
@@ -13,158 +13,136 @@
 # limitations under the License.
 
 
-import json
-import os
 import unittest
 
-from transformers import DebertaTokenizer, DebertaTokenizerFast
-from transformers.models.deberta.tokenization_deberta import VOCAB_FILES_NAMES
-from transformers.testing_utils import slow
+from transformers import DebertaTokenizer
 
 from ...test_tokenization_common import TokenizerTesterMixin
 
 
 class DebertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "microsoft/deberta-base"
+    from_pretrained_id = ["microsoft/deberta-base"]
     tokenizer_class = DebertaTokenizer
-    test_rust_tokenizer = True
-    rust_tokenizer_class = DebertaTokenizerFast
-
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-
-        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
-        vocab = [
-            "l",
-            "o",
-            "w",
-            "e",
-            "r",
-            "s",
-            "t",
-            "i",
-            "d",
-            "n",
-            "\u0120",
-            "\u0120l",
-            "\u0120n",
-            "\u0120lo",
-            "\u0120low",
-            "er",
-            "\u0120lowest",
-            "\u0120newer",
-            "\u0120wider",
-            "[UNK]",
-        ]
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        cls.special_tokens_map = {"unk_token": "[UNK]"}
-
-        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(cls.merges_file, "w", encoding="utf-8") as fp:
-            fp.write("\n".join(merges))
-
-    @classmethod
-    def get_tokenizer(cls, pretrained_name=None, **kwargs):
-        kwargs.update(cls.special_tokens_map)
-        pretrained_name = pretrained_name or cls.tmpdirname
-        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-
-    def get_input_output_texts(self, tokenizer):
-        input_text = "lower newer"
-        output_text = "lower newer"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = self.get_tokenizer()
-        text = "lower newer"
-        bpe_tokens = ["l", "o", "w", "er", "\u0120", "n", "e", "w", "er"]
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, bpe_tokens)
-
-        input_tokens = tokens + [tokenizer.unk_token]
-        input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19]
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
-
-    def test_token_type_ids(self):
-        tokenizer = self.get_tokenizer()
-        tokd = tokenizer("Hello", "World")
-        expected_token_type_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
-        self.assertListEqual(tokd["token_type_ids"], expected_token_type_ids)
-
-    @slow
-    def test_sequence_builders(self):
-        tokenizer = self.tokenizer_class.from_pretrained("microsoft/deberta-base")
-
-        text = tokenizer.encode("sequence builders", add_special_tokens=False)
-        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
-
-        encoded_text_from_decode = tokenizer.encode(
-            "sequence builders", add_special_tokens=True, add_prefix_space=False
-        )
-        encoded_pair_from_decode = tokenizer.encode(
-            "sequence builders", "multi-sequence build", add_special_tokens=True, add_prefix_space=False
-        )
-
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
-        assert encoded_sentence == encoded_text_from_decode
-        assert encoded_pair == encoded_pair_from_decode
-
-    @slow
-    def test_tokenizer_integration(self):
-        tokenizer_classes = [self.tokenizer_class]
-        if self.test_rust_tokenizer:
-            tokenizer_classes.append(self.rust_tokenizer_class)
-
-        for tokenizer_class in tokenizer_classes:
-            tokenizer = tokenizer_class.from_pretrained("microsoft/deberta-base")
-
-            sequences = [
-                "ALBERT: A Lite BERT for Self-supervised Learning of Language Representations",
-                "ALBERT incorporates two parameter reduction techniques",
-                "The first one is a factorized embedding parameterization. By decomposing the large vocabulary"
-                " embedding matrix into two small matrices, we separate the size of the hidden layers from the size of"
-                " vocabulary embedding.",
-            ]
-
-            encoding = tokenizer(sequences, padding=True)
-            decoded_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in encoding["input_ids"]]
-
-            # fmt: off
-            expected_encoding = {
-                'input_ids': [
-                    [1, 2118, 11126, 565, 35, 83, 25191, 163, 18854, 13, 12156, 12, 16101, 25376, 13807, 9, 22205, 27893, 1635, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-                    [1, 2118, 11126, 565, 24536, 80, 43797, 4878, 7373, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-                    [1, 133, 78, 65, 16, 10, 3724, 1538, 33183, 11303, 43797, 1938, 4, 870, 24165, 29105, 5, 739, 32644, 33183, 11303, 36173, 88, 80, 650, 7821, 45940, 6, 52, 2559, 5, 1836, 9, 5, 7397, 13171, 31, 5, 1836, 9, 32644, 33183, 11303, 4, 2]
-                ],
-                'token_type_ids': [
-                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
-                ],
-                'attention_mask': [
-                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
-                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
-                ]
-            }
-            # fmt: on
-
-            expected_decoded_sequence = [
-                "ALBERT: A Lite BERT for Self-supervised Learning of Language Representations",
-                "ALBERT incorporates two parameter reduction techniques",
-                "The first one is a factorized embedding parameterization. By decomposing the large vocabulary"
-                " embedding matrix into two small matrices, we separate the size of the hidden layers from the size of"
-                " vocabulary embedding.",
-            ]
-
-            self.assertDictEqual(encoding.data, expected_encoding)
-
-            for expected, decoded in zip(expected_decoded_sequence, decoded_sequences):
-                self.assertEqual(expected, decoded)
+    integration_expected_tokens = ['This', 'Ġis', 'Ġa', 'Ġtest', 'ĠðŁĺ', 'Ĭ', 'Ċ', 'I', 'Ġwas', 'Ġborn', 'Ġin', 'Ġ92', '000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġfals', 'é', '.', 'Ċ', 'çĶŁ', 'æ', '´', '»', 'çļĦ', 'çľ', 'Ł', 'è', '°', 'Ľ', 'æĺ¯', 'Ċ', 'Hi', 'Ġ', 'ĠHello', 'Ċ', 'Hi', 'Ġ', 'Ġ', 'ĠHello', 'ĊĊ', 'Ġ', 'Ċ', 'Ġ', 'Ġ', 'Ċ', 'ĠHello', 'Ċ', '<', 's', '>', 'Ċ', 'hi', '<', 's', '>', 'there', 'Ċ', 'The', 'Ġfollowing', 'Ġstring', 'Ġshould', 'Ġbe', 'Ġproperly', 'Ġencoded', ':', 'ĠHello', '.', 'Ċ', 'But', 'Ġ', 'ird', 'Ġand', 'Ġ', 'à¸', 'Ľ', 'à¸', 'µ', 'Ġ', 'Ġ', 'Ġ', 'ird', 'Ġ', 'Ġ', 'Ġ', 'à¸', 'Ķ', 'Ċ', 'Hey', 'Ġhow', 'Ġare', 'Ġyou', 'Ġdoing']  # fmt: skip
+    integration_expected_token_ids = [713, 16, 10, 1296, 17841, 27969, 50118, 100, 21, 2421, 11, 8403, 151, 6, 8, 42, 16, 22461, 1140, 4, 50118, 48998, 37127, 20024, 2023, 44574, 49122, 4333, 36484, 7487, 3726, 48569, 50118, 30086, 1437, 20920, 50118, 30086, 1437, 1437, 20920, 50140, 1437, 50118, 1437, 1437, 50118, 20920, 50118, 41552, 29, 15698, 50118, 3592, 41552, 29, 15698, 8585, 50118, 133, 511, 6755, 197, 28, 5083, 45320, 35, 20920, 4, 50118, 1708, 1437, 8602, 8, 1437, 24107, 3726, 24107, 8906, 1437, 1437, 1437, 8602, 1437, 1437, 1437, 24107, 10674, 50118, 13368, 141, 32, 47, 608]  # fmt: skip
+    expected_tokens_from_ids = ['This', 'Ġis', 'Ġa', 'Ġtest', 'ĠðŁĺ', 'Ĭ', 'Ċ', 'I', 'Ġwas', 'Ġborn', 'Ġin', 'Ġ92', '000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġfals', 'é', '.', 'Ċ', 'çĶŁ', 'æ', '´', '»', 'çļĦ', 'çľ', 'Ł', 'è', '°', 'Ľ', 'æĺ¯', 'Ċ', 'Hi', 'Ġ', 'ĠHello', 'Ċ', 'Hi', 'Ġ', 'Ġ', 'ĠHello', 'ĊĊ', 'Ġ', 'Ċ', 'Ġ', 'Ġ', 'Ċ', 'ĠHello', 'Ċ', '<', 's', '>', 'Ċ', 'hi', '<', 's', '>', 'there', 'Ċ', 'The', 'Ġfollowing', 'Ġstring', 'Ġshould', 'Ġbe', 'Ġproperly', 'Ġencoded', ':', 'ĠHello', '.', 'Ċ', 'But', 'Ġ', 'ird', 'Ġand', 'Ġ', 'à¸', 'Ľ', 'à¸', 'µ', 'Ġ', 'Ġ', 'Ġ', 'ird', 'Ġ', 'Ġ', 'Ġ', 'à¸', 'Ķ', 'Ċ', 'Hey', 'Ġhow', 'Ġare', 'Ġyou', 'Ġdoing']  # fmt: skip
+    integration_expected_decoded_text = "This is a test 😊\nI was born in 92000, and this is falsé.\n生活的真谛是\nHi  Hello\nHi   Hello\n\n \n  \n Hello\n\nhithere\nThe following string should be properly encoded: Hello.\nBut ird and ปี   ird   ด\nHey how are you doing"
+
+    # @classmethod
+    # def setUpClass(cls):
+    #     super().setUpClass()
+
+    #     # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
+    #     vocab = [
+    #         "l",
+    #         "o",
+    #         "w",
+    #         "e",
+    #         "r",
+    #         "s",
+    #         "t",
+    #         "i",
+    #         "d",
+    #         "n",
+    #         "\u0120",
+    #         "\u0120l",
+    #         "\u0120n",
+    #         "\u0120lo",
+    #         "\u0120low",
+    #         "er",
+    #         "\u0120lowest",
+    #         "\u0120newer",
+    #         "\u0120wider",
+    #         "[UNK]",
+    #     ]
+    #     vocab_tokens = dict(zip(vocab, range(len(vocab))))
+    #     # merges as list of tuples, matching what load_merges returns
+    #     merges = [("\u0120", "l"), ("\u0120l", "o"), ("\u0120lo", "w"), ("e", "r")]
+    #     cls.special_tokens_map = {"unk_token": "[UNK]"}
+
+    #     cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+    #     cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
+    #     with open(cls.vocab_file, "w", encoding="utf-8") as fp:
+    #         fp.write(json.dumps(vocab_tokens) + "\n")
+    #     with open(cls.merges_file, "w", encoding="utf-8") as fp:
+    #         # Write merges file in the standard format
+    #         fp.write("#version: 0.2\n")
+    #         fp.write("\n".join([f"{a} {b}" for a, b in merges]))
+
+    #     tokenizer = DebertaTokenizer(vocab=vocab_tokens, merges=merges)
+    #     tokenizer.save_pretrained(cls.tmpdirname)
+
+    #     cls.tokenizers = [tokenizer]
+
+    # @classmethod
+    # def get_tokenizer(cls, pretrained_name=None, **kwargs):
+    #     kwargs.update(cls.special_tokens_map)
+    #     pretrained_name = pretrained_name or cls.tmpdirname
+    #     return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
+
+    # def get_input_output_texts(self, tokenizer):
+    #     input_text = "lower newer"
+    #     output_text = "lower newer"
+    #     return input_text, output_text
+
+    # def test_full_tokenizer(self):
+    #     tokenizer = self.get_tokenizer()
+    #     text = "lower newer"
+    #     bpe_tokens = ["l", "o", "w", "er", "\u0120", "n", "e", "w", "er"]
+    #     tokens = tokenizer.tokenize(text)
+    #     self.assertListEqual(tokens, bpe_tokens)
+
+    #     input_tokens = tokens + [tokenizer.unk_token]
+    #     input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19]
+    #     self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+
+    # def test_tokenizer_integration(self):
+    #     tokenizer_classes = [self.tokenizer_class]
+    #     if self.test_rust_tokenizer:
+    #         tokenizer_classes.append(self.rust_tokenizer_class)
+
+    #     for tokenizer_class in tokenizer_classes:
+    #         tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")
+
+    #         sequences = [
+    #             "ALBERT: A Lite BERT for Self-supervised Learning of Language Representations",
+    #             "ALBERT incorporates two parameter reduction techniques",
+    #             "The first one is a factorized embedding parameterization. By decomposing the large vocabulary"
+    #             " embedding matrix into two small matrices, we separate the size of the hidden layers from the size of"
+    #             " vocabulary embedding.",
+    #         ]
+    #         encoding = tokenizer(sequences, padding=True)
+    #         decoded_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in encoding["input_ids"]]
+
+    #         # fmt: off
+    #         expected_encoding = {
+    #             'input_ids': [
+    #                 [1, 2118, 11126, 565, 35, 83, 25191, 163, 18854, 13, 12156, 12, 16101, 25376, 13807, 9, 22205, 27893, 1635, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+    #                 [1, 2118, 11126, 565, 24536, 80, 43797, 4878, 7373, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+    #                 [1, 133, 78, 65, 16, 10, 3724, 1538, 33183, 11303, 43797, 1938, 4, 870, 24165, 29105, 5, 739, 32644, 33183, 11303, 36173, 88, 80, 650, 7821, 45940, 6, 52, 2559, 5, 1836, 9, 5, 7397, 13171, 31, 5, 1836, 9, 32644, 33183, 11303, 4, 2]
+    #             ],
+    #             'token_type_ids': [
+    #                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+    #                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+    #                 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+    #             ],
+    #             'attention_mask': [
+    #                 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+    #                 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+    #                 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+    #             ]
+    #         }
+    #         # fmt: on
+
+    #         expected_decoded_sequence = [
+    #             "ALBERT: A Lite BERT for Self-supervised Learning of Language Representations",
+    #             "ALBERT incorporates two parameter reduction techniques",
+    #             "The first one is a factorized embedding parameterization. By decomposing the large vocabulary"
+    #             " embedding matrix into two small matrices, we separate the size of the hidden layers from the size of"
+    #             " vocabulary embedding.",
+    #         ]
+
+    #         #  self.assertDictEqual(encoding.data, expected_encoding)
+
+    #         for expected, decoded in zip(expected_decoded_sequence, decoded_sequences):
+    #             self.assertEqual(expected, decoded)
diff --git a/tests/models/deberta_v2/test_tokenization_deberta_v2.py b/tests/models/deberta_v2/test_tokenization_deberta_v2.py
index e629f279249c..1b4fc5e79386 100644
--- a/tests/models/deberta_v2/test_tokenization_deberta_v2.py
+++ b/tests/models/deberta_v2/test_tokenization_deberta_v2.py
@@ -14,8 +14,9 @@
 
 import unittest
 
-from transformers import DebertaV2Tokenizer, DebertaV2TokenizerFast
-from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
+from transformers import DebertaV2Tokenizer
+from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers
+from transformers.tokenization_utils_sentencepiece import SentencePieceExtractor
 
 from ...test_tokenization_common import TokenizerTesterMixin
 
@@ -28,40 +29,11 @@
 class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_id = "microsoft/deberta-v2-xlarge"
     tokenizer_class = DebertaV2Tokenizer
-    rust_tokenizer_class = DebertaV2TokenizerFast
-    test_sentencepiece = True
-    test_sentencepiece_ignore_case = True
 
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-
-        # We have a SentencePiece fixture for testing
-        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="")
-        tokenizer.save_pretrained(cls.tmpdirname)
-
-    def get_input_output_texts(self, tokenizer):
-        input_text = "this is a test"
-        output_text = "this is a test"
-        return input_text, output_text
-
-    def test_convert_token_and_id(self):
-        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
-        token = ""
-        token_id = 0
-
-        self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
-        self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
-
-    def test_get_vocab(self):
-        vocab_keys = list(self.get_tokenizer().get_vocab().keys())
-        self.assertEqual(vocab_keys[0], "")
-        self.assertEqual(vocab_keys[1], "")
-        self.assertEqual(vocab_keys[-1], "[PAD]")
-        self.assertEqual(len(vocab_keys), 30_001)
-
-    def test_vocab_size(self):
-        self.assertEqual(self.get_tokenizer().vocab_size, 30_000)
+    integration_expected_tokens = ['▁This', '▁is', '▁a', '▁test', '▁😊', '▁I', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fal', 's', 'é', '.', '▁', '生', '活', '的', '真', '谛', '是', '▁Hi', '▁Hello', '▁Hi', '▁Hello', '▁Hello', '▁<', 's', '>', '▁hi', '<', 's', '>', 'there', '▁The', '▁following', '▁string', '▁should', '▁be', '▁properly', '▁encoded', ':', '▁Hello', '.', '▁But', '▁i', 'rd', '▁and', '▁', 'ป', 'ี', '▁i', 'rd', '▁', 'ด', '▁Hey', '▁how', '▁are', '▁you', '▁doing']  # fmt: skip
+    integration_expected_token_ids = [69, 13, 10, 711, 112100, 16, 28, 1022, 11, 728, 16135, 6, 7, 32, 13, 46426, 12, 5155, 4, 250, 40289, 102080, 8593, 98226, 3, 29213, 2302, 4800, 2302, 4800, 4800, 2318, 12, 2259, 8133, 9475, 12, 2259, 7493, 23, 524, 3664, 146, 26, 2141, 23085, 43, 4800, 4, 167, 306, 1893, 7, 250, 86501, 70429, 306, 1893, 250, 51857, 4839, 100, 24, 17, 381]  # fmt: skip
+    expected_tokens_from_ids = ['▁This', '▁is', '▁a', '▁test', '▁😊', '▁I', '▁was', '▁born', '▁in', '▁9', '2000', ',', '▁and', '▁this', '▁is', '▁fal', 's', 'é', '.', '▁', '生', '活', '的', '真', '[UNK]', '是', '▁Hi', '▁Hello', '▁Hi', '▁Hello', '▁Hello', '▁<', 's', '>', '▁hi', '<', 's', '>', 'there', '▁The', '▁following', '▁string', '▁should', '▁be', '▁properly', '▁encoded', ':', '▁Hello', '.', '▁But', '▁i', 'rd', '▁and', '▁', 'ป', 'ี', '▁i', 'rd', '▁', 'ด', '▁Hey', '▁how', '▁are', '▁you', '▁doing']  # fmt: skip
+    integration_expected_decoded_text = "This is a test 😊 I was born in 92000, and this is falsé. 生活的真[UNK]是 Hi Hello Hi Hello Hello  hithere The following string should be properly encoded: Hello. But ird and ปี ird ด Hey how are you doing"
 
     def test_do_lower_case(self):
         # fmt: off
@@ -69,194 +41,79 @@ def test_do_lower_case(self):
         tokens_target = ["▁hello", "!", "how", "▁are", "▁you", "?"]
         # fmt: on
 
-        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=True)
+        extractor = SentencePieceExtractor(SAMPLE_VOCAB)
+        vocab, vocab_scores, merges = extractor.extract()
+        tokenizer = DebertaV2Tokenizer(vocab=vocab_scores, unk_token="", do_lower_case=True)
         tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
 
         self.assertListEqual(tokens, tokens_target)
 
-        rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, unk_token="", do_lower_case=True)
-        rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
-
-        self.assertListEqual(rust_tokens, tokens_target)
-
-    @unittest.skip(reason="There is an inconsistency between slow and fast tokenizer due to a bug in the fast one.")
-    def test_sentencepiece_tokenize_and_convert_tokens_to_string(self):
-        pass
-
-    @unittest.skip(reason="There is an inconsistency between slow and fast tokenizer due to a bug in the fast one.")
-    def test_sentencepiece_tokenize_and_decode(self):
-        pass
-
     def test_split_by_punct(self):
         # fmt: off
         sequence = "I was born in 92000, and this is falsé!"
         tokens_target = ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", "!", ]
         # fmt: on
 
-        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", split_by_punct=True)
+        extractor = SentencePieceExtractor(SAMPLE_VOCAB)
+        vocab, vocab_scores, merges = extractor.extract()
+        tokenizer = DebertaV2Tokenizer(vocab=vocab_scores, merges=merges, unk_token="", split_by_punct=True)
         tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
 
         self.assertListEqual(tokens, tokens_target)
 
-        rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, unk_token="", split_by_punct=True)
-        rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
-
-        self.assertListEqual(rust_tokens, tokens_target)
-
     def test_do_lower_case_split_by_punct(self):
         # fmt: off
         sequence = "I was born in 92000, and this is falsé!"
         tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", "!", ]
         # fmt: on
 
-        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=True, split_by_punct=True)
+        extractor = SentencePieceExtractor(SAMPLE_VOCAB)
+        vocab, vocab_scores, merges = extractor.extract()
+        tokenizer = DebertaV2Tokenizer(
+            vocab=vocab_scores, merges=merges, unk_token="", do_lower_case=True, split_by_punct=True
+        )
         tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
         self.assertListEqual(tokens, tokens_target)
 
-        rust_tokenizer = DebertaV2TokenizerFast(
-            SAMPLE_VOCAB, unk_token="", do_lower_case=True, split_by_punct=True
-        )
-        rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
-        self.assertListEqual(rust_tokens, tokens_target)
-
     def test_do_lower_case_split_by_punct_false(self):
         # fmt: off
         sequence = "I was born in 92000, and this is falsé!"
         tokens_target = ["▁i", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "!", ]
         # fmt: on
 
-        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=True, split_by_punct=False)
+        extractor = SentencePieceExtractor(SAMPLE_VOCAB)
+        vocab, vocab_scores, merges = extractor.extract()
+        tokenizer = DebertaV2Tokenizer(
+            vocab=vocab_scores, merges=merges, unk_token="", do_lower_case=True, split_by_punct=False
+        )
         tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
 
         self.assertListEqual(tokens, tokens_target)
 
-        rust_tokenizer = DebertaV2TokenizerFast(
-            SAMPLE_VOCAB, unk_token="", do_lower_case=True, split_by_punct=False
-        )
-        rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
-
-        self.assertListEqual(rust_tokens, tokens_target)
-
     def test_do_lower_case_false_split_by_punct(self):
         # fmt: off
         sequence = "I was born in 92000, and this is falsé!"
         tokens_target = ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", "▁", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "▁", "!", ]
         # fmt: on
-
-        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=False, split_by_punct=True)
+        extractor = SentencePieceExtractor(SAMPLE_VOCAB)
+        vocab, vocab_scores, merges = extractor.extract()
+        tokenizer = DebertaV2Tokenizer(
+            vocab=vocab_scores, merges=merges, unk_token="", do_lower_case=False, split_by_punct=True
+        )
         tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
 
         self.assertListEqual(tokens, tokens_target)
 
-        rust_tokenizer = DebertaV2TokenizerFast(
-            SAMPLE_VOCAB, unk_token="", do_lower_case=False, split_by_punct=True
-        )
-        rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
-
-        self.assertListEqual(rust_tokens, tokens_target)
-
     def test_do_lower_case_false_split_by_punct_false(self):
         # fmt: off
         sequence = " \tHeLLo!how  \n Are yoU?  "
         tokens_target = ["▁", "", "e", "", "o", "!", "how", "▁", "", "re", "▁yo", "", "?"]
         # fmt: on
-
-        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", do_lower_case=False, split_by_punct=False)
-        tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
-
-        self.assertListEqual(tokens, tokens_target)
-
-        rust_tokenizer = DebertaV2TokenizerFast(
-            SAMPLE_VOCAB, unk_token="", do_lower_case=False, split_by_punct=False
+        extractor = SentencePieceExtractor(SAMPLE_VOCAB)
+        vocab, vocab_scores, merges = extractor.extract()
+        tokenizer = DebertaV2Tokenizer(
+            vocab=vocab_scores, merges=merges, unk_token="", do_lower_case=False, split_by_punct=False
         )
-        rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
-
-        self.assertListEqual(rust_tokens, tokens_target)
-
-    def test_rust_and_python_full_tokenizers(self):
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer()
-
-        sequence = "I was born in 92000, and this is falsé!"
-
         tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
-        rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
-        self.assertListEqual(tokens, rust_tokens)
 
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        rust_tokenizer = self.get_rust_tokenizer()
-        ids = tokenizer.encode(sequence)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-    def test_full_tokenizer(self):
-        sequence = "This is a test"
-        ids_target = [13, 1, 4398, 25, 21, 1289]
-        tokens_target = ["▁", "T", "his", "▁is", "▁a", "▁test"]
-        back_tokens_target = ["▁", "", "his", "▁is", "▁a", "▁test"]
-
-        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB, unk_token="", keep_accents=True)
-        rust_tokenizer = DebertaV2TokenizerFast(SAMPLE_VOCAB, unk_token="", keep_accents=True)
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, ids_target)
-        tokens = tokenizer.tokenize(sequence)
         self.assertListEqual(tokens, tokens_target)
-        back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(back_tokens, back_tokens_target)
-
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(rust_ids, ids_target)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(rust_tokens, tokens_target)
-        rust_back_tokens = rust_tokenizer.convert_ids_to_tokens(rust_ids)
-        self.assertListEqual(rust_back_tokens, back_tokens_target)
-
-        # fmt: off
-        sequence = "I was born in 92000, and this is falsé!"
-        ids_target = [13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 187]
-        tokens_target = ["▁", "I", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "!", ]
-        back_tokens_target = ["▁", "", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "", "!", ]
-        # fmt: on
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, ids_target)
-        tokens = tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, tokens_target)
-        back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(back_tokens, back_tokens_target)
-
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(rust_ids, ids_target)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(rust_tokens, tokens_target)
-        rust_back_tokens = rust_tokenizer.convert_ids_to_tokens(rust_ids)
-        self.assertListEqual(rust_back_tokens, back_tokens_target)
-
-    def test_sequence_builders(self):
-        tokenizer = DebertaV2Tokenizer(SAMPLE_VOCAB)
-
-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
-
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
-        self.assertEqual([tokenizer.cls_token_id] + text + [tokenizer.sep_token_id], encoded_sentence)
-        self.assertEqual(
-            [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [tokenizer.sep_token_id],
-            encoded_pair,
-        )
-
-    @slow
-    def test_tokenizer_integration(self):
-        expected_encoding = {'input_ids': [[1, 39867, 36, 19390, 486, 27, 35052, 81436, 18, 60685, 1225, 7, 35052, 81436, 18, 9367, 16899, 18, 15937, 53, 594, 773, 18, 16287, 30465, 36, 15937, 6, 41139, 38, 36979, 60763, 191, 6, 34132, 99, 6, 50538, 390, 43230, 6, 34132, 2779, 20850, 14, 699, 1072, 1194, 36, 382, 10901, 53, 7, 699, 1072, 2084, 36, 20422, 630, 53, 19, 105, 3049, 1896, 1053, 16899, 1506, 11, 37978, 4243, 7, 1237, 31869, 200, 16566, 654, 6, 35052, 81436, 7, 55630, 13593, 4, 2], [1, 26, 15011, 13, 667, 8, 1053, 18, 23611, 1237, 72356, 12820, 34, 104134, 1209, 35, 13313, 6627, 21, 202, 347, 7, 164, 2399, 11, 46, 4485, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 5, 1232, 2864, 15785, 14951, 105, 5, 8581, 1250, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # fmt: skip
-
-        self.tokenizer_integration_test_util(
-            expected_encoding=expected_encoding,
-            model_name="microsoft/deberta-v2-xlarge",
-            revision="ad6e42c1532ddf3a15c39246b63f5559d558b670",
-        )
diff --git a/tests/models/dia/test_tokenization_dia.py b/tests/models/dia/test_tokenization_dia.py
index 4ade611f68e8..3fb23cad8f04 100644
--- a/tests/models/dia/test_tokenization_dia.py
+++ b/tests/models/dia/test_tokenization_dia.py
@@ -29,6 +29,7 @@
 class DiaTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = DiaTokenizer
     test_rust_tokenizer = False
+    from_pretrained_id = "AntonV/Dia-1.6B"
 
     @classmethod
     def setUpClass(cls):
diff --git a/tests/models/distilbert/test_tokenization_distilbert.py b/tests/models/distilbert/test_tokenization_distilbert.py
index cf92b48a3d52..12ec82b611ca 100644
--- a/tests/models/distilbert/test_tokenization_distilbert.py
+++ b/tests/models/distilbert/test_tokenization_distilbert.py
@@ -13,30 +13,28 @@
 # limitations under the License.
 
 
-from transformers import DistilBertTokenizer, DistilBertTokenizerFast
-from transformers.testing_utils import require_tokenizers, slow
+from transformers import AutoTokenizer
+from transformers.models.distilbert.tokenization_distilbert import DistilBertTokenizer
+from transformers.testing_utils import require_tokenizers
 
 from ..bert import test_tokenization_bert
 
 
+# TODO: Ita remove this test file?
 @require_tokenizers
 class DistilBertTokenizationTest(test_tokenization_bert.BertTokenizationTest):
     tokenizer_class = DistilBertTokenizer
-    rust_tokenizer_class = DistilBertTokenizerFast
-    test_rust_tokenizer = True
+    rust_tokenizer_class = DistilBertTokenizer
+    test_rust_tokenizer = False
     from_pretrained_id = "distilbert/distilbert-base-uncased"
 
-    @slow
-    def test_sequence_builders(self):
-        tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
 
-        text = tokenizer.encode("sequence builders", add_special_tokens=False)
-        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
+        from_pretrained_id = "distilbert/distilbert-base-uncased"
 
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
+        tok_auto = AutoTokenizer.from_pretrained(from_pretrained_id)
+        tok_auto.save_pretrained(cls.tmpdirname)
 
-        assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
-        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [
-            tokenizer.sep_token_id
-        ]
+        cls.tokenizers = [tok_auto]
diff --git a/tests/models/dpr/test_tokenization_dpr.py b/tests/models/dpr/test_tokenization_dpr.py
index ab1401b29519..97c2d95443d2 100644
--- a/tests/models/dpr/test_tokenization_dpr.py
+++ b/tests/models/dpr/test_tokenization_dpr.py
@@ -48,6 +48,7 @@ class DPRReaderTokenizationTest(test_tokenization_bert.BertTokenizationTest):
     tokenizer_class = DPRReaderTokenizer
     rust_tokenizer_class = DPRReaderTokenizerFast
     test_rust_tokenizer = True
+    test_seq2seq = False
     from_pretrained_id = "facebook/dpr-ctx_encoder-single-nq-base"
 
     @slow
diff --git a/tests/models/electra/test_tokenization_electra.py b/tests/models/electra/test_tokenization_electra.py
deleted file mode 100644
index f2ac66e21ae9..000000000000
--- a/tests/models/electra/test_tokenization_electra.py
+++ /dev/null
@@ -1,336 +0,0 @@
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import unittest
-
-from transformers import ElectraTokenizerFast
-from transformers.models.electra.tokenization_electra import (
-    VOCAB_FILES_NAMES,
-    BasicTokenizer,
-    ElectraTokenizer,
-    WordpieceTokenizer,
-    _is_control,
-    _is_punctuation,
-    _is_whitespace,
-)
-from transformers.testing_utils import require_tokenizers, slow
-
-from ...test_tokenization_common import TokenizerTesterMixin, filter_non_english
-
-
-@require_tokenizers
-class ElectraTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "google/electra-small-generator"
-    tokenizer_class = ElectraTokenizer
-    rust_tokenizer_class = ElectraTokenizerFast
-    test_rust_tokenizer = True
-    space_between_special_tokens = True
-    from_pretrained_filter = filter_non_english
-
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-
-        vocab_tokens = [
-            "[UNK]",
-            "[CLS]",
-            "[SEP]",
-            "[PAD]",
-            "[MASK]",
-            "want",
-            "##want",
-            "##ed",
-            "wa",
-            "un",
-            "runn",
-            "##ing",
-            ",",
-            "low",
-            "lowest",
-        ]
-        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-
-    def get_input_output_texts(self, tokenizer):
-        input_text = "UNwant\u00e9d,running"
-        output_text = "unwanted, running"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = self.tokenizer_class(self.vocab_file)
-
-        tokens = tokenizer.tokenize("UNwant\u00e9d,running")
-        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11])
-
-    def test_rust_and_python_full_tokenizers(self):
-        if not self.test_rust_tokenizer:
-            self.skipTest(reason="test_rust_tokenizer is set to False")
-
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer()
-
-        sequence = "UNwant\u00e9d,running"
-
-        tokens = tokenizer.tokenize(sequence)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        rust_tokenizer = self.get_rust_tokenizer()
-        ids = tokenizer.encode(sequence)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-        # With lower casing
-        tokenizer = self.get_tokenizer(do_lower_case=True)
-        rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True)
-
-        sequence = "UNwant\u00e9d,running"
-
-        tokens = tokenizer.tokenize(sequence)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        rust_tokenizer = self.get_rust_tokenizer()
-        ids = tokenizer.encode(sequence)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-    def test_chinese(self):
-        tokenizer = BasicTokenizer()
-
-        self.assertListEqual(tokenizer.tokenize("ah\u535a\u63a8zz"), ["ah", "\u535a", "\u63a8", "zz"])
-
-    def test_basic_tokenizer_lower(self):
-        tokenizer = BasicTokenizer(do_lower_case=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["hello", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"])
-
-    def test_basic_tokenizer_lower_strip_accents_false(self):
-        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hällo", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["h\u00e9llo"])
-
-    def test_basic_tokenizer_lower_strip_accents_true(self):
-        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"])
-
-    def test_basic_tokenizer_lower_strip_accents_default(self):
-        tokenizer = BasicTokenizer(do_lower_case=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"])
-
-    def test_basic_tokenizer_no_lower(self):
-        tokenizer = BasicTokenizer(do_lower_case=False)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
-        )
-
-    def test_basic_tokenizer_no_lower_strip_accents_false(self):
-        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["HäLLo", "!", "how", "Are", "yoU", "?"]
-        )
-
-    def test_basic_tokenizer_no_lower_strip_accents_true(self):
-        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["HaLLo", "!", "how", "Are", "yoU", "?"]
-        )
-
-    def test_basic_tokenizer_respects_never_split_tokens(self):
-        tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
-        )
-
-    def test_wordpiece_tokenizer(self):
-        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
-
-        vocab = {}
-        for i, token in enumerate(vocab_tokens):
-            vocab[token] = i
-        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
-
-        self.assertListEqual(tokenizer.tokenize(""), [])
-
-        self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"])
-
-        self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
-
-    def test_is_whitespace(self):
-        self.assertTrue(_is_whitespace(" "))
-        self.assertTrue(_is_whitespace("\t"))
-        self.assertTrue(_is_whitespace("\r"))
-        self.assertTrue(_is_whitespace("\n"))
-        self.assertTrue(_is_whitespace("\u00a0"))
-
-        self.assertFalse(_is_whitespace("A"))
-        self.assertFalse(_is_whitespace("-"))
-
-    def test_is_control(self):
-        self.assertTrue(_is_control("\u0005"))
-
-        self.assertFalse(_is_control("A"))
-        self.assertFalse(_is_control(" "))
-        self.assertFalse(_is_control("\t"))
-        self.assertFalse(_is_control("\r"))
-
-    def test_is_punctuation(self):
-        self.assertTrue(_is_punctuation("-"))
-        self.assertTrue(_is_punctuation("$"))
-        self.assertTrue(_is_punctuation("`"))
-        self.assertTrue(_is_punctuation("."))
-
-        self.assertFalse(_is_punctuation("A"))
-        self.assertFalse(_is_punctuation(" "))
-
-    def test_clean_text(self):
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer()
-
-        # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340
-        self.assertListEqual([tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]])
-
-        self.assertListEqual(
-            [rust_tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]
-        )
-
-    @slow
-    def test_sequence_builders(self):
-        tokenizer = self.tokenizer_class.from_pretrained("google/electra-base-discriminator")
-
-        text = tokenizer.encode("sequence builders", add_special_tokens=False)
-        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
-
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
-        assert encoded_sentence == [101] + text + [102]
-        assert encoded_pair == [101] + text + [102] + text_2 + [102]
-
-    def test_offsets_with_special_characters(self):
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
-
-                sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
-                tokens = tokenizer_r.encode_plus(
-                    sentence,
-                    return_attention_mask=False,
-                    return_token_type_ids=False,
-                    return_offsets_mapping=True,
-                    add_special_tokens=True,
-                )
-
-                do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False
-                expected_results = (
-                    [
-                        ((0, 0), tokenizer_r.cls_token),
-                        ((0, 1), "A"),
-                        ((1, 2), ","),
-                        ((3, 5), "na"),
-                        ((5, 6), "##ï"),
-                        ((6, 8), "##ve"),
-                        ((9, 15), tokenizer_r.mask_token),
-                        ((16, 21), "Allen"),
-                        ((21, 23), "##NL"),
-                        ((23, 24), "##P"),
-                        ((25, 33), "sentence"),
-                        ((33, 34), "."),
-                        ((0, 0), tokenizer_r.sep_token),
-                    ]
-                    if not do_lower_case
-                    else [
-                        ((0, 0), tokenizer_r.cls_token),
-                        ((0, 1), "a"),
-                        ((1, 2), ","),
-                        ((3, 8), "naive"),
-                        ((9, 15), tokenizer_r.mask_token),
-                        ((16, 21), "allen"),
-                        ((21, 23), "##nl"),
-                        ((23, 24), "##p"),
-                        ((25, 33), "sentence"),
-                        ((33, 34), "."),
-                        ((0, 0), tokenizer_r.sep_token),
-                    ]
-                )
-
-                self.assertEqual(
-                    [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"])
-                )
-                self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])
-
-    def test_change_tokenize_chinese_chars(self):
-        list_of_common_chinese_char = ["的", "人", "有"]
-        text_with_chinese_char = "".join(list_of_common_chinese_char)
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                kwargs["tokenize_chinese_chars"] = True
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
-
-                ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
-                ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
-
-                tokens_without_spe_char_r = tokenizer_r.convert_ids_to_tokens(ids_without_spe_char_r)
-                tokens_without_spe_char_p = tokenizer_p.convert_ids_to_tokens(ids_without_spe_char_p)
-
-                # it is expected that each Chinese character is not preceded by "##"
-                self.assertListEqual(tokens_without_spe_char_p, list_of_common_chinese_char)
-                self.assertListEqual(tokens_without_spe_char_r, list_of_common_chinese_char)
-
-                kwargs["tokenize_chinese_chars"] = False
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-
-                ids_without_spe_char_r = tokenizer_r.encode(text_with_chinese_char, add_special_tokens=False)
-                ids_without_spe_char_p = tokenizer_p.encode(text_with_chinese_char, add_special_tokens=False)
-
-                tokens_without_spe_char_r = tokenizer_r.convert_ids_to_tokens(ids_without_spe_char_r)
-                tokens_without_spe_char_p = tokenizer_p.convert_ids_to_tokens(ids_without_spe_char_p)
-
-                # it is expected that only the first Chinese character is not preceded by "##".
-                expected_tokens = [
-                    f"##{token}" if idx != 0 else token for idx, token in enumerate(list_of_common_chinese_char)
-                ]
-                self.assertListEqual(tokens_without_spe_char_p, expected_tokens)
-                self.assertListEqual(tokens_without_spe_char_r, expected_tokens)
diff --git a/tests/models/esm/test_tokenization_esm.py b/tests/models/esm/test_tokenization_esm.py
index 24a2353e3685..87b419cc0bd9 100644
--- a/tests/models/esm/test_tokenization_esm.py
+++ b/tests/models/esm/test_tokenization_esm.py
@@ -19,7 +19,7 @@
 
 from transformers.models.esm.tokenization_esm import VOCAB_FILES_NAMES, EsmTokenizer
 from transformers.testing_utils import require_tokenizers
-from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_python import PreTrainedTokenizer
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
 
@@ -101,13 +101,13 @@ def test_add_tokens(self):
 
         self.assertEqual(tokenizer.add_special_tokens({}), 0)
         self.assertEqual(tokenizer.add_special_tokens({"bos_token": "[BOS]", "eos_token": "[EOS]"}), 2)
-        self.assertRaises(AssertionError, tokenizer.add_special_tokens, {"additional_special_tokens": ""})
+        self.assertRaises(ValueError, tokenizer.add_special_tokens, {"additional_special_tokens": ""})
         self.assertEqual(tokenizer.add_special_tokens({"additional_special_tokens": [""]}), 1)
         self.assertEqual(
             tokenizer.add_special_tokens({"additional_special_tokens": ["", ""]}), 2
         )
-        self.assertIn("", tokenizer.special_tokens_map["additional_special_tokens"])
-        self.assertIsInstance(tokenizer.special_tokens_map["additional_special_tokens"], list)
-        self.assertGreaterEqual(len(tokenizer.special_tokens_map["additional_special_tokens"]), 2)
+        self.assertIn("", tokenizer.extra_special_tokens)
+        self.assertIsInstance(tokenizer.extra_special_tokens, list)
+        self.assertEqual(len(tokenizer.extra_special_tokens), 2)
 
         self.assertEqual(len(tokenizer), vocab_size + 8)
diff --git a/tests/models/evolla/test_processing_evolla.py b/tests/models/evolla/test_processing_evolla.py
index cafbb49661f3..1f3ca2135da0 100644
--- a/tests/models/evolla/test_processing_evolla.py
+++ b/tests/models/evolla/test_processing_evolla.py
@@ -144,6 +144,8 @@ def prepare_input_and_expected_output(self):
         return protein_dict, message, expected_output
 
     def get_protein_tokenizer(self, **kwargs):
+        if "fix_mistral_regex" not in kwargs:
+            kwargs["fix_mistral_regex"] = True
         return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).protein_tokenizer
 
     def prepare_inputs_single(self):
diff --git a/tests/models/flaubert/test_tokenization_flaubert.py b/tests/models/flaubert/test_tokenization_flaubert.py
index 30c65349883b..df79eb5cdb2e 100644
--- a/tests/models/flaubert/test_tokenization_flaubert.py
+++ b/tests/models/flaubert/test_tokenization_flaubert.py
@@ -15,6 +15,7 @@
 
 import json
 import os
+import tempfile
 import unittest
 
 from transformers import FlaubertTokenizer
@@ -29,26 +30,24 @@ class FlaubertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     tokenizer_class = FlaubertTokenizer
     test_rust_tokenizer = False
 
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
+    # Copied from transformers.tests.models.xlm.test_tokenization_xlm.XLMTokenizationTest.test_full_tokenizer
+    def test_full_tokenizer(self):
+        """Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt"""
 
-        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "w", "r", "t", "i", "lo", "low", "ne", "new", "er", "low", "lowest", "new", "newer", "wider", ""]  # fmt: skip
 
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["n e 300", "ne w 301", "e r 302", ""]
 
-        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(cls.merges_file, "w", encoding="utf-8") as fp:
-            fp.write("\n".join(merges))
+        with tempfile.TemporaryDirectory() as tmpdir:
+            vocab_file = os.path.join(tmpdir, VOCAB_FILES_NAMES["vocab_file"])
+            merges_file = os.path.join(tmpdir, VOCAB_FILES_NAMES["merges_file"])
+            with open(vocab_file, "w", encoding="utf-8") as fp:
+                fp.write(json.dumps(vocab_tokens) + "\n")
+            with open(merges_file, "w", encoding="utf-8") as fp:
+                fp.write("\n".join(merges))
+            tokenizer = FlaubertTokenizer(vocab_file, merges_file)
 
-    # Copied from transformers.tests.models.xlm.test_tokenization_xlm.XLMTokenizationTest.test_full_tokenizer
-    def test_full_tokenizer(self):
-        tokenizer = self.get_tokenizer()
         text = "lower newer"
         bpe_tokens = ["l", "o", "w", "er", "new", "er"]
         tokens = tokenizer.tokenize(text)
diff --git a/tests/models/fnet/test_modeling_fnet.py b/tests/models/fnet/test_modeling_fnet.py
index fba6584a80c2..94eee5c66bf8 100644
--- a/tests/models/fnet/test_modeling_fnet.py
+++ b/tests/models/fnet/test_modeling_fnet.py
@@ -37,7 +37,7 @@
         FNetForSequenceClassification,
         FNetForTokenClassification,
         FNetModel,
-        FNetTokenizerFast,
+        FNetTokenizer,
     )
 
 
@@ -459,7 +459,7 @@ def test_inference_for_masked_lm(self):
     @slow
     @require_tokenizers
     def test_inference_long_sentence(self):
-        tokenizer = FNetTokenizerFast.from_pretrained("google/fnet-base")
+        tokenizer = FNetTokenizer.from_pretrained("google/fnet-base")
 
         inputs = tokenizer(
             "the man worked as a [MASK].",
diff --git a/tests/models/fnet/test_tokenization_fnet.py b/tests/models/fnet/test_tokenization_fnet.py
deleted file mode 100644
index 3efb764e18fd..000000000000
--- a/tests/models/fnet/test_tokenization_fnet.py
+++ /dev/null
@@ -1,413 +0,0 @@
-# Copyright 2019 Hugging Face inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-from transformers import FNetTokenizer, FNetTokenizerFast
-from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow, tooslow
-from transformers.tokenization_utils import AddedToken
-
-from ...test_tokenization_common import TokenizerTesterMixin
-
-
-SAMPLE_VOCAB = get_tests_dir("fixtures/spiece.model")
-
-
-@require_sentencepiece
-@require_tokenizers
-class FNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "google/fnet-base"
-    tokenizer_class = FNetTokenizer
-    rust_tokenizer_class = FNetTokenizerFast
-    test_rust_tokenizer = True
-    test_sentencepiece = True
-    test_sentencepiece_ignore_case = True
-    test_seq2seq = False
-
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-
-        # We have a SentencePiece fixture for testing
-        tokenizer = FNetTokenizer(SAMPLE_VOCAB)
-        tokenizer.save_pretrained(cls.tmpdirname)
-
-    def get_input_output_texts(self, tokenizer):
-        input_text = "this is a test"
-        output_text = "this is a test"
-        return input_text, output_text
-
-    def test_convert_token_and_id(self):
-        """Test ``_convert_token_to_id`` and ``_convert_id_to_token``."""
-        token = ""
-        token_id = 0
-
-        self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id)
-        self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token)
-
-    def test_get_vocab(self):
-        vocab_keys = list(self.get_tokenizer().get_vocab().keys())
-
-        self.assertEqual(vocab_keys[0], "")
-        self.assertEqual(vocab_keys[1], "")
-        self.assertEqual(vocab_keys[-1], "▁eloquent")
-        self.assertEqual(len(vocab_keys), 30_000)
-
-    def test_vocab_size(self):
-        self.assertEqual(self.get_tokenizer().vocab_size, 30_000)
-
-    def test_rust_and_python_full_tokenizers(self):
-        if not self.test_rust_tokenizer:
-            self.skipTest(reason="test_rust_tokenizer is set to False")
-
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer()
-
-        sequence = "I was born in 92000, and this is falsé."
-
-        tokens = tokenizer.tokenize(sequence)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        rust_tokenizer = self.get_rust_tokenizer()
-        ids = tokenizer.encode(sequence)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-    def test_full_tokenizer(self):
-        tokenizer = FNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
-
-        tokens = tokenizer.tokenize("This is a test")
-        self.assertListEqual(tokens, ["▁", "T", "his", "▁is", "▁a", "▁test"])
-
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [13, 1, 4398, 25, 21, 1289])
-
-        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
-        self.assertListEqual(
-            tokens,
-            ["▁", "I", "▁was", "▁born", "▁in", "▁9", "2000", ",", "▁and", "▁this", "▁is", "▁fal", "s", "é", "."],
-        )
-        ids = tokenizer.convert_tokens_to_ids(tokens)
-        self.assertListEqual(ids, [13, 1, 23, 386, 19, 561, 3050, 15, 17, 48, 25, 8256, 18, 1, 9])
-
-        back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(
-            back_tokens,
-            [
-                "▁",
-                "",
-                "▁was",
-                "▁born",
-                "▁in",
-                "▁9",
-                "2000",
-                ",",
-                "▁and",
-                "▁this",
-                "▁is",
-                "▁fal",
-                "s",
-                "",
-                ".",
-            ],
-        )
-
-    def test_sequence_builders(self):
-        tokenizer = FNetTokenizer(SAMPLE_VOCAB)
-
-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
-
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
-        assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
-        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [
-            tokenizer.sep_token_id
-        ]
-
-    # Overridden Tests - loading the fast tokenizer from slow just takes too long
-    def test_special_tokens_initialization(self):
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                added_tokens = [AddedToken("", lstrip=True)]
-
-                tokenizer_r = self.get_rust_tokenizer(
-                    pretrained_name, additional_special_tokens=added_tokens, **kwargs
-                )
-                r_output = tokenizer_r.encode("Hey this is a  token")
-
-                special_token_id = tokenizer_r.encode("", add_special_tokens=False)[0]
-
-                self.assertTrue(special_token_id in r_output)
-
-                if self.test_slow_tokenizer:
-                    tokenizer_p = self.tokenizer_class.from_pretrained(
-                        pretrained_name, additional_special_tokens=added_tokens, **kwargs
-                    )
-
-                    p_output = tokenizer_p.encode("Hey this is a  token")
-
-                    cr_output = tokenizer_r.encode("Hey this is a  token")
-
-                    self.assertEqual(p_output, r_output)
-                    self.assertEqual(cr_output, r_output)
-                    self.assertTrue(special_token_id in p_output)
-                    self.assertTrue(special_token_id in cr_output)
-
-    @tooslow
-    def test_special_tokens_initialization_from_slow(self):
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                added_tokens = [AddedToken("", lstrip=True)]
-                tokenizer_r = self.get_rust_tokenizer(
-                    pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
-                )
-                special_token_id = tokenizer_r.encode("", add_special_tokens=False)[0]
-                tokenizer_p = self.tokenizer_class.from_pretrained(
-                    pretrained_name, additional_special_tokens=added_tokens, **kwargs
-                )
-
-                p_output = tokenizer_p.encode("Hey this is a  token")
-                cr_output = tokenizer_r.encode("Hey this is a  token")
-
-                self.assertEqual(p_output, cr_output)
-                self.assertTrue(special_token_id in p_output)
-                self.assertTrue(special_token_id in cr_output)
-
-    # Overridden Tests
-    def test_padding(self, max_length=50):
-        if not self.test_slow_tokenizer:
-            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
-            self.skipTest(reason="test_slow_tokenizer is set to False")
-
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-
-                self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
-                pad_token_id = tokenizer_p.pad_token_id
-
-                # Encode - Simple input
-                input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, padding="max_length")
-                input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, padding="max_length")
-                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
-
-                input_r = tokenizer_r.encode("This is a simple input", padding="longest")
-                input_p = tokenizer_p.encode("This is a simple input", padding=True)
-                self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)
-
-                # Encode - Pair input
-                input_r = tokenizer_r.encode(
-                    "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
-                )
-                input_p = tokenizer_p.encode(
-                    "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
-                )
-                self.assert_padded_input_match(input_r, input_p, max_length, pad_token_id)
-                input_r = tokenizer_r.encode("This is a simple input", "This is a pair", padding=True)
-                input_p = tokenizer_p.encode("This is a simple input", "This is a pair", padding="longest")
-                self.assert_padded_input_match(input_r, input_p, len(input_r), pad_token_id)
-
-                # Encode_plus - Simple input
-                input_r = tokenizer_r.encode_plus(
-                    "This is a simple input", max_length=max_length, padding="max_length"
-                )
-                input_p = tokenizer_p.encode_plus(
-                    "This is a simple input", max_length=max_length, padding="max_length"
-                )
-                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
-
-                input_r = tokenizer_r.encode_plus("This is a simple input", padding="longest")
-                input_p = tokenizer_p.encode_plus("This is a simple input", padding=True)
-                self.assert_padded_input_match(
-                    input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
-                )
-
-                # Encode_plus - Pair input
-                input_r = tokenizer_r.encode_plus(
-                    "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
-                )
-                input_p = tokenizer_p.encode_plus(
-                    "This is a simple input", "This is a pair", max_length=max_length, padding="max_length"
-                )
-                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
-
-                input_r = tokenizer_r.encode_plus("This is a simple input", "This is a pair", padding="longest")
-                input_p = tokenizer_p.encode_plus("This is a simple input", "This is a pair", padding=True)
-                self.assert_padded_input_match(
-                    input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
-                )
-
-                # Batch_encode_plus - Simple input
-                input_r = tokenizer_r.batch_encode_plus(
-                    ["This is a simple input 1", "This is a simple input 2"],
-                    max_length=max_length,
-                    padding="max_length",
-                )
-                input_p = tokenizer_p.batch_encode_plus(
-                    ["This is a simple input 1", "This is a simple input 2"],
-                    max_length=max_length,
-                    padding="max_length",
-                )
-                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
-
-                input_r = tokenizer_r.batch_encode_plus(
-                    ["This is a simple input 1", "This is a simple input 2"],
-                    max_length=max_length,
-                    padding="longest",
-                )
-                input_p = tokenizer_p.batch_encode_plus(
-                    ["This is a simple input 1", "This is a simple input 2"],
-                    max_length=max_length,
-                    padding=True,
-                )
-                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)
-
-                input_r = tokenizer_r.batch_encode_plus(
-                    ["This is a simple input 1", "This is a simple input 2"], padding="longest"
-                )
-                input_p = tokenizer_p.batch_encode_plus(
-                    ["This is a simple input 1", "This is a simple input 2"], padding=True
-                )
-                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)
-
-                # Batch_encode_plus - Pair input
-                input_r = tokenizer_r.batch_encode_plus(
-                    [
-                        ("This is a simple input 1", "This is a simple input 2"),
-                        ("This is a simple pair 1", "This is a simple pair 2"),
-                    ],
-                    max_length=max_length,
-                    truncation=True,
-                    padding="max_length",
-                )
-                input_p = tokenizer_p.batch_encode_plus(
-                    [
-                        ("This is a simple input 1", "This is a simple input 2"),
-                        ("This is a simple pair 1", "This is a simple pair 2"),
-                    ],
-                    max_length=max_length,
-                    truncation=True,
-                    padding="max_length",
-                )
-                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
-
-                input_r = tokenizer_r.batch_encode_plus(
-                    [
-                        ("This is a simple input 1", "This is a simple input 2"),
-                        ("This is a simple pair 1", "This is a simple pair 2"),
-                    ],
-                    padding=True,
-                )
-                input_p = tokenizer_p.batch_encode_plus(
-                    [
-                        ("This is a simple input 1", "This is a simple input 2"),
-                        ("This is a simple pair 1", "This is a simple pair 2"),
-                    ],
-                    padding="longest",
-                )
-                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)
-
-                # Using pad on single examples after tokenization
-                input_r = tokenizer_r.encode_plus("This is a input 1")
-                input_r = tokenizer_r.pad(input_r)
-
-                input_p = tokenizer_r.encode_plus("This is a input 1")
-                input_p = tokenizer_r.pad(input_p)
-
-                self.assert_padded_input_match(
-                    input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
-                )
-
-                # Using pad on single examples after tokenization
-                input_r = tokenizer_r.encode_plus("This is a input 1")
-                input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")
-
-                input_p = tokenizer_r.encode_plus("This is a input 1")
-                input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length")
-
-                self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
-
-                # Using pad after tokenization
-                input_r = tokenizer_r.batch_encode_plus(
-                    ["This is a input 1", "This is a much longer input whilch should be padded"]
-                )
-                input_r = tokenizer_r.pad(input_r)
-
-                input_p = tokenizer_r.batch_encode_plus(
-                    ["This is a input 1", "This is a much longer input whilch should be padded"]
-                )
-                input_p = tokenizer_r.pad(input_p)
-
-                self.assert_batch_padded_input_match(input_r, input_p, len(input_r["input_ids"][0]), pad_token_id)
-
-                # Using pad after tokenization
-                input_r = tokenizer_r.batch_encode_plus(
-                    ["This is a input 1", "This is a much longer input whilch should be padded"]
-                )
-                input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")
-
-                input_p = tokenizer_r.batch_encode_plus(
-                    ["This is a input 1", "This is a much longer input whilch should be padded"]
-                )
-                input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length")
-
-                self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
-
-    @slow
-    def test_save_pretrained(self):
-        super().test_save_pretrained()
-
-    @slow
-    def test_save_slow_from_fast_and_reload_fast(self):
-        super().test_save_slow_from_fast_and_reload_fast()
-
-    def assert_batch_padded_input_match(
-        self,
-        input_r: dict,
-        input_p: dict,
-        max_length: int,
-        pad_token_id: int,
-        model_main_input_name: str = "input_ids",
-    ):
-        for i_r in input_r.values():
-            (
-                self.assertEqual(len(i_r), 2),
-                self.assertEqual(len(i_r[0]), max_length),
-                self.assertEqual(len(i_r[1]), max_length),
-            )
-            (
-                self.assertEqual(len(i_r), 2),
-                self.assertEqual(len(i_r[0]), max_length),
-                self.assertEqual(len(i_r[1]), max_length),
-            )
-
-        for i_r, i_p in zip(input_r[model_main_input_name], input_p[model_main_input_name]):
-            self.assert_padded_input_match(i_r, i_p, max_length, pad_token_id)
-
-    @slow
-    def test_tokenizer_integration(self):
-        expected_encoding = {'input_ids': [[4, 4616, 107, 163, 328, 14, 63, 1726, 106, 11954, 16659, 23, 83, 16688, 11427, 328, 107, 36, 11954, 16659, 23, 83, 16688, 6153, 82, 961, 16688, 3474, 16710, 1696, 2306, 16688, 10854, 2524, 3827, 561, 163, 3474, 16680, 62, 226, 2092, 16680, 379, 3474, 16660, 16680, 2436, 16667, 16671, 16680, 999, 87, 3474, 16680, 2436, 16667, 5208, 800, 16710, 68, 2018, 2959, 3037, 163, 16663, 11617, 16710, 36, 2018, 2959, 4737, 163, 16663, 16667, 16674, 16710, 91, 372, 5087, 16745, 2205, 82, 961, 3608, 38, 1770, 16745, 7984, 36, 2565, 751, 9017, 1204, 864, 218, 1244, 16680, 11954, 16659, 23, 83, 36, 14686, 23, 7619, 16678, 5], [4, 28, 532, 65, 1929, 33, 391, 16688, 3979, 9, 2565, 7849, 299, 225, 34, 2040, 305, 167, 289, 16667, 16078, 32, 1966, 181, 4626, 63, 10575, 71, 851, 1491, 36, 624, 4757, 38, 208, 8038, 16678, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], [4, 13, 1467, 5187, 26, 2521, 4567, 16664, 372, 13, 16209, 3314, 16678, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # fmt: skip
-
-        self.tokenizer_integration_test_util(
-            expected_encoding=expected_encoding,
-            model_name="google/fnet-base",
-            revision="34219a71ca20e280cc6000b89673a169c65d605c",
-        )
diff --git a/tests/models/funnel/test_tokenization_funnel.py b/tests/models/funnel/test_tokenization_funnel.py
index 38b70b87625b..77041fdb1d7e 100644
--- a/tests/models/funnel/test_tokenization_funnel.py
+++ b/tests/models/funnel/test_tokenization_funnel.py
@@ -13,11 +13,9 @@
 # limitations under the License.
 
 
-import os
 import unittest
 
-from transformers import FunnelTokenizer, FunnelTokenizerFast
-from transformers.models.funnel.tokenization_funnel import VOCAB_FILES_NAMES
+from transformers import FunnelTokenizer
 from transformers.testing_utils import require_tokenizers
 
 from ...test_tokenization_common import TokenizerTesterMixin
@@ -27,61 +25,8 @@
 class FunnelTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_id = "funnel-transformer/small"
     tokenizer_class = FunnelTokenizer
-    rust_tokenizer_class = FunnelTokenizerFast
-    test_rust_tokenizer = True
-    space_between_special_tokens = True
 
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-
-        vocab_tokens = [
-            "",
-            "",
-            "",
-            "want",
-            "##want",
-            "##ed",
-            "wa",
-            "un",
-            "runn",
-            "##ing",
-            ",",
-            "low",
-            "lowest",
-        ]
-        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-
-    @classmethod
-    def get_tokenizer(cls, pretrained_name=None, **kwargs):
-        pretrained_name = pretrained_name or cls.tmpdirname
-        return FunnelTokenizer.from_pretrained(pretrained_name, **kwargs)
-
-    @classmethod
-    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
-        pretrained_name = pretrained_name or cls.tmpdirname
-        return FunnelTokenizerFast.from_pretrained(pretrained_name, **kwargs)
-
-    def get_input_output_texts(self, tokenizer):
-        input_text = "UNwant\u00e9d,running"
-        output_text = "unwanted, running"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = self.tokenizer_class(self.vocab_file)
-
-        tokens = tokenizer.tokenize("UNwant\u00e9d,running")
-        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
-
-    def test_token_type_ids(self):
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            inputs = tokenizer("UNwant\u00e9d,running")
-            sentence_len = len(inputs["input_ids"]) - 1
-            self.assertListEqual(inputs["token_type_ids"], [2] + [0] * sentence_len)
-
-            inputs = tokenizer("UNwant\u00e9d,running", "UNwant\u00e9d,running")
-            self.assertListEqual(inputs["token_type_ids"], [2] + [0] * sentence_len + [1] * sentence_len)
+    integration_expected_tokens = ['', 'is', 'a', 'test', '', '', 'was', 'born', 'in', '92', '##00', '##0', ',', 'and', 'this', 'is', '', '.', '生', '', '的', '真', '', '', '', '', '', '', '', '', 'hi', '', 'there', '', 'following', 'string', 'should', 'be', 'properly', 'encoded', ':', '', '.', '', 'ir', '##d', 'and', '', 'ir', '##d', '', '', 'how', 'are', 'you', 'doing']  # fmt: skip
+    integration_expected_token_ids = [100, 2003, 1037, 3231, 100, 100, 2001, 2141, 1999, 6227, 8889, 2692, 1010, 1998, 2023, 2003, 100, 1012, 1910, 100, 1916, 1921, 100, 100, 100, 100, 100, 100, 100, 96, 7632, 96, 2045, 100, 2206, 5164, 2323, 2022, 7919, 12359, 1024, 100, 1012, 100, 20868, 2094, 1998, 100, 20868, 2094, 100, 100, 2129, 2024, 2017, 2725]  # fmt: skip
+    expected_tokens_from_ids = ['', 'is', 'a', 'test', '', '', 'was', 'born', 'in', '92', '##00', '##0', ',', 'and', 'this', 'is', '', '.', '生', '', '的', '真', '', '', '', '', '', '', '', '', 'hi', '', 'there', '', 'following', 'string', 'should', 'be', 'properly', 'encoded', ':', '', '.', '', 'ir', '##d', 'and', '', 'ir', '##d', '', '', 'how', 'are', 'you', 'doing']  # fmt: skip
+    integration_expected_decoded_text = " is a test   was born in 92000, and this is . 生  的 真         hi  there  following string should be properly encoded : .  ird and  ird   how are you doing"
diff --git a/tests/models/gemma/test_tokenization_gemma.py b/tests/models/gemma/test_tokenization_gemma.py
index 0bae68e4b0e3..5e63065fbd2a 100644
--- a/tests/models/gemma/test_tokenization_gemma.py
+++ b/tests/models/gemma/test_tokenization_gemma.py
@@ -12,508 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
-import tempfile
-import unittest
 
-from datasets import load_dataset
+import unittest
 
-from transformers import (
-    AddedToken,
-    GemmaTokenizer,
-    GemmaTokenizerFast,
-)
-from transformers.convert_slow_tokenizer import convert_slow_tokenizer
+from tests.test_tokenization_common import TokenizerTesterMixin
+from transformers.models.gemma.tokenization_gemma import GemmaTokenizer
 from transformers.testing_utils import (
-    get_tests_dir,
-    nested_simplify,
     require_read_token,
-    require_sentencepiece,
     require_tokenizers,
-    require_torch,
-    slow,
 )
 
-from ...test_tokenization_common import TokenizerTesterMixin
-
 
-SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
-
-
-@require_sentencepiece
 @require_tokenizers
+@require_read_token
 class GemmaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "google/gemma-7b"
+    from_pretrained_id = "hf-internal-testing/dummy-gemma"
     tokenizer_class = GemmaTokenizer
-    rust_tokenizer_class = GemmaTokenizerFast
-
-    test_rust_tokenizer = False
-    test_sentencepiece = True
-    from_pretrained_kwargs = {}
-
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-        # We have a SentencePiece fixture for testing
-        tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True)
-        tokenizer.pad_token = tokenizer.eos_token
-        tokenizer.save_pretrained(cls.tmpdirname)
-
-    @require_torch
-    def test_batch_tokenization(self):
-        if not self.test_seq2seq:
-            self.skipTest(reason="test_seq2seq is set to False")
-
-        tokenizers = self.get_tokenizers()
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                # Longer text that will definitely require truncation.
-                text = [
-                    " UN Chief Says There Is No Military Solution in Syria",
-                    " Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for"
-                    " Syria is that 'there is no military solution' to the nearly five-year conflict and more weapons"
-                    " will only worsen the violence and misery for millions of people.",
-                ]
-                try:
-                    batch = tokenizer(
-                        text=text,
-                        max_length=3,
-                        return_tensors="pt",
-                    )
-                except NotImplementedError:
-                    self.skipTest(reason="Encountered NotImplementedError when calling tokenizer")
-                self.assertEqual(batch.input_ids.shape[1], 3)
-                # max_target_length will default to max_length if not specified
-                batch = tokenizer(text, max_length=3, return_tensors="pt")
-                self.assertEqual(batch.input_ids.shape[1], 3)
-
-                batch_encoder_only = tokenizer(text=text, max_length=3, return_tensors="pt")
-                self.assertEqual(batch_encoder_only.input_ids.shape[1], 3)
-                self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3)
-                self.assertNotIn("decoder_input_ids", batch_encoder_only)
-
-    @unittest.skip(reason="Unfortunately way too slow to build a BPE with SentencePiece.")
-    def test_save_slow_from_fast_and_reload_fast(self):
-        pass
-
-    def test_special_tokens_initialization(self):
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                added_tokens = [AddedToken("", lstrip=True)]
-
-                tokenizer_r = self.get_rust_tokenizer(
-                    pretrained_name, additional_special_tokens=added_tokens, **kwargs
-                )
-                r_output = tokenizer_r.encode("Hey this is a  token")
-
-                special_token_id = tokenizer_r.encode("", add_special_tokens=False)[0]
-
-                self.assertTrue(special_token_id in r_output)
-
-                if self.test_slow_tokenizer:
-                    tokenizer_cr = self.get_rust_tokenizer(
-                        pretrained_name,
-                        additional_special_tokens=added_tokens,
-                        **kwargs,  # , from_slow=True <- unfortunately too slow to convert
-                    )
-                    tokenizer_p = self.tokenizer_class.from_pretrained(
-                        pretrained_name, additional_special_tokens=added_tokens, **kwargs
-                    )
-
-                    p_output = tokenizer_p.encode("Hey this is a  token")
-
-                    cr_output = tokenizer_cr.encode("Hey this is a  token")
-
-                    self.assertEqual(p_output, r_output)
-                    self.assertEqual(cr_output, r_output)
-                    self.assertTrue(special_token_id in p_output)
-                    self.assertTrue(special_token_id in cr_output)
-
-    @slow
-    @require_read_token
-    def test_tokenizer_integration(self):
-        expected_encoding =  {'input_ids': [[2, 158434, 591, 84193, 3836, 685, 6599, 31223, 235290, 140247, 578, 6599, 31223, 235290, 145139, 235290, 3491, 235275, 6572, 3311, 235290, 38197, 109959, 591, 25894, 235269, 162174, 235290, 235284, 235269, 1791, 6362, 12481, 235269, 1576, 18622, 235269, 2900, 1136, 86684, 235269, 29092, 4632, 16994, 604, 13146, 14944, 40371, 591, 19700, 235327, 235275, 578, 13146, 14944, 25511, 591, 235300, 12474, 235275, 675, 1163, 235248, 235304, 235284, 235340, 229903, 5377, 575, 235248, 235274, 235276, 235276, 235340, 17044, 578, 5271, 1061, 118345, 1865, 125247, 235269, 8745, 111226, 578, 176888, 235265], [2, 25894, 603, 6869, 577, 953, 235290, 8297, 5271, 209099, 41642, 774, 748, 78253, 2793, 731, 51506, 34346, 611, 2145, 2731, 578, 1833, 4807, 575, 832, 16630, 235265], [2, 651, 4320, 8426, 25341, 36271, 1163, 573, 27894, 5929, 235265]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}  # fmt: skip
-        self.tokenizer_integration_test_util(
-            expected_encoding=expected_encoding,
-            model_name="google/gemma-2b",
-            padding=False,
-        )
-
-    @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
-    def test_subword_regularization_tokenizer(self):
-        pass
-
-    @unittest.skip(reason="Skipping")
-    def test_torch_encode_plus_sent_to_model(self):
-        pass
-
-
-@require_torch
-@require_sentencepiece
-@require_tokenizers
-class GemmaIntegrationTest(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        checkpoint_name = "hf-internal-testing/dummy-gemma"
-        cls.tokenizer: GemmaTokenizer = GemmaTokenizer.from_pretrained(
-            checkpoint_name, eos_token=""
-        )  # add this token
-        cls.rust_tokenizer = GemmaTokenizerFast.from_pretrained(
-            checkpoint_name, eos_token="", from_slow=True
-        )  # add this token
-        return cls
-
-    @require_torch
-    def integration_tests(self):
-        inputs = self.tokenizer(
-            ["The following string should be properly encoded: Hello.", "But ird and ปี   ird   ด"],
-            return_tensors="pt",
-        )
-
-        self.assertEqual(
-            nested_simplify(inputs),
-            {
-                "input_ids": [
-                    [2, 450, 1494, 1347, 881, 367, 6284, 18511, 29901, 15043, 29889],
-                    [2, 1205, 29871, 1823, 322, 29871, 31010, 30691, 1678, 1823, 1678, 30718],
-                ],
-                "attention_mask": [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
-            },
-        )
-
-    def test_user_added_tokens(self):
-        # Ensure that user added tokens are not split in the fast tokenizer
-        slow_tokenizer = self.tokenizer
-        fast_tokenizer = self.rust_tokenizer
-
-        user_added_token = ""
-
-        slow_tokens = slow_tokenizer.convert_ids_to_tokens(slow_tokenizer.encode(user_added_token))
-        fast_tokens = slow_tokenizer.convert_ids_to_tokens(fast_tokenizer.encode(user_added_token))
-
-        self.assertTrue(user_added_token in fast_tokens)
-        self.assertEqual(slow_tokens, fast_tokens)
-
-    def test_fast_special_tokens(self):
-        slow_tokenizer = self.tokenizer
-        fast_tokenizer = self.rust_tokenizer
-        slow = slow_tokenizer.encode("A sample test", add_special_tokens=True)
-        assert slow == [2, 235280, 6453, 2121]
-
-        fast_tokenizer.add_eos_token = False
-        fast = fast_tokenizer.encode("A sample test", add_special_tokens=True)
-        assert fast == [2, 235280, 6453, 2121]
-
-        fast_tokenizer.add_eos_token = True
-        fast = fast_tokenizer.encode("A sample test", add_special_tokens=True)
-        assert fast == [2, 235280, 6453, 2121, 204]
-
-        slow_tokenizer.add_eos_token = True
-        slow = slow_tokenizer.encode("A sample test", add_special_tokens=True)
-        assert slow == [2, 235280, 6453, 2121, 204]
-
-        self.tokenizer.add_eos_token = False
-        self.rust_tokenizer.add_eos_token = False
-
-    def test_fast_merge_priority(self):
-        slow_tokenizer = self.tokenizer
-        fast_tokenizer = self.rust_tokenizer
-        text = "                                               "
-        target = [168, 153]
-        slow = slow_tokenizer.encode(text, add_special_tokens=False)
-        assert slow == target
-
-        fast = fast_tokenizer.encode(text, add_special_tokens=False)
-        assert fast == target
-
-    @unittest.skip(reason="Not super important and always failing. Let's skip it")
-    @slow
-    def test_conversion(self):
-        # This is excruciatingly slow since it has to recreate the entire merge
-        # list from the original vocabulary in spm
-        self.rust_tokenizer.save_pretrained("./out")
-        with tempfile.TemporaryDirectory() as dirname:
-            self.rust_tokenizer.save_pretrained(dirname)
-
-            with open(os.path.join(dirname, "tokenizer.json")) as f:
-                old_serialized = f.read()
-
-        new_tokenizer = convert_slow_tokenizer(self.tokenizer)
-        with tempfile.NamedTemporaryFile() as f:
-            new_tokenizer.save(f.name)
-            # Re-opening since `f` is in bytes.
-            new_serialized = open(f.name).read()
-            with open("out_tokenizer.json", "w") as g:
-                g.write(new_serialized)
-
-            self.assertEqual(old_serialized, new_serialized)
-
-    def test_simple_encode_decode(self):
-        pyth_tokenizer = self.tokenizer
-        rust_tokenizer = self.rust_tokenizer
-
-        self.tokenizer.add_eos_token = False
-        self.rust_tokenizer.add_eos_token = False
-
-        self.assertEqual(pyth_tokenizer.encode("This is a test"), [2, 1596, 603, 476, 2121])
-        self.assertEqual(rust_tokenizer.encode("This is a test"), [2, 1596, 603, 476, 2121])
-        self.assertEqual(pyth_tokenizer.decode([2, 1596, 603, 476, 2121], skip_special_tokens=True), "This is a test")
-        self.assertEqual(rust_tokenizer.decode([2, 1596, 603, 476, 2121], skip_special_tokens=True), "This is a test")
-
-        # bytefallback showcase
-        self.assertEqual(pyth_tokenizer.encode("生活的真谛是"), [2, 122182, 235710, 245467, 235427] )  # fmt: skip
-        self.assertEqual(rust_tokenizer.encode("生活的真谛是"), [2, 122182, 235710, 245467, 235427] )  # fmt: skip
-        self.assertEqual(
-            pyth_tokenizer.decode([2, 122182, 235710, 245467, 235427], skip_special_tokens=True),
-            "生活的真谛是",
-        )
-        self.assertEqual(
-            rust_tokenizer.decode([2, 122182, 235710, 245467, 235427], skip_special_tokens=True),
-            "生活的真谛是",
-        )
-
-        # Inner spaces showcase
-        self.assertEqual(pyth_tokenizer.encode("Hi  Hello"), [2, 2151, 139, 4521])
-        self.assertEqual(rust_tokenizer.encode("Hi  Hello"), [2, 2151, 139, 4521])
-        self.assertEqual(pyth_tokenizer.decode([2, 2151, 139, 4521], skip_special_tokens=True), "Hi  Hello")
-        self.assertEqual(rust_tokenizer.decode([2, 2151, 139, 4521], skip_special_tokens=True), "Hi  Hello")
-
-        self.assertEqual(pyth_tokenizer.encode("Hi   Hello"), [2, 2151, 140, 4521])
-        self.assertEqual(rust_tokenizer.encode("Hi   Hello"), [2, 2151, 140, 4521])
-        self.assertEqual(pyth_tokenizer.decode([2, 2151, 140, 4521], skip_special_tokens=True), "Hi   Hello")
-        self.assertEqual(rust_tokenizer.decode([2, 2151, 140, 4521], skip_special_tokens=True), "Hi   Hello")
-
-        self.assertEqual(pyth_tokenizer.encode(""), [2])
-        self.assertEqual(rust_tokenizer.encode(""), [2])
-
-        self.assertEqual(pyth_tokenizer.encode(" "), [2, 235248])
-        self.assertEqual(rust_tokenizer.encode(" "), [2, 235248])
-
-        self.assertEqual(pyth_tokenizer.encode("  "), [2, 139])
-        self.assertEqual(rust_tokenizer.encode("  "), [2, 139])
-
-        self.assertEqual(pyth_tokenizer.encode(" Hello"), [2, 25957])
-        self.assertEqual(rust_tokenizer.encode(" Hello"), [2, 25957])
-
-    def test_no_differences_decode(self):
-        self.tokenizer.add_eos_token = False
-        self.rust_tokenizer.add_eos_token = False
-        pyth_tokenizer = self.tokenizer
-        rust_tokenizer = self.rust_tokenizer
-
-        self.assertEqual(pyth_tokenizer.decode([869]), "og")
-        self.assertEqual(rust_tokenizer.decode([869]), "og")
-
-        self.assertEqual(pyth_tokenizer.decode([30112, 869]), " expenditureog")
-        self.assertEqual(rust_tokenizer.decode([30112, 869]), " expenditureog")
-
-    def test_no_differences_special_tokens(self):
-        pyth_tokenizer = self.tokenizer
-        rust_tokenizer = self.rust_tokenizer
-        self.assertEqual(pyth_tokenizer.encode(""), [2])
-        self.assertEqual(rust_tokenizer.encode(""), [2])
-
-        self.assertEqual(pyth_tokenizer.encode(""), [2, 204])
-        self.assertEqual(rust_tokenizer.encode(""), [2, 204])
-
-    @unittest.skipIf(
-        os.getenv("RUN_TOKENIZER_INTEGRATION", "0") == "0",
-        "RUN_TOKENIZER_INTEGRATION=1 to run tokenizer integration tests",
-    )
-    def test_integration_test_xnli(self):
-        import tqdm
-
-        pyth_tokenizer = self.tokenizer
-        rust_tokenizer = self.rust_tokenizer
-
-        dataset = load_dataset("google/code_x_glue_ct_code_to_text", "go")
-        for item in tqdm.tqdm(dataset["validation"]):
-            string = item["code"]
-            encoded1 = pyth_tokenizer.encode(string)
-            encoded2 = rust_tokenizer.encode(string)
-
-            self.assertEqual(
-                encoded1,
-                encoded2,
-                msg="Hint: the following tokenization diff were obtained for slow vs fast:\n "
-                f"elements in slow: {set(pyth_tokenizer.tokenize(string)) - set(rust_tokenizer.tokenize(string))} \nvs\n "
-                f"elements in fast: {set(rust_tokenizer.tokenize(string)) - set(pyth_tokenizer.tokenize(string))} \n\n{string}",
-            )
-
-            decoded1 = pyth_tokenizer.decode(encoded1, skip_special_tokens=True)
-            decoded2 = rust_tokenizer.decode(encoded1, skip_special_tokens=True)
-
-            self.assertEqual(decoded1, decoded2)
-
-        dataset = load_dataset("facebook/xnli", "all_languages")
-
-        for item in tqdm.tqdm(dataset["train"]):
-            for string in item["premise"].values():
-                encoded1 = pyth_tokenizer.encode(string)
-                encoded2 = rust_tokenizer.encode(string)
-
-                self.assertEqual(encoded1, encoded2, msg=f"failed on {string}")
-
-                decoded1 = pyth_tokenizer.decode(encoded1, skip_special_tokens=True)
-                decoded2 = rust_tokenizer.decode(encoded2, skip_special_tokens=True)
-
-                self.assertEqual(decoded1, decoded2)
-
-    def test_special_token_special_word(self):
-        # the word inform should be split as ['in', 'form']
-        tokenizer = GemmaTokenizer.from_pretrained("hf-internal-testing/dummy-gemma")
-        tokenizer.add_tokens([AddedToken("", rstrip=True, lstrip=True)], special_tokens=False)
-        out1 = tokenizer.decode(
-            tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=False
-        )
-        self.assertEqual(out1, "inform")
-        out2 = tokenizer.decode(
-            tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=True
-        )
-        # decoding strips the added prefix space.
-        self.assertEqual(out2, " inform")
-        input_ids = tokenizer.encode("inform", add_special_tokens=False)
-        self.assertEqual(input_ids, [256000, 43910])
-
-        out2 = tokenizer.decode(
-            tokenizer.encode("  inform", add_special_tokens=False), spaces_between_special_tokens=False
-        )
-        # TODO @ArthurZ currently we strip left and right, so this will not keep the spaces
-        self.assertEqual(out2, "inform")
-
-        ### Let's make sure decoding does not add extra spaces here and there
-        # TODO @ArthurZ this should be affected by the lstrip/rstrip/single word /normalize refactoring
-        # Since currently we always strip left and right of the token, results are as such
-        input_ids = tokenizer.encode(" Hellohow", add_special_tokens=False)
-        self.assertEqual(input_ids, [204, 25957, 204, 1139])
-        tokens = tokenizer.tokenize(" Hellohow", add_special_tokens=False)
-        self.assertEqual(tokens, ["", "▁Hello", "", "how"])
-        decoded_tokens = tokenizer.decode(input_ids)
-        self.assertEqual(decoded_tokens, " Hellohow")
-
-        # Let's make sure that if there are any spaces, we don't remove them!
-        input_ids = tokenizer.encode("  Hello how", add_special_tokens=False)
-        self.assertEqual(input_ids, [235248, 204, 25957, 204, 1368])
-        tokens = tokenizer.tokenize("  Hello how", add_special_tokens=False)
-        self.assertEqual(tokens, ["▁", "", "▁Hello", "", "▁how"])
-        decoded_tokens = tokenizer.decode(input_ids)
-        self.assertEqual(decoded_tokens, "  Hello how")
-
-    def test_some_edge_cases(self):
-        tokenizer = GemmaTokenizer.from_pretrained("hf-internal-testing/dummy-gemma")
-
-        sp_tokens = tokenizer.sp_model.encode(">", out_type=str)
-        self.assertEqual(sp_tokens, ["", ">"])
-        tokens = tokenizer.tokenize(">")
-        self.assertEqual(sp_tokens, tokens)
-        self.assertEqual(tokens, ["", ">"])
-
-        tokens = tokenizer.tokenize("")
-        self.assertEqual(tokens, [])
-        self.assertEqual(tokens, tokenizer.sp_model.encode("", out_type=str))
-
-        tokens = tokenizer.tokenize(" ")
-        self.assertEqual(tokens, ["▁"])
-        # a dummy prefix space is not added by the sp_model as it was de-activated
-        self.assertEqual(tokens, tokenizer.sp_model.encode(" ", out_type=str))
-
-        tokens = tokenizer.tokenize("▁")
-        self.assertEqual(tokens, ["▁"])
-        # a dummy prefix space is not added by the sp_model as it was de-activated
-        self.assertEqual(tokens, tokenizer.sp_model.encode("▁", out_type=str))
-
-        tokens = tokenizer.tokenize(" ▁")
-        self.assertEqual(tokens, ["▁▁"])
-        # a dummy prefix space is not added by the sp_model as it was de-activated
-        self.assertEqual(tokens, tokenizer.sp_model.encode("▁▁", out_type=str))
-
-    def test_save_fast_load_slow(self):
-        # Ensure that we can save a fast tokenizer and load it as a slow tokenizer
-        slow_tokenizer = self.tokenizer
-        text = "a  "
-        target_encoded = [2, 235250, 139]
-        slow = slow_tokenizer.encode(text, add_special_tokens=True)
-        assert slow == target_encoded
-
-        slow_decoded = slow_tokenizer.decode(slow, skip_special_tokens=True)
-        assert slow_decoded == text
-
-        with tempfile.TemporaryDirectory() as dirname:
-            # Save fast tokenizer
-            self.rust_tokenizer.save_pretrained(dirname)
-
-            # Load slow tokenizer with fast files present in the directory
-            slow_tokenizer_from_fast = GemmaTokenizer.from_pretrained(dirname)
-
-        slow_from_fast = slow_tokenizer_from_fast.encode(text, add_special_tokens=True)
-        assert slow_from_fast == target_encoded
-
-        slow_from_fast_decoded = slow_tokenizer_from_fast.decode(slow, skip_special_tokens=True)
-        assert slow_from_fast_decoded == text
-
-
-@require_sentencepiece
-@require_tokenizers
-class CommonSpmIntegrationTests(unittest.TestCase):
-    """
-    A class that regroups important test to make sure that we properly handle the special tokens.
-    """
-
-    def test_edge_case_tabulation(self):
-        fast_tokenizer = GemmaTokenizerFast.from_pretrained("hf-internal-testing/dummy-gemma")
-        slow_tokenizer = GemmaTokenizer.from_pretrained("hf-internal-testing/dummy-gemma")
-        input_text = "Hey. \t\t \n\nyou  é  @#😈  🤗!       , 1234 15 5,61"
-        EXPECTED_IDS = [ 2, 6750, 1, 235265, 235248, 255969, 235248, 109, 4747, 139, 235335, 139, 216311, 241316, 139, 239880, 235341, 144, 235269, 235248, 235274, 235284, 235304, 235310, 235248, 235274, 235308, 235248, 235308, 235269, 235318, 235274]  # fmt: skip
-        EXPECTED_TOKENS = [ "Hey", "", ".", "▁", "\t\t", "▁", "\n\n", "you", "▁▁", "é", "▁▁", "@#", "😈", "▁▁", "🤗", "!", "▁▁▁▁▁▁▁", ",", "▁", "1", "2", "3", "4", "▁", "1", "5", "▁", "5", ",", "6", "1"]  # fmt: skip
-
-        tokens = fast_tokenizer.tokenize(input_text)
-        with self.subTest("test fast edge case fast"):
-            self.assertEqual(tokens, EXPECTED_TOKENS)
-
-        tokens = slow_tokenizer.tokenize(input_text)
-        with self.subTest("test fast edge case fast"):
-            self.assertEqual(tokens, EXPECTED_TOKENS)
-
-        input_ids = fast_tokenizer.encode(input_text)
-        with self.subTest("test fast edge case fast"):
-            self.assertEqual(input_ids, EXPECTED_IDS)
-
-        input_ids = slow_tokenizer.encode(input_text)
-        with self.subTest("test fast edge case fast"):
-            self.assertEqual(input_ids, EXPECTED_IDS)
-
-        text = fast_tokenizer.decode(EXPECTED_IDS)
-        with self.subTest("test fast edge case fast"):
-            self.assertEqual(text, "Hey. \t\t \n\nyou  é  @#😈  🤗!       , 1234 15 5,61")
-
-        text = slow_tokenizer.decode(EXPECTED_IDS)
-        with self.subTest("test fast edge case fast"):
-            self.assertEqual(text, "Hey. \t\t \n\nyou  é  @#😈  🤗!       , 1234 15 5,61")
-
-        input_text = "\t\t\t\t \n\n61"
-        EXPECTED_IDS = [2, 255971, 235248, 109, 235318, 235274]
-        EXPECTED_TOKENS = ["\t\t\t\t", "▁", "\n\n", "6", "1"]
-
-        tokens = fast_tokenizer.tokenize(input_text)
-        with self.subTest("test fast edge case fast"):
-            self.assertEqual(tokens, EXPECTED_TOKENS)
-
-        tokens = slow_tokenizer.tokenize(input_text)
-        with self.subTest("test fast edge case fast"):
-            self.assertEqual(tokens, EXPECTED_TOKENS)
-
-        input_ids = fast_tokenizer.encode(input_text)
-        with self.subTest("test fast edge case fast"):
-            self.assertEqual(input_ids, EXPECTED_IDS)
-
-        input_ids = slow_tokenizer.encode(input_text)
-        with self.subTest("test fast edge case fast"):
-            self.assertEqual(input_ids, EXPECTED_IDS)
-
-        text = fast_tokenizer.decode(EXPECTED_IDS)
-        with self.subTest("test fast edge case fast"):
-            self.assertEqual(text, "\t\t\t\t \n\n61")
 
-        text = slow_tokenizer.decode(EXPECTED_IDS)
-        with self.subTest("test fast edge case fast"):
-            self.assertEqual(text, "\t\t\t\t \n\n61")
+    integration_expected_tokens = ['This', '▁is', '▁a', '▁test', '▁😊', '\n', 'I', '▁was', '▁born', '▁in', '▁', '9', '2', '0', '0', '0', ',', '▁and', '▁this', '▁is', '▁fals', 'é', '.', '\n', '生活的', '真', '谛', '是', '\n', 'Hi', '▁▁', 'Hello', '\n', 'Hi', '▁▁▁', 'Hello', '\n\n', '▁', '\n', '▁▁', '\n', '▁Hello', '\n', '<', 's', '>', '\n', 'hi', '<', 's', '>', 'there', '\n', 'The', '▁following', '▁string', '▁should', '▁be', '▁properly', '▁encoded', ':', '▁Hello', '.', '\n', 'But', '▁i', 'rd', '▁and', '▁ปี', '▁▁▁', 'ird', '▁▁▁', 'ด', '\n', 'Hey', '▁how', '▁are', '▁you', '▁doing']  # fmt: skip
+    integration_expected_token_ids = [1596, 603, 476, 2121, 44416, 108, 235285, 729, 7565, 575, 235248, 235315, 235284, 235276, 235276, 235276, 235269, 578, 736, 603, 40751, 235335, 235265, 108, 122182, 235710, 245467, 235427, 108, 2151, 139, 4521, 108, 2151, 140, 4521, 109, 235248, 108, 139, 108, 25957, 108, 235322, 235256, 235313, 108, 544, 235322, 235256, 235313, 11048, 108, 651, 2412, 2067, 1412, 614, 10338, 49748, 235292, 25957, 235265, 108, 1860, 496, 1924, 578, 73208, 140, 5650, 140, 235732, 108, 6750, 1368, 708, 692, 3900]  # fmt: skip
+    expected_tokens_from_ids = ['This', '▁is', '▁a', '▁test', '▁😊', '\n', 'I', '▁was', '▁born', '▁in', '▁', '9', '2', '0', '0', '0', ',', '▁and', '▁this', '▁is', '▁fals', 'é', '.', '\n', '生活的', '真', '谛', '是', '\n', 'Hi', '▁▁', 'Hello', '\n', 'Hi', '▁▁▁', 'Hello', '\n\n', '▁', '\n', '▁▁', '\n', '▁Hello', '\n', '<', 's', '>', '\n', 'hi', '<', 's', '>', 'there', '\n', 'The', '▁following', '▁string', '▁should', '▁be', '▁properly', '▁encoded', ':', '▁Hello', '.', '\n', 'But', '▁i', 'rd', '▁and', '▁ปี', '▁▁▁', 'ird', '▁▁▁', 'ด', '\n', 'Hey', '▁how', '▁are', '▁you', '▁doing']  # fmt: skip
+    integration_expected_decoded_text = "This is a test 😊\nI was born in 92000, and this is falsé.\n生活的真谛是\nHi  Hello\nHi   Hello\n\n \n  \n Hello\n\nhithere\nThe following string should be properly encoded: Hello.\nBut ird and ปี   ird   ด\nHey how are you doing"
diff --git a/tests/models/gemma3/test_processing_gemma3.py b/tests/models/gemma3/test_processing_gemma3.py
index 455731b71ca9..4f565806afbc 100644
--- a/tests/models/gemma3/test_processing_gemma3.py
+++ b/tests/models/gemma3/test_processing_gemma3.py
@@ -40,10 +40,7 @@ def _setup_image_processor(cls):
             "pan_and_scan_max_num_crops": 4,
             "pan_and_scan_min_ratio_to_activate": 1.2,
         }
-        image_processor = image_processor_class.from_pretrained(
-            "google/siglip-so400m-patch14-384", **gemma3_image_processor_kwargs
-        )
-        return image_processor
+        return image_processor_class(**gemma3_image_processor_kwargs)
 
     @classmethod
     def _setup_tokenizer(cls):
@@ -128,7 +125,15 @@ def test_pan_and_scan(self):
 
         # base image + 4 crops
         self.assertEqual(len(inputs[self.images_input_name]), 5)
-        self.assertEqual(len(inputs[self.text_input_name][0]), 67)
+        baseline = processor(
+            text=input_str,
+            images=image_input,
+            return_tensors="pt",
+            do_pan_and_scan=False,
+            image_seq_length=2,
+            pan_and_scan_min_crop_size=10,
+        )
+        self.assertGreater(len(inputs[self.text_input_name][0]), len(baseline[self.text_input_name][0]))
 
     def test_special_mm_token_truncation(self):
         """Tests that special vision tokens do not get truncated when `truncation=True` is set."""
diff --git a/tests/models/gpt2/test_tokenization_gpt2.py b/tests/models/gpt2/test_tokenization_gpt2.py
index be6b90bc4637..de85bde7666d 100644
--- a/tests/models/gpt2/test_tokenization_gpt2.py
+++ b/tests/models/gpt2/test_tokenization_gpt2.py
@@ -13,122 +13,24 @@
 # limitations under the License.
 
 
-import json
-import os
 import unittest
 
-from transformers import AutoTokenizer, GPT2Tokenizer, GPT2TokenizerFast
-from transformers.models.gpt2.tokenization_gpt2 import VOCAB_FILES_NAMES
-from transformers.testing_utils import require_tiktoken, require_tokenizers
+from transformers import AutoTokenizer, GPT2Tokenizer
+from transformers.testing_utils import require_jinja, require_tiktoken, require_tokenizers
 
 from ...test_tokenization_common import TokenizerTesterMixin
 
 
 @require_tokenizers
 class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "openai-community/gpt2"
+    from_pretrained_id = ["openai-community/gpt2"]
     tokenizer_class = GPT2Tokenizer
-    rust_tokenizer_class = GPT2TokenizerFast
-    test_rust_tokenizer = True
-    from_pretrained_kwargs = {"add_prefix_space": True}
-    test_seq2seq = False
+    from_pretrained_kwargs = {"add_prefix_space": False}
 
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-
-        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
-        vocab = [
-            "l",
-            "o",
-            "w",
-            "e",
-            "r",
-            "s",
-            "t",
-            "i",
-            "d",
-            "n",
-            "\u0120",
-            "\u0120l",
-            "\u0120n",
-            "\u0120lo",
-            "\u0120low",
-            "er",
-            "\u0120lowest",
-            "\u0120newer",
-            "\u0120wider",
-            "",
-            "<|endoftext|>",
-        ]
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        cls.special_tokens_map = {"unk_token": ""}
-
-        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(cls.merges_file, "w", encoding="utf-8") as fp:
-            fp.write("\n".join(merges))
-
-    @classmethod
-    def get_tokenizer(cls, pretrained_name=None, **kwargs):
-        kwargs.update(cls.special_tokens_map)
-        pretrained_name = pretrained_name or cls.tmpdirname
-        return GPT2Tokenizer.from_pretrained(pretrained_name, **kwargs)
-
-    @classmethod
-    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
-        kwargs.update(cls.special_tokens_map)
-        pretrained_name = pretrained_name or cls.tmpdirname
-        return GPT2TokenizerFast.from_pretrained(pretrained_name, **kwargs)
-
-    def get_input_output_texts(self, tokenizer):
-        input_text = "lower newer"
-        output_text = "lower newer"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
-        text = "lower newer"
-        bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
-        tokens = tokenizer.tokenize(text, add_prefix_space=True)
-        self.assertListEqual(tokens, bpe_tokens)
-
-        input_tokens = tokens + [tokenizer.unk_token]
-        input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
-
-    def test_rust_and_python_full_tokenizers(self):
-        if not self.test_rust_tokenizer:
-            self.skipTest(reason="test_rust_tokenizer is set to False")
-
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
-
-        sequence = "lower newer"
-
-        # Testing tokenization
-        tokens = tokenizer.tokenize(sequence, add_prefix_space=True)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        # Testing conversion to ids without special tokens
-        ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        # Testing conversion to ids with special tokens
-        rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
-        ids = tokenizer.encode(sequence, add_prefix_space=True)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-        # Testing the unknown token
-        input_tokens = tokens + [rust_tokenizer.unk_token]
-        input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
-        self.assertListEqual(rust_tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+    integration_expected_tokens = ['This', 'Ġis', 'Ġa', 'Ġtest', 'ĠðŁĺ', 'Ĭ', 'Ċ', 'I', 'Ġwas', 'Ġborn', 'Ġin', 'Ġ92', '000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġfals', 'é', '.', 'Ċ', 'çĶŁ', 'æ', '´', '»', 'çļĦ', 'çľ', 'Ł', 'è', '°', 'Ľ', 'æĺ¯', 'Ċ', 'Hi', 'Ġ', 'ĠHello', 'Ċ', 'Hi', 'Ġ', 'Ġ', 'ĠHello', 'ĊĊ', 'Ġ', 'Ċ', 'Ġ', 'Ġ', 'Ċ', 'ĠHello', 'Ċ', '<', 's', '>', 'Ċ', 'hi', '<', 's', '>', 'there', 'Ċ', 'The', 'Ġfollowing', 'Ġstring', 'Ġshould', 'Ġbe', 'Ġproperly', 'Ġencoded', ':', 'ĠHello', '.', 'Ċ', 'But', 'Ġ', 'ird', 'Ġand', 'Ġ', 'à¸', 'Ľ', 'à¸', 'µ', 'Ġ', 'Ġ', 'Ġ', 'ird', 'Ġ', 'Ġ', 'Ġ', 'à¸', 'Ķ', 'Ċ', 'Hey', 'Ġhow', 'Ġare', 'Ġyou', 'Ġdoing']  # fmt: skip
+    integration_expected_token_ids = [1212, 318, 257, 1332, 30325, 232, 198, 40, 373, 4642, 287, 10190, 830, 11, 290, 428, 318, 27807, 2634, 13, 198, 37955, 162, 112, 119, 21410, 40367, 253, 164, 108, 249, 42468, 198, 17250, 220, 18435, 198, 17250, 220, 220, 18435, 628, 220, 198, 220, 220, 198, 18435, 198, 27, 82, 29, 198, 5303, 27, 82, 29, 8117, 198, 464, 1708, 4731, 815, 307, 6105, 30240, 25, 18435, 13, 198, 1537, 220, 1447, 290, 220, 19567, 249, 19567, 113, 220, 220, 220, 1447, 220, 220, 220, 19567, 242, 198, 10814, 703, 389, 345, 1804]  # fmt: skip
+    expected_tokens_from_ids = ['This', 'Ġis', 'Ġa', 'Ġtest', 'ĠðŁĺ', 'Ĭ', 'Ċ', 'I', 'Ġwas', 'Ġborn', 'Ġin', 'Ġ92', '000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġfals', 'é', '.', 'Ċ', 'çĶŁ', 'æ', '´', '»', 'çļĦ', 'çľ', 'Ł', 'è', '°', 'Ľ', 'æĺ¯', 'Ċ', 'Hi', 'Ġ', 'ĠHello', 'Ċ', 'Hi', 'Ġ', 'Ġ', 'ĠHello', 'ĊĊ', 'Ġ', 'Ċ', 'Ġ', 'Ġ', 'Ċ', 'ĠHello', 'Ċ', '<', 's', '>', 'Ċ', 'hi', '<', 's', '>', 'there', 'Ċ', 'The', 'Ġfollowing', 'Ġstring', 'Ġshould', 'Ġbe', 'Ġproperly', 'Ġencoded', ':', 'ĠHello', '.', 'Ċ', 'But', 'Ġ', 'ird', 'Ġand', 'Ġ', 'à¸', 'Ľ', 'à¸', 'µ', 'Ġ', 'Ġ', 'Ġ', 'ird', 'Ġ', 'Ġ', 'Ġ', 'à¸', 'Ķ', 'Ċ', 'Hey', 'Ġhow', 'Ġare', 'Ġyou', 'Ġdoing']  # fmt: skip
+    integration_expected_decoded_text = "This is a test 😊\nI was born in 92000, and this is falsé.\n生活的真谛是\nHi  Hello\nHi   Hello\n\n \n  \n Hello\n\nhithere\nThe following string should be properly encoded: Hello.\nBut ird and ปี   ird   ด\nHey how are you doing"
 
     @unittest.skip
     def test_pretokenized_inputs(self, *args, **kwargs):
@@ -136,122 +38,6 @@ def test_pretokenized_inputs(self, *args, **kwargs):
         # And get both GPT2 and Roberta to work at the same time (mostly an issue of adding a space before the string)
         pass
 
-    def test_padding(self, max_length=15):
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
-
-                # Simple input
-                s = "This is a simple input"
-                s2 = ["This is a simple input 1", "This is a simple input 2"]
-                p = ("This is a simple input", "This is a pair")
-                p2 = [
-                    ("This is a simple input 1", "This is a simple input 2"),
-                    ("This is a simple pair 1", "This is a simple pair 2"),
-                ]
-
-                # Simple input tests
-                self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, padding="max_length")
-
-                # Simple input
-                self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, padding="max_length")
-
-                # Simple input
-                self.assertRaises(
-                    ValueError,
-                    tokenizer_r.batch_encode_plus,
-                    s2,
-                    max_length=max_length,
-                    padding="max_length",
-                )
-
-                # Pair input
-                self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, padding="max_length")
-
-                # Pair input
-                self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, padding="max_length")
-
-                # Pair input
-                self.assertRaises(
-                    ValueError,
-                    tokenizer_r.batch_encode_plus,
-                    p2,
-                    max_length=max_length,
-                    padding="max_length",
-                )
-
-    def test_padding_if_pad_token_set_slow(self):
-        tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname, pad_token="")
-
-        # Simple input
-        s = "This is a simple input"
-        s2 = ["This is a simple input looooooooong", "This is a simple input"]
-        p = ("This is a simple input", "This is a pair")
-        p2 = [
-            ("This is a simple input loooooong", "This is a simple input"),
-            ("This is a simple pair loooooong", "This is a simple pair"),
-        ]
-
-        pad_token_id = tokenizer.pad_token_id
-
-        out_s = tokenizer(s, padding="max_length", max_length=30, return_tensors="np")
-        out_s2 = tokenizer(s2, padding=True, truncate=True, return_tensors="np")
-        out_p = tokenizer(*p, padding="max_length", max_length=60, return_tensors="np")
-        out_p2 = tokenizer(p2, padding=True, truncate=True, return_tensors="np")
-
-        # s
-        # test single string max_length padding
-        self.assertEqual(out_s["input_ids"].shape[-1], 30)
-        self.assertTrue(pad_token_id in out_s["input_ids"])
-        self.assertTrue(0 in out_s["attention_mask"])
-
-        # s2
-        # test automatic padding
-        self.assertEqual(out_s2["input_ids"].shape[-1], 33)
-        # long slice doesn't have padding
-        self.assertFalse(pad_token_id in out_s2["input_ids"][0])
-        self.assertFalse(0 in out_s2["attention_mask"][0])
-        # short slice does have padding
-        self.assertTrue(pad_token_id in out_s2["input_ids"][1])
-        self.assertTrue(0 in out_s2["attention_mask"][1])
-
-        # p
-        # test single pair max_length padding
-        self.assertEqual(out_p["input_ids"].shape[-1], 60)
-        self.assertTrue(pad_token_id in out_p["input_ids"])
-        self.assertTrue(0 in out_p["attention_mask"])
-
-        # p2
-        # test automatic padding pair
-        self.assertEqual(out_p2["input_ids"].shape[-1], 52)
-        # long slice pair doesn't have padding
-        self.assertFalse(pad_token_id in out_p2["input_ids"][0])
-        self.assertFalse(0 in out_p2["attention_mask"][0])
-        # short slice pair does have padding
-        self.assertTrue(pad_token_id in out_p2["input_ids"][1])
-        self.assertTrue(0 in out_p2["attention_mask"][1])
-
-    def test_add_bos_token_slow(self):
-        bos_token = "$$$"
-        tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname, bos_token=bos_token, add_bos_token=True)
-
-        s = "This is a simple input"
-        s2 = ["This is a simple input 1", "This is a simple input 2"]
-
-        bos_token_id = tokenizer.bos_token_id
-
-        out_s = tokenizer(s)
-        out_s2 = tokenizer(s2)
-
-        self.assertEqual(out_s.input_ids[0], bos_token_id)
-        self.assertTrue(all(o[0] == bos_token_id for o in out_s2.input_ids))
-
-        decode_s = tokenizer.decode(out_s.input_ids)
-        decode_s2 = tokenizer.batch_decode(out_s2.input_ids)
-
-        self.assertTrue(decode_s.startswith(bos_token))
-        self.assertTrue(all(d.startswith(bos_token) for d in decode_s2))
-
     @unittest.skip(reason="tokenizer has no padding token")
     def test_padding_different_model_input_name(self):
         pass
@@ -265,7 +51,7 @@ def test_special_tokens_mask_input_pairs_and_bos_token(self):
                 sequence_1 = "This one too please."
                 encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
                 encoded_sequence += tokenizer.encode(sequence_1, add_special_tokens=False)
-                encoded_sequence_dict = tokenizer.encode_plus(
+                encoded_sequence_dict = tokenizer(
                     sequence_0,
                     sequence_1,
                     add_special_tokens=True,
@@ -281,6 +67,26 @@ def test_special_tokens_mask_input_pairs_and_bos_token(self):
                 filtered_sequence = [x for x in filtered_sequence if x is not None]
                 self.assertEqual(encoded_sequence, filtered_sequence)
 
+    @require_jinja
+    def test_tokenization_for_chat(self):
+        tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname)
+        tokenizer.chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"
+        test_chats = [
+            [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
+            [
+                {"role": "system", "content": "You are a helpful chatbot."},
+                {"role": "user", "content": "Hello!"},
+                {"role": "assistant", "content": "Nice to meet you."},
+            ],
+            [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}],
+        ]
+        tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
+        # fmt: off
+        expected_tokens = [[1639, 389, 257, 7613, 8537, 13645, 13, 50256, 15496, 0, 50256], [1639, 389, 257, 7613, 8537, 13645, 13, 50256, 15496, 0, 50256, 35284, 284, 1826, 345, 13, 50256], [35284, 284, 1826, 345, 13, 50256, 15496, 0, 50256]]
+        # fmt: on
+        for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
+            self.assertListEqual(tokenized_chat, expected_tokens)
+
     @require_tiktoken
     def test_tokenization_tiktoken(self):
         from tiktoken import encoding_name_for_model
@@ -290,8 +96,8 @@ def test_tokenization_tiktoken(self):
         encoding = encoding_name_for_model("gpt2")
         convert_tiktoken_to_fast(encoding, self.tmpdirname)
 
-        tiktoken_fast_tokenizer = GPT2TokenizerFast.from_pretrained(self.tmpdirname)
-        rust_tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
+        tiktoken_fast_tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname)
+        rust_tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
         sequence = "lower newer"
         self.assertEqual(
             rust_tokenizer.decode(rust_tokenizer.encode(sequence)),
@@ -307,7 +113,7 @@ def test_serialize_deserialize_fast_opt(self):
         # https://huggingface.slack.com/archives/C01N44FJDHT/p1653511495183519
         # https://github.com/huggingface/transformers/pull/17088#discussion_r871246439
 
-        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m", from_slow=True)
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
         text = "A photo of a cat"
 
         tokens_ids = tokenizer.encode(
@@ -323,7 +129,7 @@ def test_serialize_deserialize_fast_opt(self):
         self.assertEqual(tokens_ids, [2, 250, 1345, 9, 10, 4758])
 
     def test_fast_slow_equivalence(self):
-        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m", use_slow=True)
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
         text = "A photo of a cat"
 
         tokens_ids = tokenizer.encode(
@@ -334,7 +140,7 @@ def test_fast_slow_equivalence(self):
 
     @unittest.skip(reason="This test is failing because of a bug in the fast tokenizer")
     def test_users_can_modify_bos(self):
-        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m", from_slow=True)
+        tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
 
         tokenizer.bos_token = "bos"
         tokenizer.bos_token_id = tokenizer.get_vocab()["bos"]
diff --git a/tests/models/gpt_neox/test_tokenization_gpt_neox.py b/tests/models/gpt_neox/test_tokenization_gpt_neox.py
new file mode 100644
index 000000000000..d72c189fa09e
--- /dev/null
+++ b/tests/models/gpt_neox/test_tokenization_gpt_neox.py
@@ -0,0 +1,31 @@
+# Copyright 2022 EleutherAI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers import GPTNeoXTokenizer
+from transformers.testing_utils import require_tokenizers
+
+from ...test_tokenization_common import TokenizerTesterMixin
+
+
+@require_tokenizers
+class GPTNeoXTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    from_pretrained_id = "EleutherAI/gpt-neox-20b"
+    tokenizer_class = GPTNeoXTokenizer
+
+    integration_expected_tokens = ['This', 'Ġis', 'Ġa', 'Ġtest', 'ĠðŁĺ', 'Ĭ', 'Ċ', 'I', 'Ġwas', 'Ġborn', 'Ġin', 'Ġ9', '2000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġfals', 'é', '.', 'Ċ', 'çĶŁ', 'æ´»', 'çļĦ', '羣', 'è°', 'Ľ', 'æĺ¯', 'Ċ', 'Hi', '  ', 'Hello', 'Ċ', 'Hi', '   ', 'Hello', 'ĊĊĠĊ', '  ', 'Ċ', 'ĠHello', 'Ċ', '<', 's', '>', 'Ċ', 'hi', '<', 's', '>', 'there', 'Ċ', 'The', 'Ġfollowing', 'Ġstring', 'Ġshould', 'Ġbe', 'Ġproperly', 'Ġencoded', ':', 'ĠHello', '.', 'Ċ', 'But', 'Ġ', 'ird', 'Ġand', 'Ġ', 'à¸', 'Ľ', 'ี', '   ', 'ird', '   ', 'à¸Ķ', 'Ċ', 'Hey', 'Ġhow', 'Ġare', 'Ġyou', 'Ġdoing']  # fmt: skip
+    integration_expected_token_ids = [1552, 310, 247, 1071, 49042, 221, 187, 42, 369, 5686, 275, 898, 6914, 13, 285, 436, 310, 21649, 860, 15, 187, 20025, 46549, 5225, 48561, 33656, 238, 12105, 187, 12764, 50276, 12092, 187, 12764, 50275, 12092, 46603, 50276, 187, 24387, 187, 29, 84, 31, 187, 5801, 29, 84, 31, 9088, 187, 510, 1563, 2876, 943, 320, 6283, 16202, 27, 24387, 15, 187, 1989, 209, 1817, 285, 209, 2869, 238, 26863, 50275, 1817, 50275, 35071, 187, 8262, 849, 403, 368, 2509]  # fmt: skip
+    expected_tokens_from_ids = ['This', 'Ġis', 'Ġa', 'Ġtest', 'ĠðŁĺ', 'Ĭ', 'Ċ', 'I', 'Ġwas', 'Ġborn', 'Ġin', 'Ġ9', '2000', ',', 'Ġand', 'Ġthis', 'Ġis', 'Ġfals', 'é', '.', 'Ċ', 'çĶŁ', 'æ´»', 'çļĦ', '羣', 'è°', 'Ľ', 'æĺ¯', 'Ċ', 'Hi', '  ', 'Hello', 'Ċ', 'Hi', '   ', 'Hello', 'ĊĊĠĊ', '  ', 'Ċ', 'ĠHello', 'Ċ', '<', 's', '>', 'Ċ', 'hi', '<', 's', '>', 'there', 'Ċ', 'The', 'Ġfollowing', 'Ġstring', 'Ġshould', 'Ġbe', 'Ġproperly', 'Ġencoded', ':', 'ĠHello', '.', 'Ċ', 'But', 'Ġ', 'ird', 'Ġand', 'Ġ', 'à¸', 'Ľ', 'ี', '   ', 'ird', '   ', 'à¸Ķ', 'Ċ', 'Hey', 'Ġhow', 'Ġare', 'Ġyou', 'Ġdoing']  # fmt: skip
+    integration_expected_decoded_text = "This is a test 😊\nI was born in 92000, and this is falsé.\n生活的真谛是\nHi  Hello\nHi   Hello\n\n \n  \n Hello\n\nhithere\nThe following string should be properly encoded: Hello.\nBut ird and ปี   ird   ด\nHey how are you doing"
diff --git a/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py b/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py
index 0e4732f27062..39be0da37bc9 100644
--- a/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py
+++ b/tests/models/gpt_neox_japanese/test_tokenization_gpt_neox_japanese.py
@@ -138,3 +138,7 @@ def test_conversion_reversible(self):
     @unittest.skip(reason="tokenizer has no padding token")
     def test_padding_different_model_input_name(self):
         pass
+
+    @unittest.skip(reason="sequence_ids() is not available for Python backend tokenizers")
+    def test_sequence_ids(self):
+        pass
diff --git a/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py b/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py
index c77eaecede2a..6d3fd89a91ea 100644
--- a/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py
+++ b/tests/models/gpt_sw3/test_tokenization_gpt_sw3.py
@@ -15,7 +15,7 @@
 import unittest
 
 from transformers import GPTSw3Tokenizer
-from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow
+from transformers.testing_utils import get_tests_dir, require_jinja, require_sentencepiece, require_tokenizers, slow
 
 from ...test_tokenization_common import TokenizerTesterMixin
 
@@ -37,7 +37,9 @@ def setUpClass(cls):
         super().setUpClass()
 
         # We have a SentencePiece fixture for testing
-        tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB, eos_token="", bos_token="", pad_token="")
+        tokenizer = GPTSw3Tokenizer(
+            SAMPLE_VOCAB, eos_token="", bos_token="", pad_token="", name_or_path="test"
+        )
 
         tokenizer.save_pretrained(cls.tmpdirname)
 
@@ -66,7 +68,7 @@ def test_vocab_size(self):
         self.assertEqual(self.get_tokenizer().vocab_size, 2_000)
 
     def test_full_tokenizer(self):
-        tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB)
+        tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB, name_or_path="test")
 
         tokens = tokenizer.tokenize("This is a test")
         self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
@@ -96,7 +98,7 @@ def test_full_tokenizer(self):
         # fmt: on
 
     def test_fast_encode_decode(self):
-        tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB)
+        tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB, name_or_path="test")
         texts = ["This is a test", "I was born in 92000, and this is falsé."]
         expected_ids_list = [
             [465, 287, 265, 631, 842],
@@ -127,3 +129,36 @@ def test_tokenizer_integration(self):
             model_name="AI-Sweden-Models/gpt-sw3-126m",
             sequences=sequences,
         )
+
+    @require_jinja
+    def test_tokenization_for_chat(self):
+        tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB, name_or_path="test")
+        tokenizer.chat_template = (
+            "{{ eos_token }}{{ bos_token }}"
+            "{% for message in messages %}"
+            "{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}"
+            "{% else %}{{ 'Bot: ' + message['content']}}{% endif %}"
+            "{{ message['text'] }}{{ bos_token }}"
+            "{% endfor %}"
+            "Bot:"
+        )
+        # This is in English, but it's just here to make sure the chat control tokens are being added properly
+        test_chats = [
+            [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}],
+            [
+                {"role": "system", "content": "You are a helpful chatbot."},
+                {"role": "user", "content": "Hello!"},
+                {"role": "assistant", "content": "Nice to meet you."},
+            ],
+            [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}],
+        ]
+        tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats]
+        # fmt: off
+        expected_tokens = [
+            [2000, 1, 575, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419],
+            [2000, 1, 575, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 1, 575, 541, 419],
+            [2000, 1, 575, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419]
+            ]
+        # fmt: on
+        for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
+            self.assertListEqual(tokenized_chat, expected_tokens)
diff --git a/tests/models/herbert/test_tokenization_herbert.py b/tests/models/herbert/test_tokenization_herbert.py
index 6bd95000d620..90270f799e4c 100644
--- a/tests/models/herbert/test_tokenization_herbert.py
+++ b/tests/models/herbert/test_tokenization_herbert.py
@@ -1,4 +1,4 @@
-# Copyright 2018 The Google AI Language Team Authors, Allegro.pl and The HuggingFace Inc. team.
+# Copyright 2020 HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,129 +13,20 @@
 # limitations under the License.
 
 
-import json
-import os
 import unittest
 
-from transformers import HerbertTokenizer, HerbertTokenizerFast
-from transformers.models.herbert.tokenization_herbert import VOCAB_FILES_NAMES
-from transformers.testing_utils import get_tests_dir, require_sacremoses, require_tokenizers, slow
+from transformers import HerbertTokenizer
+from transformers.testing_utils import require_tokenizers
 
 from ...test_tokenization_common import TokenizerTesterMixin
 
 
-@require_sacremoses
 @require_tokenizers
 class HerbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_id = "allegro/herbert-base-cased"
     tokenizer_class = HerbertTokenizer
-    rust_tokenizer_class = HerbertTokenizerFast
-    test_rust_tokenizer = True
 
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-
-        # Use a simpler test file without japanese/chinese characters
-        with open(f"{get_tests_dir()}/fixtures/sample_text_no_unicode.txt", encoding="utf-8") as f_data:
-            cls._data = f_data.read().replace("\n\n", "\n").strip()
-
-        vocab = [
-            "",
-            "",
-            "l",
-            "o",
-            "w",
-            "e",
-            "r",
-            "s",
-            "t",
-            "i",
-            "d",
-            "n",
-            "w",
-            "r",
-            "t",
-            "lo",
-            "low",
-            "er",
-            "low",
-            "lowest",
-            "newer",
-            "wider",
-            ",",
-            "",
-        ]
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["l o 123", "lo w 1456", "e r 1789", ""]
-
-        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(cls.vocab_file, "w") as fp:
-            fp.write(json.dumps(vocab_tokens))
-        with open(cls.merges_file, "w") as fp:
-            fp.write("\n".join(merges))
-
-    def get_input_output_texts(self, tokenizer):
-        input_text = "lower newer"
-        output_text = "lower newer"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = self.tokenizer_class(vocab_file=self.vocab_file, merges_file=self.merges_file)
-
-        text = "lower"
-        bpe_tokens = ["low", "er"]
-        tokens = tokenizer.tokenize(text)
-        self.assertListEqual(tokens, bpe_tokens)
-
-        input_tokens = tokens + [""]
-        input_bpe_tokens = [16, 17, 23]
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
-
-    def test_rust_and_python_full_tokenizers(self):
-        if not self.test_rust_tokenizer:
-            self.skipTest(reason="test_rust_tokenizer is set to False")
-
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer()
-
-        sequence = "lower,newer"
-
-        tokens = tokenizer.tokenize(sequence)
-        rust_tokens = rust_tokenizer.tokenize(sequence)
-        self.assertListEqual(tokens, rust_tokens)
-
-        ids = tokenizer.encode(sequence, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        rust_tokenizer = self.get_rust_tokenizer()
-        ids = tokenizer.encode(sequence)
-        rust_ids = rust_tokenizer.encode(sequence)
-        self.assertListEqual(ids, rust_ids)
-
-    @slow
-    def test_sequence_builders(self):
-        tokenizer = self.tokenizer_class.from_pretrained("allegro/herbert-base-cased")
-
-        text = tokenizer.encode("konstruowanie sekwencji", add_special_tokens=False)
-        text_2 = tokenizer.encode("konstruowanie wielu sekwencji", add_special_tokens=False)
-
-        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
-        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
-
-        assert encoded_sentence == [0] + text + [2]
-        assert encoded_pair == [0] + text + [2] + text_2 + [2]
-
-    @unittest.skip(
-        "Test passes if run individually but not with the full tests (internal state of the tokenizer is modified). Will fix later"
-    )
-    def test_training_new_tokenizer_with_special_tokens_change(self):
-        pass
-
-    @unittest.skip(
-        "Test passes if run individually but not with the full tests (internal state of the tokenizer is modified). Will fix later"
-    )
-    def test_training_new_tokenizer(self):
-        pass
+    integration_expected_tokens = ['T', 'his', 'is', 'a', 'test', '', 'I', 'was', 'bor', 'n', 'in', '9', '2000', ',', 'and', 'this', 'is', 'fal', 's', 'é', '.', '', '', '', '', '', '是', 'H', 'i', 'Hel', 'lo', 'H', 'i', 'Hel', 'lo', 'Hel', 'lo', '', 'hi', '', 'ther', 'e', 'The', 'fol', 'low', 'ing', 'str', 'ing', 'sho', 'uld', 'be', 'pro', 'per', 'ly', 'en', 'c', 'ode', 'd', ':', 'Hel', 'lo', '.', 'Bu', 't', 'ir', 'd', 'and', '', 'ี', 'ir', 'd', 'ด', 'He', 'y', 'ho', 'w', 'are', 'you', 'do', 'ing']  # fmt: skip
+    integration_expected_token_ids = [56, 22855, 6869, 1011, 14825, 3, 1056, 9873, 2822, 1016, 2651, 29, 3450, 1947, 7158, 48846, 6869, 7355, 87, 1093, 1899, 3, 3, 3, 3, 3, 1776, 44, 1009, 12156, 6170, 44, 1009, 12156, 6170, 12156, 6170, 0, 21566, 0, 40445, 1015, 7117, 9929, 13194, 5129, 15948, 5129, 14924, 48273, 11072, 2088, 3040, 8172, 2058, 71, 3909, 1038, 1335, 12156, 6170, 1899, 3025, 1026, 17435, 1038, 7158, 3, 1085, 17435, 1038, 1579, 4596, 1005, 3145, 1019, 25720, 20254, 2065, 5129]  # fmt: skip
+    expected_tokens_from_ids = ['T', 'his', 'is', 'a', 'test', '', 'I', 'was', 'bor', 'n', 'in', '9', '2000', ',', 'and', 'this', 'is', 'fal', 's', 'é', '.', '', '', '', '', '', '是', 'H', 'i', 'Hel', 'lo', 'H', 'i', 'Hel', 'lo', 'Hel', 'lo', '', 'hi', '', 'ther', 'e', 'The', 'fol', 'low', 'ing', 'str', 'ing', 'sho', 'uld', 'be', 'pro', 'per', 'ly', 'en', 'c', 'ode', 'd', ':', 'Hel', 'lo', '.', 'Bu', 't', 'ir', 'd', 'and', '', 'ี', 'ir', 'd', 'ด', 'He', 'y', 'ho', 'w', 'are', 'you', 'do', 'ing']  # fmt: skip
+    integration_expected_decoded_text = "This is a test I was born in 92000 , and this is falsé . 是 Hi Hello Hi Hello Hello hi there The following string should be properly encoded : Hello . But ird and ี ird ด Hey how are you doing"
diff --git a/tests/models/idefics/test_processing_idefics.py b/tests/models/idefics/test_processing_idefics.py
index ceb5a0f0a65c..f49942a4a4d8 100644
--- a/tests/models/idefics/test_processing_idefics.py
+++ b/tests/models/idefics/test_processing_idefics.py
@@ -110,8 +110,8 @@ def test_tokenizer_padding(self):
         processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor, return_tensors="pt")
 
         predicted_tokens = [
-            " Describe this image.\nAssistant:",
-            " Describe this image.\nAssistant:",
+            "Describe this image.\nAssistant:",
+            "Describe this image.\nAssistant:",
         ]
         predicted_attention_masks = [
             ([1] * 10) + ([0] * 9),
@@ -136,8 +136,8 @@ def test_tokenizer_left_padding(self):
         processor = self.get_processor()
 
         predicted_tokens = [
-            " Describe this image.\nAssistant:",
-            " Describe this image.\nAssistant:",
+            "Describe this image.\nAssistant:",
+            "Describe this image.\nAssistant:",
         ]
         predicted_attention_masks = [
             ([0] * 9) + ([1] * 10),
@@ -155,3 +155,19 @@ def test_tokenizer_left_padding(self):
 
         self.assertListEqual(max_length["attention_mask"][-1].tolist(), predicted_attention_masks[1])
         self.assertListEqual(longest["attention_mask"][-1].tolist(), predicted_attention_masks[0])
+
+    def test_tokenizer_defaults(self):
+        # Override to account for the processor prefixing the BOS token to prompts.
+        components = {attribute: self.get_component(attribute) for attribute in self.processor_class.get_attributes()}
+        processor = self.processor_class(**components)
+        tokenizer = components["tokenizer"]
+
+        input_str = ["lower newer"]
+        encoded_processor = processor(text=input_str, padding=False, return_tensors="pt")
+        encoded_tok = tokenizer(
+            [f"{tokenizer.bos_token}{input_str[0]}"], padding=False, add_special_tokens=False, return_tensors="pt"
+        )
+
+        for key in encoded_tok:
+            if key in encoded_processor:
+                self.assertListEqual(encoded_tok[key].tolist(), encoded_processor[key].tolist())
diff --git a/tests/models/kosmos2/test_processing_kosmos2.py b/tests/models/kosmos2/test_processing_kosmos2.py
index b2d83a25639f..04388b5159da 100644
--- a/tests/models/kosmos2/test_processing_kosmos2.py
+++ b/tests/models/kosmos2/test_processing_kosmos2.py
@@ -27,6 +27,7 @@
     require_torch,
     require_vision,
 )
+from transformers.tokenization_utils_sentencepiece import SentencePieceExtractor
 from transformers.utils import is_vision_available
 
 from ...test_processing_common import ProcessorTesterMixin, url_to_local_path
@@ -36,10 +37,10 @@
     from PIL import Image
 
     from transformers import (
+        AutoProcessor,
         CLIPImageProcessor,
         Kosmos2Processor,
         XLMRobertaTokenizer,
-        XLMRobertaTokenizerFast,
     )
 
 
@@ -55,9 +56,15 @@ class Kosmos2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     @classmethod
     def _setup_tokenizer(cls):
         # We have a SentencePiece fixture for testing
-        slow_tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB)
-        fast_tokenizer = XLMRobertaTokenizerFast(__slow_tokenizer=slow_tokenizer)
-        return fast_tokenizer
+        extractor = SentencePieceExtractor(SAMPLE_VOCAB)
+        _, vocab_scores, _ = extractor.extract()
+        return XLMRobertaTokenizer(vocab=vocab_scores)
+
+    def get_tokenizer(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
+
+    def get_image_processor(self, **kwargs):
+        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
 
     @classmethod
     def _setup_image_processor(cls):
diff --git a/tests/models/layoutlm/test_tokenization_layoutlm.py b/tests/models/layoutlm/test_tokenization_layoutlm.py
deleted file mode 100644
index 83df38a3b153..000000000000
--- a/tests/models/layoutlm/test_tokenization_layoutlm.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Copyright 2018 The Microsoft Research Asia LayoutLM Team Authors, The Hugging Face Team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-import unittest
-
-from transformers import LayoutLMTokenizer, LayoutLMTokenizerFast
-from transformers.models.layoutlm.tokenization_layoutlm import VOCAB_FILES_NAMES
-from transformers.testing_utils import require_tokenizers
-
-from ...test_tokenization_common import TokenizerTesterMixin
-
-
-@require_tokenizers
-class LayoutLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "microsoft/layoutlm-base-uncased"
-    tokenizer_class = LayoutLMTokenizer
-    rust_tokenizer_class = LayoutLMTokenizerFast
-    test_rust_tokenizer = True
-    space_between_special_tokens = True
-
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-
-        vocab_tokens = [
-            "[UNK]",
-            "[CLS]",
-            "[SEP]",
-            "want",
-            "##want",
-            "##ed",
-            "wa",
-            "un",
-            "runn",
-            "##ing",
-            ",",
-            "low",
-            "lowest",
-        ]
-        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-
-    @classmethod
-    def get_tokenizer(cls, pretrained_name=None, **kwargs):
-        pretrained_name = pretrained_name or cls.tmpdirname
-        return LayoutLMTokenizer.from_pretrained(pretrained_name, **kwargs)
-
-    def get_input_output_texts(self, tokenizer):
-        input_text = "UNwant\u00e9d,running"
-        output_text = "unwanted, running"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = self.tokenizer_class(self.vocab_file)
-
-        tokens = tokenizer.tokenize("UNwant\u00e9d,running")
-        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
-
-    @unittest.skip
-    def test_special_tokens_as_you_expect(self):
-        """If you are training a seq2seq model that expects a decoder_prefix token make sure it is prepended to decoder_input_ids"""
-        pass
diff --git a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
index b3e7adc68257..9c538e3dcef0 100644
--- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
+++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py
@@ -23,20 +23,14 @@
 
 from transformers import (
     AddedToken,
-    LayoutLMv2TokenizerFast,
-    SpecialTokensMixin,
+    LayoutLMv2Tokenizer,
+    PreTrainedTokenizerBase,
     is_mlx_available,
     is_torch_available,
     logging,
 )
 from transformers.models.layoutlmv2.tokenization_layoutlmv2 import (
     VOCAB_FILES_NAMES,
-    BasicTokenizer,
-    LayoutLMv2Tokenizer,
-    WordpieceTokenizer,
-    _is_control,
-    _is_punctuation,
-    _is_whitespace,
 )
 from transformers.testing_utils import (
     require_detectron2,
@@ -62,8 +56,8 @@
 class LayoutLMv2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_id = "microsoft/layoutlmv2-base-uncased"
     tokenizer_class = LayoutLMv2Tokenizer
-    rust_tokenizer_class = LayoutLMv2TokenizerFast
-    test_rust_tokenizer = True
+    rust_tokenizer_class = LayoutLMv2Tokenizer
+    test_rust_tokenizer = False
     space_between_special_tokens = True
     from_pretrained_filter = filter_non_english
     test_seq2seq = False
@@ -157,120 +151,57 @@ def setUpClass(cls):
         with open(cls.vocab_file, "w", encoding="utf-8") as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
 
+        # Load vocab from file and pass to tokenizer
+        vocab = {}
+        with open(cls.vocab_file, "r", encoding="utf-8") as reader:
+            for index, line in enumerate(reader):
+                token = line.rstrip("\n")
+                vocab[token] = index
+
+        tokenizer = cls.tokenizer_class(vocab=vocab)
+        tokenizer.save_pretrained(cls.tmpdirname)
+
     def get_input_output_texts(self, tokenizer):
         input_text = "UNwant\u00e9d,running"
         output_text = "unwanted, running"
         return input_text, output_text
 
-    def test_chinese(self):
-        tokenizer = BasicTokenizer()
-
-        self.assertListEqual(tokenizer.tokenize("ah\u535a\u63a8zz"), ["ah", "\u535a", "\u63a8", "zz"])
-
-    def test_basic_tokenizer_lower(self):
-        tokenizer = BasicTokenizer(do_lower_case=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["hello", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"])
-
-    def test_basic_tokenizer_lower_strip_accents_false(self):
-        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hällo", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["h\u00e9llo"])
-
-    def test_basic_tokenizer_lower_strip_accents_true(self):
-        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"])
-
-    def test_basic_tokenizer_lower_strip_accents_default(self):
-        tokenizer = BasicTokenizer(do_lower_case=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
-        )
-        self.assertListEqual(tokenizer.tokenize("H\u00e9llo"), ["hello"])
-
-    def test_basic_tokenizer_no_lower(self):
-        tokenizer = BasicTokenizer(do_lower_case=False)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
-        )
-
-    def test_basic_tokenizer_no_lower_strip_accents_false(self):
-        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False)
+    def convert_batch_encode_plus_format_to_encode_plus(self, batch_encode_plus_sequences):
+        """Helper method to convert batch_encode_plus output to list of encode_plus outputs"""
+        # Get the batch size
+        first_key = list(batch_encode_plus_sequences.keys())[0]
+        batch_size = len(batch_encode_plus_sequences[first_key])
 
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["HäLLo", "!", "how", "Are", "yoU", "?"]
-        )
-
-    def test_basic_tokenizer_no_lower_strip_accents_true(self):
-        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["HaLLo", "!", "how", "Are", "yoU", "?"]
-        )
-
-    def test_basic_tokenizer_respects_never_split_tokens(self):
-        tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])
+        # Convert to list of dicts
+        encode_plus_sequences = []
+        for i in range(batch_size):
+            single_sequence = {}
+            for key, value in batch_encode_plus_sequences.items():
+                if key != "encodings":  # Skip the encodings attribute
+                    single_sequence[key] = value[i]
+            encode_plus_sequences.append(single_sequence)
 
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
-        )
+        return encode_plus_sequences
 
     @unittest.skip(reason="Chat template tests don't play well with table/layout models.")
     def test_chat_template_batched(self):
         pass
 
-    def test_wordpiece_tokenizer(self):
-        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]
-
-        vocab = {}
-        for i, token in enumerate(vocab_tokens):
-            vocab[token] = i
-        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
-
-        self.assertListEqual(tokenizer.tokenize(""), [])
-
-        self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"])
-
-        self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
-
-    def test_is_whitespace(self):
-        self.assertTrue(_is_whitespace(" "))
-        self.assertTrue(_is_whitespace("\t"))
-        self.assertTrue(_is_whitespace("\r"))
-        self.assertTrue(_is_whitespace("\n"))
-        self.assertTrue(_is_whitespace("\u00a0"))
-
-        self.assertFalse(_is_whitespace("A"))
-        self.assertFalse(_is_whitespace("-"))
-
-    def test_is_control(self):
-        self.assertTrue(_is_control("\u0005"))
+    @unittest.skip(reason="LayoutLMv2 requires pre-tokenized words, not strings.")
+    def test_bos_token_with_add_bos_token_false(self):
+        pass
 
-        self.assertFalse(_is_control("A"))
-        self.assertFalse(_is_control(" "))
-        self.assertFalse(_is_control("\t"))
-        self.assertFalse(_is_control("\r"))
+    @unittest.skip(reason="LayoutLMv2 requires pre-tokenized words, not strings.")
+    def test_bos_token_with_add_bos_token_true(self):
+        pass
 
-    def test_is_punctuation(self):
-        self.assertTrue(_is_punctuation("-"))
-        self.assertTrue(_is_punctuation("$"))
-        self.assertTrue(_is_punctuation("`"))
-        self.assertTrue(_is_punctuation("."))
+    @unittest.skip(reason="LayoutLMv2 requires pre-tokenized words with boxes.")
+    def test_encode_basic_padding(self):
+        pass
 
-        self.assertFalse(_is_punctuation("A"))
-        self.assertFalse(_is_punctuation(" "))
+    @unittest.skip(reason="LayoutLMv2 requires pre-tokenized words with boxes.")
+    def test_pad_token_initialization(self):
+        pass
 
     def test_clean_text(self):
         tokenizer = self.get_tokenizer()
@@ -298,11 +229,11 @@ def test_sequence_builders(self):
     def test_offsets_with_special_characters(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_r = self.get_tokenizer(pretrained_name, **kwargs)
 
                 words, boxes = self.get_words_and_boxes()
                 words[1] = tokenizer_r.mask_token
-                tokens = tokenizer_r.encode_plus(
+                tokens = tokenizer_r(
                     words,
                     boxes=boxes,
                     return_attention_mask=False,
@@ -438,7 +369,7 @@ def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
                 padding_size = 10
                 padding_idx = tokenizer.pad_token_id
 
-                encoded_sequence = tokenizer.encode_plus(words, boxes=boxes, return_special_tokens_mask=True)
+                encoded_sequence = tokenizer(words, boxes=boxes, return_special_tokens_mask=True)
                 input_ids = encoded_sequence["input_ids"]
                 special_tokens_mask = encoded_sequence["special_tokens_mask"]
                 sequence_length = len(input_ids)
@@ -446,7 +377,7 @@ def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
                 # Test 'longest' and 'no_padding' don't do anything
                 tokenizer.padding_side = "right"
 
-                not_padded_sequence = tokenizer.encode_plus(
+                not_padded_sequence = tokenizer(
                     words,
                     boxes=boxes,
                     padding=False,
@@ -461,7 +392,7 @@ def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
                 self.assertTrue(input_ids == not_padded_input_ids)
                 self.assertTrue(special_tokens_mask == not_padded_special_tokens_mask)
 
-                not_padded_sequence = tokenizer.encode_plus(
+                not_padded_sequence = tokenizer(
                     words,
                     boxes=boxes,
                     padding=False,
@@ -488,7 +419,7 @@ def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
                 else:
                     tokenizer_kwargs_right["padding_side"] = "right"
 
-                right_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_right)
+                right_padded_sequence = tokenizer(words, boxes=boxes, **tokenizer_kwargs_right)
                 right_padded_input_ids = right_padded_sequence["input_ids"]
 
                 right_padded_special_tokens_mask = right_padded_sequence["special_tokens_mask"]
@@ -510,7 +441,7 @@ def test_encode_plus_with_padding(self, use_padding_as_call_kwarg: bool):
                 else:
                     tokenizer_kwargs_left["padding_side"] = "left"
 
-                left_padded_sequence = tokenizer.encode_plus(words, boxes=boxes, **tokenizer_kwargs_left)
+                left_padded_sequence = tokenizer(words, boxes=boxes, **tokenizer_kwargs_left)
                 left_padded_input_ids = left_padded_sequence["input_ids"]
                 left_padded_special_tokens_mask = left_padded_sequence["special_tokens_mask"]
                 left_padded_sequence_length = len(left_padded_input_ids)
@@ -566,7 +497,7 @@ def test_mask_output(self):
                     tokenizer.build_inputs_with_special_tokens.__qualname__.split(".")[0] != "PreTrainedTokenizer"
                     and "token_type_ids" in tokenizer.model_input_names
                 ):
-                    information = tokenizer.encode_plus(words, boxes=boxes, add_special_tokens=True)
+                    information = tokenizer(words, boxes=boxes, add_special_tokens=True)
                     sequences, mask = information["input_ids"], information["token_type_ids"]
                     self.assertEqual(len(sequences), len(mask))
 
@@ -601,7 +532,7 @@ def test_number_of_added_tokens(self):
     def test_padding(self, max_length=50):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_r = self.get_tokenizer(pretrained_name, **kwargs)
                 tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
 
                 self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
@@ -628,13 +559,13 @@ def test_padding(self, max_length=50):
 
                 # Encode_plus - Simple input
                 words, boxes = self.get_words_and_boxes()
-                input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
-                input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=max_length, padding="max_length")
+                input_r = tokenizer_r(words, boxes=boxes, max_length=max_length, padding="max_length")
+                input_p = tokenizer_p(words, boxes=boxes, max_length=max_length, padding="max_length")
                 self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
                 self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
 
-                input_r = tokenizer_r.encode_plus(words, boxes=boxes, padding="longest")
-                input_p = tokenizer_p.encode_plus(words, boxes=boxes, padding=True)
+                input_r = tokenizer_r(words, boxes=boxes, padding="longest")
+                input_p = tokenizer_p(words, boxes=boxes, padding=True)
                 self.assert_padded_input_match(
                     input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
                 )
@@ -643,16 +574,12 @@ def test_padding(self, max_length=50):
 
                 # Encode_plus - Pair input
                 question, words, boxes = self.get_question_words_and_boxes()
-                input_r = tokenizer_r.encode_plus(
-                    question, words, boxes=boxes, max_length=max_length, padding="max_length"
-                )
-                input_p = tokenizer_p.encode_plus(
-                    question, words, boxes=boxes, max_length=max_length, padding="max_length"
-                )
+                input_r = tokenizer_r(question, words, boxes=boxes, max_length=max_length, padding="max_length")
+                input_p = tokenizer_p(question, words, boxes=boxes, max_length=max_length, padding="max_length")
                 self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
                 self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"])
-                input_r = tokenizer_r.encode_plus(question, words, boxes=boxes, padding="longest")
-                input_p = tokenizer_p.encode_plus(question, words, boxes=boxes, padding=True)
+                input_r = tokenizer_r(question, words, boxes=boxes, padding="longest")
+                input_p = tokenizer_p(question, words, boxes=boxes, padding=True)
                 self.assert_padded_input_match(
                     input_r["input_ids"], input_p["input_ids"], len(input_r["input_ids"]), pad_token_id
                 )
@@ -730,10 +657,10 @@ def test_padding(self, max_length=50):
 
                 # Using pad on single examples after tokenization
                 words, boxes = self.get_words_and_boxes()
-                input_r = tokenizer_r.encode_plus(words, boxes=boxes)
+                input_r = tokenizer_r(words, boxes=boxes)
                 input_r = tokenizer_r.pad(input_r)
 
-                input_p = tokenizer_r.encode_plus(words, boxes=boxes)
+                input_p = tokenizer_r(words, boxes=boxes)
                 input_p = tokenizer_r.pad(input_p)
 
                 self.assert_padded_input_match(
@@ -741,10 +668,10 @@ def test_padding(self, max_length=50):
                 )
 
                 # Using pad on single examples after tokenization
-                input_r = tokenizer_r.encode_plus(words, boxes=boxes)
+                input_r = tokenizer_r(words, boxes=boxes)
                 input_r = tokenizer_r.pad(input_r, max_length=max_length, padding="max_length")
 
-                input_p = tokenizer_r.encode_plus(words, boxes=boxes)
+                input_p = tokenizer_r(words, boxes=boxes)
                 input_p = tokenizer_r.pad(input_p, max_length=max_length, padding="max_length")
 
                 self.assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length, pad_token_id)
@@ -781,49 +708,6 @@ def test_padding(self, max_length=50):
 
                 self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
 
-    def test_padding_warning_message_fast_tokenizer(self):
-        if not self.test_rust_tokenizer:
-            self.skipTest(reason="test_rust_tokenizer is set to False")
-
-        words, boxes = self.get_words_and_boxes_batch()
-
-        tokenizer_fast = self.get_rust_tokenizer()
-
-        encoding_fast = tokenizer_fast(
-            words,
-            boxes=boxes,
-        )
-
-        with self.assertLogs("transformers", level="WARNING") as cm:
-            tokenizer_fast.pad(encoding_fast)
-        self.assertEqual(len(cm.records), 1)
-        self.assertIn(
-            "Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to"
-            " encode the text followed by a call to the `pad` method to get a padded encoding.",
-            cm.records[0].message,
-        )
-
-        if not self.test_slow_tokenizer:
-            self.skipTest(reason="test_slow_tokenizer is set to False")
-
-        tokenizer_slow = self.get_tokenizer()
-
-        encoding_slow = tokenizer_slow(
-            words,
-            boxes=boxes,
-        )
-
-        with self.assertLogs(level="WARNING") as cm:
-            # We want to assert there are no warnings, but the 'assertLogs' method does not support that.
-            # Therefore, we are adding a dummy warning, and then we will assert it is the only warning.
-            logger.warning("Dummy warning")
-            tokenizer_slow.pad(encoding_slow)
-        self.assertEqual(len(cm.records), 1)
-        self.assertIn(
-            "Dummy warning",
-            cm.records[0].message,
-        )
-
     def test_call(self):
         # Tests that all call wrap to encode_plus and batch_encode_plus
         tokenizers = self.get_tokenizers(do_lower_case=False)
@@ -831,13 +715,13 @@ def test_call(self):
             with self.subTest(f"{tokenizer.__class__.__name__}"):
                 # Test not batched
                 words, boxes = self.get_words_and_boxes()
-                encoded_sequences_1 = tokenizer.encode_plus(words, boxes=boxes)
+                encoded_sequences_1 = tokenizer(words, boxes=boxes)
                 encoded_sequences_2 = tokenizer(words, boxes=boxes)
                 self.assertEqual(encoded_sequences_1, encoded_sequences_2)
 
                 # Test not batched pairs
                 question, words, boxes = self.get_question_words_and_boxes()
-                encoded_sequences_1 = tokenizer.encode_plus(words, boxes=boxes)
+                encoded_sequences_1 = tokenizer(words, boxes=boxes)
                 encoded_sequences_2 = tokenizer(words, boxes=boxes)
                 self.assertEqual(encoded_sequences_1, encoded_sequences_2)
 
@@ -855,8 +739,7 @@ def test_batch_encode_plus_batch_sequence_length(self):
                 words, boxes = self.get_words_and_boxes_batch()
 
                 encoded_sequences = [
-                    tokenizer.encode_plus(words_example, boxes=boxes_example)
-                    for words_example, boxes_example in zip(words, boxes)
+                    tokenizer(words_example, boxes=boxes_example) for words_example, boxes_example in zip(words, boxes)
                 ]
                 encoded_sequences_batch = tokenizer.batch_encode_plus(words, is_pair=False, boxes=boxes, padding=False)
                 self.assertListEqual(
@@ -871,9 +754,7 @@ def test_batch_encode_plus_batch_sequence_length(self):
                 self._check_no_pad_token_padding(tokenizer, words)
 
                 encoded_sequences_padded = [
-                    tokenizer.encode_plus(
-                        words_example, boxes=boxes_example, max_length=maximum_length, padding="max_length"
-                    )
+                    tokenizer(words_example, boxes=boxes_example, max_length=maximum_length, padding="max_length")
                     for words_example, boxes_example in zip(words, boxes)
                 ]
 
@@ -930,9 +811,7 @@ def test_batch_encode_plus_padding(self):
                 self._check_no_pad_token_padding(tokenizer, words)
 
                 encoded_sequences = [
-                    tokenizer.encode_plus(
-                        words_example, boxes=boxes_example, max_length=max_length, padding="max_length"
-                    )
+                    tokenizer(words_example, boxes=boxes_example, max_length=max_length, padding="max_length")
                     for words_example, boxes_example in zip(words, boxes)
                 ]
                 encoded_sequences_batch = tokenizer.batch_encode_plus(
@@ -955,9 +834,7 @@ def test_batch_encode_plus_padding(self):
                 self._check_no_pad_token_padding(tokenizer, words)
 
                 encoded_sequences = [
-                    tokenizer.encode_plus(
-                        words_example, boxes=boxes_example, max_length=max_length, padding="max_length"
-                    )
+                    tokenizer(words_example, boxes=boxes_example, max_length=max_length, padding="max_length")
                     for words_example, boxes_example in zip(words, boxes)
                 ]
                 encoded_sequences_batch = tokenizer.batch_encode_plus(
@@ -1004,46 +881,13 @@ def test_padding_to_multiple_of(self):
                         pad_to_multiple_of=8,
                     )
 
-    def test_tokenizer_slow_store_full_signature(self):
-        signature = inspect.signature(self.tokenizer_class.__init__)
-        tokenizer = self.get_tokenizer()
-
-        for parameter_name, parameter in signature.parameters.items():
-            if parameter.default != inspect.Parameter.empty:
-                self.assertIn(parameter_name, tokenizer.init_kwargs)
-
-    def test_build_inputs_with_special_tokens(self):
-        if not self.test_slow_tokenizer:
-            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
-            self.skipTest(reason="test_slow_tokenizer is set to False")
-
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-
-                # Input tokens id
-                words, boxes = self.get_words_and_boxes()
-                input_simple = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False)
-                input_pair = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False)
-
-                # Generate output
-                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
-                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
-                self.assertEqual(output_p, output_r)
-
-                # Generate pair output
-                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
-                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
-                self.assertEqual(output_p, output_r)
-
     def test_special_tokens_mask_input_pairs(self):
         tokenizers = self.get_tokenizers(do_lower_case=False)
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
                 words, boxes = self.get_words_and_boxes()
                 encoded_sequence = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
-                encoded_sequence_dict = tokenizer.encode_plus(
+                encoded_sequence_dict = tokenizer(
                     words,
                     boxes=boxes,
                     add_special_tokens=True,
@@ -1067,7 +911,7 @@ def test_special_tokens_mask(self):
                 words, boxes = self.get_words_and_boxes()
                 # Testing single inputs
                 encoded_sequence = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
-                encoded_sequence_dict = tokenizer.encode_plus(
+                encoded_sequence_dict = tokenizer(
                     words, boxes=boxes, add_special_tokens=True, return_special_tokens_mask=True
                 )
                 encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
@@ -1208,7 +1052,7 @@ def test_offsets_mapping(self):
                 boxes = [[1, 8, 12, 20] for _ in range(len(text))]
 
                 # No pair
-                tokens_with_offsets = tokenizer_r.encode_plus(
+                tokens_with_offsets = tokenizer_r(
                     text,
                     boxes=boxes,
                     return_special_tokens_mask=True,
@@ -1228,7 +1072,7 @@ def test_offsets_mapping(self):
                 text = "what's his name"
                 pair = ["a", "wonderful", "test"]
                 boxes = [[1, 8, 12, 20] for _ in range(len(pair))]
-                tokens_with_offsets = tokenizer_r.encode_plus(
+                tokens_with_offsets = tokenizer_r(
                     text,
                     pair,
                     boxes=boxes,
@@ -1279,7 +1123,7 @@ def test_torch_encode_plus_sent_to_model(self):
 
                 # Build sequence
                 words, boxes = self.get_words_and_boxes()
-                encoded_sequence = tokenizer.encode_plus(words, boxes=boxes, return_tensors="pt")
+                encoded_sequence = tokenizer(words, boxes=boxes, return_tensors="pt")
                 batch_encoded_sequence = tokenizer.batch_encode_plus(
                     [words, words], boxes=[boxes, boxes], return_tensors="pt"
                 )
@@ -1294,112 +1138,6 @@ def test_torch_encode_plus_sent_to_model(self):
                     model(**encoded_sequence)
                     model(**batch_encoded_sequence)
 
-    def test_rust_and_python_full_tokenizers(self):
-        if not self.test_rust_tokenizer:
-            self.skipTest(reason="test_rust_tokenizer is set to False")
-
-        if not self.test_slow_tokenizer:
-            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
-            self.skipTest(reason="test_slow_tokenizer is set to False")
-
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer()
-
-        words, boxes = self.get_words_and_boxes()
-
-        ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=True)
-        rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=True)
-        self.assertListEqual(ids, rust_ids)
-
-    def test_tokenization_python_rust_equals(self):
-        if not self.test_slow_tokenizer:
-            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
-            self.skipTest(reason="test_slow_tokenizer is set to False")
-
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-
-                words, boxes = self.get_words_and_boxes()
-
-                # Ensure basic input match
-                input_p = tokenizer_p.encode_plus(words, boxes=boxes)
-                input_r = tokenizer_r.encode_plus(words, boxes=boxes)
-
-                for key in filter(
-                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
-                ):
-                    self.assertSequenceEqual(input_p[key], input_r[key])
-
-                input_pairs_p = tokenizer_p.encode_plus(words, boxes=boxes)
-                input_pairs_r = tokenizer_r.encode_plus(words, boxes=boxes)
-
-                for key in filter(
-                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
-                ):
-                    self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key])
-
-                words = ["hello" for _ in range(1000)]
-                boxes = [[1000, 1000, 1000, 1000] for _ in range(1000)]
-
-                # Ensure truncation match
-                input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=512, truncation=True)
-                input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=512, truncation=True)
-
-                for key in filter(
-                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
-                ):
-                    self.assertSequenceEqual(input_p[key], input_r[key])
-
-                # Ensure truncation with stride match
-                input_p = tokenizer_p.encode_plus(
-                    words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
-                )
-                input_r = tokenizer_r.encode_plus(
-                    words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
-                )
-
-                for key in filter(
-                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
-                ):
-                    self.assertSequenceEqual(input_p[key], input_r[key][0])
-
-    def test_embedded_special_tokens(self):
-        if not self.test_slow_tokenizer:
-            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
-            self.skipTest(reason="test_slow_tokenizer is set to False")
-
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-                words, boxes = self.get_words_and_boxes()
-                tokens_r = tokenizer_r.encode_plus(
-                    words,
-                    boxes=boxes,
-                    add_special_tokens=True,
-                )
-                tokens_p = tokenizer_p.encode_plus(
-                    words,
-                    boxes=boxes,
-                    add_special_tokens=True,
-                )
-
-                for key in tokens_p:
-                    self.assertEqual(tokens_r[key], tokens_p[key])
-
-                if "token_type_ids" in tokens_r:
-                    self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
-
-                tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
-                tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
-                self.assertSequenceEqual(tokens_r, tokens_p)
-
     def test_compare_add_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -1419,8 +1157,8 @@ def test_compare_add_special_tokens(self):
                 self.assertEqual(len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add)
 
                 # encode_plus()
-                no_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=False)
-                with_special_tokens = tokenizer_r.encode_plus(words, boxes=boxes, add_special_tokens=True)
+                no_special_tokens = tokenizer_r(words, boxes=boxes, add_special_tokens=False)
+                with_special_tokens = tokenizer_r(words, boxes=boxes, add_special_tokens=True)
                 for key in no_special_tokens:
                     self.assertEqual(
                         len(no_special_tokens[key]),
@@ -1498,31 +1236,8 @@ def test_special_tokens_initialization(self):
 
                 self.assertTrue(special_token_id in r_output)
 
-                if self.test_slow_tokenizer:
-                    tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
-                        pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
-                    )
-                    tokenizer_p = self.tokenizer_class.from_pretrained(
-                        pretrained_name, additional_special_tokens=added_tokens, **kwargs
-                    )
-
-                    words = "Hey this is a  token".split()
-                    boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
-
-                    p_output = tokenizer_p.encode(words, boxes=boxes)
-                    cr_output = tokenizer_cr.encode(words, boxes=boxes)
-
-                    self.assertEqual(p_output, r_output)
-                    self.assertEqual(cr_output, r_output)
-                    self.assertTrue(special_token_id in p_output)
-                    self.assertTrue(special_token_id in cr_output)
-
     def test_training_new_tokenizer(self):
-        # This feature only exists for fast tokenizers
-        if not self.test_rust_tokenizer:
-            self.skipTest(reason="test_rust_tokenizer is set to False")
-
-        tokenizer = self.get_rust_tokenizer()
+        tokenizer = self.get_tokenizer()
         new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100)
 
         # Test we can use the new tokenizer with something not seen during training
@@ -1548,18 +1263,14 @@ def test_training_new_tokenizer(self):
 
         # Assert the set of special tokens match as we didn't ask to change them
         self.assertSequenceEqual(
-            tokenizer.all_special_tokens_extended,
-            new_tokenizer.all_special_tokens_extended,
+            tokenizer.all_special_tokens,
+            new_tokenizer.all_special_tokens,
         )
 
         self.assertDictEqual(tokenizer.special_tokens_map, new_tokenizer.special_tokens_map)
 
     def test_training_new_tokenizer_with_special_tokens_change(self):
-        # This feature only exists for fast tokenizers
-        if not self.test_rust_tokenizer:
-            self.skipTest(reason="test_rust_tokenizer is set to False")
-
-        tokenizer = self.get_rust_tokenizer()
+        tokenizer = self.get_tokenizer()
         # Test with a special tokens map
         class_signature = inspect.signature(tokenizer.__class__)
         if "cls_token" in class_signature.parameters:
@@ -1571,8 +1282,7 @@ def test_training_new_tokenizer_with_special_tokens_change(self):
             self.assertEqual(new_tokenizer.cls_token_id, cls_id)
 
         # Create a new mapping from the special tokens defined in the original tokenizer
-        special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy()
-        special_tokens_list.remove("additional_special_tokens")
+        special_tokens_list = PreTrainedTokenizerBase.SPECIAL_TOKENS_ATTRIBUTES.copy()
         special_tokens_map = {}
         for token in special_tokens_list:
             # Get the private one to avoid unnecessary warnings.
@@ -1599,12 +1309,12 @@ def test_training_new_tokenizer_with_special_tokens_change(self):
                 self.assertEqual(getattr(new_tokenizer, f"{token}_id"), new_id)
 
         # Check if the AddedToken / string format has been kept
-        for special_token in tokenizer.all_special_tokens_extended:
+        for special_token in tokenizer.all_special_tokens:
             if isinstance(special_token, AddedToken) and special_token.content not in special_tokens_map:
                 # The special token must appear identically in the list of the new tokenizer.
                 self.assertTrue(
-                    special_token in new_tokenizer.all_special_tokens_extended,
-                    f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}",
+                    special_token in new_tokenizer.all_special_tokens,
+                    f"'{special_token}' should be in {new_tokenizer.all_special_tokens}",
                 )
             elif isinstance(special_token, AddedToken):
                 # The special token must appear in the list of the new tokenizer as an object of type AddedToken with
@@ -1613,7 +1323,7 @@ def test_training_new_tokenizer_with_special_tokens_change(self):
                 new_special_token_str = special_tokens_map[special_token_str]
 
                 find = False
-                for candidate in new_tokenizer.all_special_tokens_extended:
+                for candidate in new_tokenizer.all_special_tokens:
                     if (
                         isinstance(candidate, AddedToken)
                         and candidate.content == new_special_token_str
@@ -1627,19 +1337,19 @@ def test_training_new_tokenizer_with_special_tokens_change(self):
                 self.assertTrue(
                     find,
                     f"'{new_special_token_str}' doesn't appear in the list "
-                    f"'{new_tokenizer.all_special_tokens_extended}' as an AddedToken with the same parameters as "
-                    f"'{special_token}' in the list {tokenizer.all_special_tokens_extended}",
+                    f"'{new_tokenizer.all_special_tokens}' as an AddedToken with the same parameters as "
+                    f"'{special_token}' in the list {tokenizer.all_special_tokens}",
                 )
             elif special_token not in special_tokens_map:
                 # The special token must appear identically in the list of the new tokenizer.
                 self.assertTrue(
-                    special_token in new_tokenizer.all_special_tokens_extended,
-                    f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}",
+                    special_token in new_tokenizer.all_special_tokens,
+                    f"'{special_token}' should be in {new_tokenizer.all_special_tokens}",
                 )
 
             else:
                 # The special token must appear in the list of the new tokenizer as an object of type string.
-                self.assertTrue(special_tokens_map[special_token] in new_tokenizer.all_special_tokens_extended)
+                self.assertTrue(special_tokens_map[special_token] in new_tokenizer.all_special_tokens)
 
         # Test we can use the new tokenizer with something not seen during training
         words = [["this", "is"], ["hello", "🤗"]]
@@ -1657,52 +1367,16 @@ def test_prepare_for_model(self):
         tokenizers = self.get_tokenizers(do_lower_case=False)
         for tokenizer in tokenizers:
             # only test prepare_for_model for the slow tokenizer
-            if tokenizer.__class__.__name__ == "LayoutLMv2TokenizerFast":
+            if tokenizer.__class__.__name__ == "LayoutLMv2Tokenizer":
                 continue
             with self.subTest(f"{tokenizer.__class__.__name__}"):
                 words, boxes = self.get_words_and_boxes()
                 prepared_input_dict = tokenizer.prepare_for_model(words, boxes=boxes, add_special_tokens=True)
 
-                input_dict = tokenizer.encode_plus(words, boxes=boxes, add_special_tokens=True)
+                input_dict = tokenizer(words, boxes=boxes, add_special_tokens=True)
 
                 self.assertEqual(input_dict, prepared_input_dict)
 
-    def test_padding_different_model_input_name(self):
-        if not self.test_slow_tokenizer:
-            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
-            self.skipTest(reason="test_slow_tokenizer is set to False")
-
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-                self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
-                pad_token_id = tokenizer_p.pad_token_id
-
-                words, boxes = self.get_words_and_boxes_batch()
-
-                input_r = tokenizer_r.batch_encode_plus(words, boxes=boxes)
-                input_p = tokenizer_r.batch_encode_plus(words, boxes=boxes)
-
-                # rename encoded batch to "inputs"
-                input_r["inputs"] = input_r[tokenizer_r.model_input_names[0]]
-                del input_r[tokenizer_r.model_input_names[0]]
-
-                input_p["inputs"] = input_p[tokenizer_p.model_input_names[0]]
-                del input_p[tokenizer_p.model_input_names[0]]
-
-                # Renaming `input_ids` to `inputs`
-                tokenizer_r.model_input_names = ["inputs"] + tokenizer_r.model_input_names[1:]
-                tokenizer_p.model_input_names = ["inputs"] + tokenizer_p.model_input_names[1:]
-
-                input_r = tokenizer_r.pad(input_r, padding="longest")
-                input_p = tokenizer_r.pad(input_p, padding="longest")
-
-                max_length = len(input_p["inputs"][0])
-                self.assert_batch_padded_input_match(
-                    input_r, input_p, max_length, pad_token_id, model_main_input_name="inputs"
-                )
-
     def test_batch_encode_dynamic_overflowing(self):
         """
         When calling batch_encode with multiple sequences, it can return different number of
@@ -1722,7 +1396,7 @@ def test_batch_encode_dynamic_overflowing(self):
 
                 # Single example
                 words, boxes = self.get_words_and_boxes()
-                tokens = tokenizer.encode_plus(
+                tokens = tokenizer(
                     words,
                     boxes=boxes,
                     max_length=6,
@@ -1979,7 +1653,7 @@ def test_maximum_encoding_length_pair_input(self):
                 )
 
                 # Overflowing tokens are handled quite differently in slow and fast tokenizers
-                if isinstance(tokenizer, LayoutLMv2TokenizerFast):
+                if isinstance(tokenizer, LayoutLMv2Tokenizer):
                     information = tokenizer(
                         question_0,
                         seq_1,
@@ -2030,7 +1704,7 @@ def test_maximum_encoding_length_pair_input(self):
                     )
 
                 # Overflowing tokens are handled quite differently in slow and fast tokenizers
-                if isinstance(tokenizer, LayoutLMv2TokenizerFast):
+                if isinstance(tokenizer, LayoutLMv2Tokenizer):
                     information = tokenizer(
                         question_0,
                         seq_1,
@@ -2090,7 +1764,7 @@ def test_maximum_encoding_length_pair_input(self):
                     # add_prefix_space=False,
                 )
                 # Overflowing tokens are handled quite differently in slow and fast tokenizers
-                if isinstance(tokenizer, LayoutLMv2TokenizerFast):
+                if isinstance(tokenizer, LayoutLMv2Tokenizer):
                     truncated_sequence = information_first_truncated["input_ids"][0]
                     overflowing_tokens = information_first_truncated["input_ids"][1]
                     bbox = information_first_truncated["bbox"][0]
@@ -2130,7 +1804,7 @@ def test_maximum_encoding_length_pair_input(self):
                     # add_prefix_space=False,
                 )
                 # Overflowing tokens are handled quite differently in slow and fast tokenizers
-                if isinstance(tokenizer, LayoutLMv2TokenizerFast):
+                if isinstance(tokenizer, LayoutLMv2Tokenizer):
                     truncated_sequence = information_second_truncated["input_ids"][0]
                     overflowing_tokens = information_second_truncated["input_ids"][1]
                     bbox = information_second_truncated["bbox"][0]
@@ -2253,7 +1927,7 @@ def test_maximum_encoding_length_single_input(self):
                 )
 
                 # Overflowing tokens are handled quite differently in slow and fast tokenizers
-                if isinstance(tokenizer, LayoutLMv2TokenizerFast):
+                if isinstance(tokenizer, LayoutLMv2Tokenizer):
                     truncated_sequence = information["input_ids"][0]
                     overflowing_tokens = information["input_ids"][1]
                     bbox = information["bbox"][0]
@@ -2311,7 +1985,7 @@ def test_only_label_first_subword(self):
         self.assertListEqual(encoding.labels, [-100, 0, 1, 1, -100])
 
         # test fast tokenizer
-        tokenizer_r = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")
+        tokenizer_r = LayoutLMv2Tokenizer.from_pretrained("microsoft/layoutlmv2-base-uncased")
         encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
         self.assertListEqual(encoding.labels, [-100, 0, 1, -100, -100])
 
@@ -2324,7 +1998,7 @@ def test_only_label_first_subword(self):
     @slow
     def test_layoutlmv2_integration_test(self):
         tokenizer_p = LayoutLMv2Tokenizer.from_pretrained("microsoft/layoutlmv2-base-uncased")
-        tokenizer_r = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")
+        tokenizer_r = LayoutLMv2Tokenizer.from_pretrained("microsoft/layoutlmv2-base-uncased")
 
         # There are 3 cases:
         # CASE 1: document image classification (training + inference), document image token classification (inference),
@@ -2458,3 +2132,97 @@ def test_empty_input_string(self):
                 for return_type, target_type in zip(tokenizer_return_type, output_tensor_type):
                     output = tokenizer(words, boxes=boxes, padding=True, return_tensors=return_type)
                     self.assertEqual(output.input_ids.dtype, target_type)
+
+    def test_integration(self):
+        """Integration test with hardcoded expectations for LayoutLMv2."""
+        input_words = ["a", "weirdly", "test", "hello", "my", "name", "is", "bob"]
+        input_boxes = [
+            [423, 237, 440, 251],
+            [427, 272, 441, 287],
+            [419, 115, 437, 129],
+            [961, 885, 992, 912],
+            [256, 38, 330, 58],
+            [256, 38, 330, 58],
+            [336, 42, 353, 57],
+            [34, 42, 66, 69],
+        ]
+        expected_tokens = [
+            "a",
+            "weird",
+            "##ly",
+            "test",
+            "hello",
+            "my",
+            "name",
+            "is",
+            "bob",
+        ]
+        expected_ids = [1037, 6881, 2135, 3231, 7592, 2026, 2171, 2003, 3960]
+        expected_tokens_from_ids = ['a', 'weird', '##ly', 'test', 'hello', 'my', 'name', 'is', 'bob']  # fmt: skip
+        expected_decoded_text = "a weirdly test hello my name is bob"
+
+        tokenizer = self.tokenizer_class.from_pretrained("microsoft/layoutlmv2-base-uncased")
+
+        # 1) tokens (flattened per word)
+        tokens = []
+        for word in input_words:
+            tokens.extend(tokenizer.tokenize(word))
+        self.assertListEqual(tokens, expected_tokens)
+
+        # 2) ids from encode on pretokenized words with boxes
+        ids = tokenizer.encode(input_words, boxes=input_boxes, add_special_tokens=False)
+        self.assertListEqual(ids, expected_ids)
+
+        # 3) tokens from ids
+        roundtrip_tokens = tokenizer.convert_ids_to_tokens(ids)
+        self.assertListEqual(roundtrip_tokens, expected_tokens_from_ids)
+
+        # 4) decoded text
+        decoded_text = tokenizer.decode(ids, clean_up_tokenization_spaces=False)
+        self.assertEqual(decoded_text, expected_decoded_text)
+
+    def test_integration_from_extractor(self):
+        """Integration test using pretokenized words and boxes as if coming from an extractor."""
+        input_words = ["a", "weirdly", "test", "hello", "my", "name", "is", "bob"]
+        input_boxes = [
+            [423, 237, 440, 251],
+            [427, 272, 441, 287],
+            [419, 115, 437, 129],
+            [961, 885, 992, 912],
+            [256, 38, 330, 58],
+            [256, 38, 330, 58],
+            [336, 42, 353, 57],
+            [34, 42, 66, 69],
+        ]
+
+        expected_tokens = [
+            "a",
+            "weird",
+            "##ly",
+            "test",
+            "hello",
+            "my",
+            "name",
+            "is",
+            "bob",
+        ]
+        expected_ids = [1037, 6881, 2135, 3231, 7592, 2026, 2171, 2003, 3960]
+        expected_tokens_from_ids = ['a', 'weird', '##ly', 'test', 'hello', 'my', 'name', 'is', 'bob']  # fmt: skip
+        expected_decoded_text = "a weirdly test hello my name is bob"
+
+        tokenizer = self.tokenizer_class.from_pretrained("microsoft/layoutlmv2-base-uncased")
+
+        # As if produced by an image/box extractor upstream
+        tokens = []
+        for word in input_words:
+            tokens.extend(tokenizer.tokenize(word))
+        self.assertListEqual(tokens, expected_tokens)
+
+        ids = tokenizer.encode(input_words, boxes=input_boxes, add_special_tokens=False)
+        self.assertListEqual(ids, expected_ids)
+
+        roundtrip_tokens = tokenizer.convert_ids_to_tokens(ids)
+        self.assertListEqual(roundtrip_tokens, expected_tokens_from_ids)
+
+        decoded_text = tokenizer.decode(ids, clean_up_tokenization_spaces=False)
+        self.assertEqual(decoded_text, expected_decoded_text)
diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
index 5ca0499805ef..2279b27fee3a 100644
--- a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
+++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import inspect
-import json
 import os
 import re
 import shutil
@@ -25,7 +24,6 @@
 from transformers import (
     AddedToken,
     LayoutLMv3TokenizerFast,
-    SpecialTokensMixin,
     is_mlx_available,
     is_torch_available,
     logging,
@@ -34,14 +32,13 @@
 from transformers.testing_utils import (
     require_pandas,
     require_tokenizers,
-    require_torch,
     slow,
 )
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
 from ...test_tokenization_common import (
     SMALL_TRAINING_CORPUS,
     TokenizerTesterMixin,
-    merge_model_tokenizer_mappings,
 )
 
 
@@ -66,6 +63,97 @@ def get_words_and_boxes(self):
 
         return words, boxes
 
+    def test_integration(self):
+        """Integration test with hardcoded expectations for LayoutLMv3."""
+        input_words = ["a", "weirdly", "test", "hello", "my", "name", "is", "bob"]
+        input_boxes = [
+            [423, 237, 440, 251],
+            [427, 272, 441, 287],
+            [419, 115, 437, 129],
+            [961, 885, 992, 912],
+            [256, 38, 330, 58],
+            [256, 38, 330, 58],
+            [336, 42, 353, 57],
+            [34, 42, 66, 69],
+        ]
+
+        expected_tokens = [
+            "Ġa",
+            "Ġweird",
+            "ly",
+            "Ġtest",
+            "Ġhello",
+            "Ġmy",
+            "Ġname",
+            "Ġis",
+            "Ġbob",
+        ]
+        expected_ids = [10, 7735, 352, 1296, 20760, 127, 766, 16, 22401]
+        expected_tokens_from_ids = ['Ġa', 'Ġweird', 'ly', 'Ġtest', 'Ġhello', 'Ġmy', 'Ġname', 'Ġis', 'Ġbob']  # fmt: skip
+        expected_decoded_text = " a weirdly test hello my name is bob"
+
+        tokenizer = self.tokenizer_class.from_pretrained("microsoft/layoutlmv3-base", cls_token="")
+
+        # Flatten tokens per word
+        tokens = []
+        for word in input_words:
+            tokens.extend(tokenizer.tokenize(word))
+        self.assertListEqual(tokens, expected_tokens)
+
+        ids = tokenizer.encode(input_words, boxes=input_boxes, add_special_tokens=False)
+        self.assertListEqual(ids, expected_ids)
+
+        roundtrip_tokens = tokenizer.convert_ids_to_tokens(ids)
+        self.assertListEqual(roundtrip_tokens, expected_tokens_from_ids)
+
+        decoded_text = tokenizer.decode(ids, clean_up_tokenization_spaces=False)
+        self.assertEqual(decoded_text, expected_decoded_text)
+
+    def test_integration_from_extractor(self):
+        """Integration test using pretokenized words and boxes as if coming from an extractor."""
+        input_words = ["a", "weirdly", "test", "hello", "my", "name", "is", "bob"]
+        input_boxes = [
+            [423, 237, 440, 251],
+            [427, 272, 441, 287],
+            [419, 115, 437, 129],
+            [961, 885, 992, 912],
+            [256, 38, 330, 58],
+            [256, 38, 330, 58],
+            [336, 42, 353, 57],
+            [34, 42, 66, 69],
+        ]
+
+        expected_tokens = [
+            "Ġa",
+            "Ġweird",
+            "ly",
+            "Ġtest",
+            "Ġhello",
+            "Ġmy",
+            "Ġname",
+            "Ġis",
+            "Ġbob",
+        ]
+        expected_ids = [10, 7735, 352, 1296, 20760, 127, 766, 16, 22401]
+        expected_tokens_from_ids = ['Ġa', 'Ġweird', 'ly', 'Ġtest', 'Ġhello', 'Ġmy', 'Ġname', 'Ġis', 'Ġbob']  # fmt: skip
+        expected_decoded_text = " a weirdly test hello my name is bob"
+
+        tokenizer = self.tokenizer_class.from_pretrained("microsoft/layoutlmv3-base", cls_token="")
+
+        tokens = []
+        for word in input_words:
+            tokens.extend(tokenizer.tokenize(word))
+        self.assertListEqual(tokens, expected_tokens)
+
+        ids = tokenizer.encode(input_words, boxes=input_boxes, add_special_tokens=False)
+        self.assertListEqual(ids, expected_ids)
+
+        roundtrip_tokens = tokenizer.convert_ids_to_tokens(ids)
+        self.assertListEqual(roundtrip_tokens, expected_tokens_from_ids)
+
+        decoded_text = tokenizer.decode(ids, clean_up_tokenization_spaces=False)
+        self.assertEqual(decoded_text, expected_decoded_text)
+
     def get_words_and_boxes_batch(self):
         words = [["lower", "newer"], ["new", "low"]]
         boxes = [
@@ -152,15 +240,15 @@ def setUpClass(cls):
             "",
         ]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
+        merges_list = [("\u0120", "l"), ("\u0120l", "o"), ("\u0120lo", "w"), ("e", "r")]
         cls.special_tokens_map = {"unk_token": ""}
 
+        # Create tokenizer and save it (which will create all necessary files including tokenizer.json)
+        tokenizer = cls.tokenizer_class(vocab=vocab_tokens, merges=merges_list, **cls.special_tokens_map)
+        tokenizer.save_pretrained(cls.tmpdirname)
+
         cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
         cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(cls.merges_file, "w", encoding="utf-8") as fp:
-            fp.write("\n".join(merges))
 
     @classmethod
     def get_tokenizer(cls, pretrained_name=None, **kwargs):
@@ -179,12 +267,45 @@ def get_input_output_texts(self, tokenizer):
         output_text = "lower newer"
         return input_text, output_text
 
+    def convert_batch_encode_plus_format_to_encode_plus(self, batch_encode_plus_sequences):
+        """Helper method to convert batch_encode_plus output to list of encode_plus outputs"""
+        # Get the batch size
+        first_key = list(batch_encode_plus_sequences.keys())[0]
+        batch_size = len(batch_encode_plus_sequences[first_key])
+
+        # Convert to list of dicts
+        encode_plus_sequences = []
+        for i in range(batch_size):
+            single_sequence = {}
+            for key, value in batch_encode_plus_sequences.items():
+                if key != "encodings":  # Skip the encodings attribute
+                    single_sequence[key] = value[i]
+            encode_plus_sequences.append(single_sequence)
+
+        return encode_plus_sequences
+
     @unittest.skip(reason="Chat template tests don't play well with table/layout models.")
     def test_chat_template_batched(self):
         pass
 
+    @unittest.skip(reason="LayoutLMv3 requires pre-tokenized words with boxes.")
+    def test_bos_token_with_add_bos_token_false(self):
+        pass
+
+    @unittest.skip(reason="LayoutLMv3 requires pre-tokenized words with boxes.")
+    def test_bos_token_with_add_bos_token_true(self):
+        pass
+
+    @unittest.skip(reason="LayoutLMv3 requires pre-tokenized words with boxes.")
+    def test_encode_basic_padding(self):
+        pass
+
+    @unittest.skip(reason="LayoutLMv3 requires pre-tokenized words with boxes.")
+    def test_pad_token_initialization(self):
+        pass
+
     def test_full_tokenizer(self):
-        tokenizer = self.tokenizer_class(self.vocab_file, self.merges_file, **self.special_tokens_map)
+        tokenizer = self.get_tokenizer()
         text = "lower newer"
         bpe_tokens = ["Ġlow", "er", "Ġ", "n", "e", "w", "er"]
         tokens = tokenizer.tokenize(text)  # , add_prefix_space=True)
@@ -668,49 +789,6 @@ def test_padding(self, max_length=50):
 
                 self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
 
-    def test_padding_warning_message_fast_tokenizer(self):
-        if not self.test_rust_tokenizer:
-            self.skipTest(reason="test_rust_tokenizer is set to False")
-
-        words, boxes = self.get_words_and_boxes_batch()
-
-        tokenizer_fast = self.get_rust_tokenizer()
-
-        encoding_fast = tokenizer_fast(
-            words,
-            boxes=boxes,
-        )
-
-        with self.assertLogs("transformers", level="WARNING") as cm:
-            tokenizer_fast.pad(encoding_fast)
-        self.assertEqual(len(cm.records), 1)
-        self.assertIn(
-            "Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to"
-            " encode the text followed by a call to the `pad` method to get a padded encoding.",
-            cm.records[0].message,
-        )
-
-        if not self.test_slow_tokenizer:
-            self.skipTest(reason="test_rust_tokenizer is set to False")
-
-        tokenizer_slow = self.get_tokenizer()
-
-        encoding_slow = tokenizer_slow(
-            words,
-            boxes=boxes,
-        )
-
-        with self.assertLogs(level="WARNING") as cm:
-            # We want to assert there are no warnings, but the 'assertLogs' method does not support that.
-            # Therefore, we are adding a dummy warning, and then we will assert it is the only warning.
-            logger.warning("Dummy warning")
-            tokenizer_slow.pad(encoding_slow)
-        self.assertEqual(len(cm.records), 1)
-        self.assertIn(
-            "Dummy warning",
-            cm.records[0].message,
-        )
-
     def test_call(self):
         # Tests that all call wrap to encode_plus and batch_encode_plus
         tokenizers = self.get_tokenizers(do_lower_case=False)
@@ -891,39 +969,6 @@ def test_padding_to_multiple_of(self):
                         pad_to_multiple_of=8,
                     )
 
-    def test_tokenizer_slow_store_full_signature(self):
-        signature = inspect.signature(self.tokenizer_class.__init__)
-        tokenizer = self.get_tokenizer()
-
-        for parameter_name, parameter in signature.parameters.items():
-            if parameter.default != inspect.Parameter.empty:
-                self.assertIn(parameter_name, tokenizer.init_kwargs)
-
-    def test_build_inputs_with_special_tokens(self):
-        if not self.test_slow_tokenizer:
-            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
-            self.skipTest(reason="test_rust_tokenizer is set to False")
-
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-
-                # Input tokens id
-                words, boxes = self.get_words_and_boxes()
-                input_simple = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False)
-                input_pair = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False)
-
-                # Generate output
-                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
-                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
-                self.assertEqual(output_p, output_r)
-
-                # Generate pair output
-                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
-                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
-                self.assertEqual(output_p, output_r)
-
     def test_special_tokens_mask_input_pairs(self):
         tokenizers = self.get_tokenizers(do_lower_case=False)
         for tokenizer in tokenizers:
@@ -1131,160 +1176,6 @@ def test_offsets_mapping(self):
                 # Assert there is online added_tokens special_tokens
                 self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens)
 
-    @require_torch
-    @slow
-    def test_torch_encode_plus_sent_to_model(self):
-        import torch
-
-        from transformers import MODEL_MAPPING, TOKENIZER_MAPPING
-
-        MODEL_TOKENIZER_MAPPING = merge_model_tokenizer_mappings(MODEL_MAPPING, TOKENIZER_MAPPING)
-
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
-                    self.skipTest(f"{tokenizer.__class__} is not in the MODEL_TOKENIZER_MAPPING")
-
-                config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
-                config = config_class()
-
-                if config.is_encoder_decoder or config.pad_token_id is None:
-                    self.skipTest(reason="Model is an encoder-decoder or has no pad token id set.")
-
-                model = model_class(config)
-
-                # Make sure the model contains at least the full vocabulary size in its embedding matrix
-                is_using_common_embeddings = hasattr(model.get_input_embeddings(), "weight")
-                assert (
-                    (model.get_input_embeddings().weight.shape[0] >= len(tokenizer))
-                    if is_using_common_embeddings
-                    else True
-                )
-
-                # Build sequence
-                words, boxes = self.get_words_and_boxes()
-                encoded_sequence = tokenizer.encode_plus(words, boxes=boxes, return_tensors="pt")
-                batch_encoded_sequence = tokenizer.batch_encode_plus(
-                    [words, words], boxes=[boxes, boxes], return_tensors="pt"
-                )
-
-                # We add dummy pixel_values keys (as LayoutLMv3 actually also requires a feature extractor
-                # to prepare the image input)
-                encoded_sequence["pixel_values"] = torch.randn(1, 3, 224, 224)
-                batch_encoded_sequence["pixel_values"] = torch.randn(2, 3, 224, 224)
-
-                # This should not fail
-                with torch.no_grad():  # saves some time
-                    model(**encoded_sequence)
-                    model(**batch_encoded_sequence)
-
-    def test_rust_and_python_full_tokenizers(self):
-        if not self.test_rust_tokenizer:
-            self.skipTest(reason="test_rust_tokenizer is set to False")
-
-        if not self.test_slow_tokenizer:
-            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
-            self.skipTest(reason="test_slow_tokenizer is set to False")
-
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer()
-
-        words, boxes = self.get_words_and_boxes()
-
-        ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=True)
-        rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=True)
-        self.assertListEqual(ids, rust_ids)
-
-    def test_tokenization_python_rust_equals(self):
-        if not self.test_slow_tokenizer:
-            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
-            self.skipTest(reason="test_slow_tokenizer is set to False")
-
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-
-                words, boxes = self.get_words_and_boxes()
-
-                # Ensure basic input match
-                input_p = tokenizer_p.encode_plus(words, boxes=boxes)
-                input_r = tokenizer_r.encode_plus(words, boxes=boxes)
-
-                for key in filter(
-                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
-                ):
-                    self.assertSequenceEqual(input_p[key], input_r[key])
-
-                input_pairs_p = tokenizer_p.encode_plus(words, boxes=boxes)
-                input_pairs_r = tokenizer_r.encode_plus(words, boxes=boxes)
-
-                for key in filter(
-                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
-                ):
-                    self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key])
-
-                words = ["hello" for _ in range(1000)]
-                boxes = [[1000, 1000, 1000, 1000] for _ in range(1000)]
-
-                # Ensure truncation match
-                input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=512, truncation=True)
-                input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=512, truncation=True)
-
-                for key in filter(
-                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
-                ):
-                    self.assertSequenceEqual(input_p[key], input_r[key])
-
-                # Ensure truncation with stride match
-                input_p = tokenizer_p.encode_plus(
-                    words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
-                )
-                input_r = tokenizer_r.encode_plus(
-                    words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
-                )
-
-                for key in filter(
-                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
-                ):
-                    self.assertSequenceEqual(input_p[key], input_r[key][0])
-
-    def test_embedded_special_tokens(self):
-        if not self.test_slow_tokenizer:
-            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
-            self.skipTest(reason="test_slow_tokenizer is set to False")
-
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-                words, boxes = self.get_words_and_boxes()
-                tokens_r = tokenizer_r.encode_plus(
-                    words,
-                    boxes=boxes,
-                    add_special_tokens=True,
-                )
-                tokens_p = tokenizer_p.encode_plus(
-                    words,
-                    boxes=boxes,
-                    add_special_tokens=True,
-                )
-
-                for key in tokens_p:
-                    self.assertEqual(tokens_r[key], tokens_p[key])
-
-                if "token_type_ids" in tokens_r:
-                    self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
-
-                tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
-                tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
-                self.assertSequenceEqual(tokens_r, tokens_p)
-
     def test_compare_add_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -1383,25 +1274,6 @@ def test_special_tokens_initialization(self):
 
                 self.assertTrue(special_token_id in r_output)
 
-                if self.test_slow_tokenizer:
-                    tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
-                        pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
-                    )
-                    tokenizer_p = self.tokenizer_class.from_pretrained(
-                        pretrained_name, additional_special_tokens=added_tokens, **kwargs
-                    )
-
-                    words = "Hey this is a  token".split()
-                    boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
-
-                    p_output = tokenizer_p.encode(words, boxes=boxes)
-                    cr_output = tokenizer_cr.encode(words, boxes=boxes)
-
-                    self.assertEqual(p_output, r_output)
-                    self.assertEqual(cr_output, r_output)
-                    self.assertTrue(special_token_id in p_output)
-                    self.assertTrue(special_token_id in cr_output)
-
     def test_training_new_tokenizer(self):
         # This feature only exists for fast tokenizers
         if not self.test_rust_tokenizer:
@@ -1433,8 +1305,8 @@ def test_training_new_tokenizer(self):
 
         # Assert the set of special tokens match as we didn't ask to change them
         self.assertSequenceEqual(
-            tokenizer.all_special_tokens_extended,
-            new_tokenizer.all_special_tokens_extended,
+            tokenizer.all_special_tokens,
+            new_tokenizer.all_special_tokens,
         )
 
         self.assertDictEqual(tokenizer.special_tokens_map, new_tokenizer.special_tokens_map)
@@ -1456,8 +1328,9 @@ def test_training_new_tokenizer_with_special_tokens_change(self):
             self.assertEqual(new_tokenizer.cls_token_id, cls_id)
 
         # Create a new mapping from the special tokens defined in the original tokenizer
-        special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy()
-        special_tokens_list.remove("additional_special_tokens")
+        special_tokens_list = list(PreTrainedTokenizerBase.SPECIAL_TOKENS_ATTRIBUTES)
+        if "additional_special_tokens" in special_tokens_list:
+            special_tokens_list.remove("additional_special_tokens")
         special_tokens_map = {}
         for token in special_tokens_list:
             # Get the private one to avoid unnecessary warnings.
@@ -1484,12 +1357,12 @@ def test_training_new_tokenizer_with_special_tokens_change(self):
                 self.assertEqual(getattr(new_tokenizer, f"{token}_id"), new_id)
 
         # Check if the AddedToken / string format has been kept
-        for special_token in tokenizer.all_special_tokens_extended:
+        for special_token in tokenizer.all_special_tokens:
             if isinstance(special_token, AddedToken) and special_token.content not in special_tokens_map:
                 # The special token must appear identically in the list of the new tokenizer.
                 self.assertTrue(
-                    special_token in new_tokenizer.all_special_tokens_extended,
-                    f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}",
+                    special_token in new_tokenizer.all_special_tokens,
+                    f"'{special_token}' should be in {new_tokenizer.all_special_tokens}",
                 )
             elif isinstance(special_token, AddedToken):
                 # The special token must appear in the list of the new tokenizer as an object of type AddedToken with
@@ -1498,7 +1371,7 @@ def test_training_new_tokenizer_with_special_tokens_change(self):
                 new_special_token_str = special_tokens_map[special_token_str]
 
                 find = False
-                for candidate in new_tokenizer.all_special_tokens_extended:
+                for candidate in new_tokenizer.all_special_tokens:
                     if (
                         isinstance(candidate, AddedToken)
                         and candidate.content == new_special_token_str
@@ -1512,19 +1385,19 @@ def test_training_new_tokenizer_with_special_tokens_change(self):
                 self.assertTrue(
                     find,
                     f"'{new_special_token_str}' doesn't appear in the list "
-                    f"'{new_tokenizer.all_special_tokens_extended}' as an AddedToken with the same parameters as "
-                    f"'{special_token}' in the list {tokenizer.all_special_tokens_extended}",
+                    f"'{new_tokenizer.all_special_tokens}' as an AddedToken with the same parameters as "
+                    f"'{special_token}' in the list {tokenizer.all_special_tokens}",
                 )
             elif special_token not in special_tokens_map:
                 # The special token must appear identically in the list of the new tokenizer.
                 self.assertTrue(
-                    special_token in new_tokenizer.all_special_tokens_extended,
-                    f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}",
+                    special_token in new_tokenizer.all_special_tokens,
+                    f"'{special_token}' should be in {new_tokenizer.all_special_tokens}",
                 )
 
             else:
                 # The special token must appear in the list of the new tokenizer as an object of type string.
-                self.assertTrue(special_tokens_map[special_token] in new_tokenizer.all_special_tokens_extended)
+                self.assertTrue(special_tokens_map[special_token] in new_tokenizer.all_special_tokens)
 
         # Test we can use the new tokenizer with something not seen during training
         words = [["this", "is"], ["hello", "🤗"]]
@@ -1538,55 +1411,9 @@ def test_training_new_tokenizer_with_special_tokens_change(self):
             expected_result = tokenizer.backend_tokenizer.normalizer.normalize_str(expected_result)
         self.assertEqual(expected_result, decoded_input)
 
+    @unittest.skip(reason="LayoutLMv3Tokenizer no longer has separate slow tokenizer with prepare_for_model")
     def test_prepare_for_model(self):
-        tokenizers = self.get_tokenizers(do_lower_case=False)
-        for tokenizer in tokenizers:
-            # only test prepare_for_model for the slow tokenizer
-            if tokenizer.__class__.__name__ == "LayoutLMv3TokenizerFast":
-                continue
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                words, boxes = self.get_words_and_boxes()
-                prepared_input_dict = tokenizer.prepare_for_model(words, boxes=boxes, add_special_tokens=True)
-
-                input_dict = tokenizer.encode_plus(words, boxes=boxes, add_special_tokens=True)
-
-                self.assertEqual(input_dict, prepared_input_dict)
-
-    def test_padding_different_model_input_name(self):
-        if not self.test_slow_tokenizer:
-            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
-            self.skipTest(reason="test_slow_tokenizer is set to False")
-
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-                self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
-                pad_token_id = tokenizer_p.pad_token_id
-
-                words, boxes = self.get_words_and_boxes_batch()
-
-                input_r = tokenizer_r.batch_encode_plus(words, boxes=boxes)
-                input_p = tokenizer_r.batch_encode_plus(words, boxes=boxes)
-
-                # rename encoded batch to "inputs"
-                input_r["inputs"] = input_r[tokenizer_r.model_input_names[0]]
-                del input_r[tokenizer_r.model_input_names[0]]
-
-                input_p["inputs"] = input_p[tokenizer_p.model_input_names[0]]
-                del input_p[tokenizer_p.model_input_names[0]]
-
-                # Renaming `input_ids` to `inputs`
-                tokenizer_r.model_input_names = ["inputs"] + tokenizer_r.model_input_names[1:]
-                tokenizer_p.model_input_names = ["inputs"] + tokenizer_p.model_input_names[1:]
-
-                input_r = tokenizer_r.pad(input_r, padding="longest")
-                input_p = tokenizer_r.pad(input_p, padding="longest")
-
-                max_length = len(input_p["inputs"][0])
-                self.assert_batch_padded_input_match(
-                    input_r, input_p, max_length, pad_token_id, model_main_input_name="inputs"
-                )
+        pass
 
     def test_batch_encode_dynamic_overflowing(self):
         """
@@ -2237,7 +2064,7 @@ def test_only_label_first_subword(self):
         encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
         self.assertListEqual(encoding.labels, [-100, 0, 1, -100, 2, -100, -100])
 
-        tokenizer_r = LayoutLMv3Tokenizer.from_pretrained(
+        tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained(
             "microsoft/layoutlmv3-base",
             only_label_first_subword=False,
             add_visual_labels=False,
@@ -2245,10 +2072,8 @@ def test_only_label_first_subword(self):
         encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
         self.assertListEqual(encoding.labels, [-100, 0, 1, 1, 2, 2, -100])
 
-    @slow
     def test_layoutlmv3_integration_test(self):
         tokenizer_p = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
-        tokenizer_r = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")
 
         # There are 3 cases:
         # CASE 1: document image classification (training + inference), document image token classification (inference),
@@ -2266,9 +2091,7 @@ def test_layoutlmv3_integration_test(self):
         expected_results = {'input_ids': [0, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}  # fmt: skip
 
         encoding_p = tokenizer_p(words, boxes=boxes, padding="max_length", max_length=20)
-        encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20)
         self.assertDictEqual(dict(encoding_p), expected_results)
-        self.assertDictEqual(dict(encoding_r), expected_results)
 
         # CASE 1: batched
         words, boxes = self.get_words_and_boxes_batch()
@@ -2276,9 +2099,7 @@ def test_layoutlmv3_integration_test(self):
         expected_results = {'input_ids': [[0, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 92, 614, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [961, 885, 992, 912], [256, 38, 330, 58], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'attention_mask': [[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # fmt: skip
 
         encoding_p = tokenizer_p(words, boxes=boxes, padding="max_length", max_length=20)
-        encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20)
         self.assertDictEqual(dict(encoding_p), expected_results)
-        self.assertDictEqual(dict(encoding_r), expected_results)
 
         # CASE 2: not batched
         words, boxes = self.get_words_and_boxes()
@@ -2287,9 +2108,7 @@ def test_layoutlmv3_integration_test(self):
         expected_results = {'input_ids': [0, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'labels': [-100, 1, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], 'attention_mask': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}  # fmt: skip
 
         encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
-        encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
         self.assertDictEqual(dict(encoding_p), expected_results)
-        self.assertDictEqual(dict(encoding_r), expected_results)
 
         # # CASE 2: batched
         words, boxes = self.get_words_and_boxes_batch()
@@ -2298,9 +2117,7 @@ def test_layoutlmv3_integration_test(self):
         expected_results = {'input_ids': [[0, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 92, 614, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [961, 885, 992, 912], [256, 38, 330, 58], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'labels': [[-100, 1, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], [-100, 2, 46, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]], 'attention_mask': [[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # fmt: skip
 
         encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
-        encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
         self.assertDictEqual(dict(encoding_p), expected_results)
-        self.assertDictEqual(dict(encoding_r), expected_results)
 
         # # CASE 3: not batched
         question, words, boxes = self.get_question_words_and_boxes()
@@ -2308,9 +2125,7 @@ def test_layoutlmv3_integration_test(self):
         expected_results = {'input_ids': [0, 99, 18, 39, 766, 116, 2, 2, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]}  # fmt: skip
 
         encoding_p = tokenizer_p(question, words, boxes, padding="max_length", max_length=20)
-        encoding_r = tokenizer_r(question, words, boxes, padding="max_length", max_length=20)
         self.assertDictEqual(dict(encoding_p), expected_results)
-        self.assertDictEqual(dict(encoding_r), expected_results)
 
         # # CASE 3: batched
         questions, words, boxes = self.get_question_words_and_boxes_batch()
@@ -2318,9 +2133,7 @@ def test_layoutlmv3_integration_test(self):
         expected_results = {'input_ids': [[0, 99, 18, 39, 766, 116, 2, 2, 795, 13964, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 141, 16, 37, 373, 116, 2, 2, 13964, 795, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [256, 38, 330, 58], [256, 38, 330, 58], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # fmt: skip
 
         encoding_p = tokenizer_p(questions, words, boxes, padding="max_length", max_length=20)
-        encoding_r = tokenizer_r(questions, words, boxes, padding="max_length", max_length=20)
         self.assertDictEqual(dict(encoding_p), expected_results)
-        self.assertDictEqual(dict(encoding_r), expected_results)
 
     @unittest.skip(reason="Doesn't support returning Numpy arrays")
     def test_np_encode_plus_sent_to_model(self):
diff --git a/tests/models/layoutxlm/test_tokenization_layoutxlm.py b/tests/models/layoutxlm/test_tokenization_layoutxlm.py
index da77177a3f62..216e75477fd8 100644
--- a/tests/models/layoutxlm/test_tokenization_layoutxlm.py
+++ b/tests/models/layoutxlm/test_tokenization_layoutxlm.py
@@ -21,13 +21,12 @@
 
 from transformers import (
     AddedToken,
-    LayoutXLMTokenizerFast,
-    SpecialTokensMixin,
+    LayoutXLMTokenizer,
+    PreTrainedTokenizerBase,
     is_mlx_available,
     is_torch_available,
     logging,
 )
-from transformers.models.layoutxlm.tokenization_layoutxlm import LayoutXLMTokenizer
 from transformers.testing_utils import (
     get_tests_dir,
     require_pandas,
@@ -36,13 +35,14 @@
     require_torch,
     slow,
 )
+from transformers.tokenization_utils_sentencepiece import SentencePieceExtractor
 
 from ...test_tokenization_common import (
-    SMALL_TRAINING_CORPUS,
     TokenizerTesterMixin,
     filter_non_english,
     merge_model_tokenizer_mappings,
 )
+from ...test_tokenizers_backend_mixin import SMALL_TRAINING_CORPUS
 
 
 logger = logging.get_logger(__name__)
@@ -55,8 +55,9 @@
 class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_id = "FacebookAI/xlm-roberta-base"
     tokenizer_class = LayoutXLMTokenizer
-    rust_tokenizer_class = LayoutXLMTokenizerFast
+    rust_tokenizer_class = LayoutXLMTokenizer
     test_rust_tokenizer = True
+    test_slow_tokenizer = False
     from_pretrained_filter = filter_non_english
     test_seq2seq = False
     test_sentencepiece = True
@@ -130,10 +131,31 @@ def get_empty_question_words_and_boxes_batch(self):
     def setUpClass(cls):
         super().setUpClass()
 
-        # We have a SentencePiece fixture for testing
-        tokenizer = LayoutXLMTokenizer(SAMPLE_VOCAB, keep_accents=True)
+        # Extract vocab from SentencePiece model
+        extractor = SentencePieceExtractor(SAMPLE_VOCAB)
+        vocab_ids, vocab_scores, merges = extractor.extract()
+
+        # Create tokenizer from vocab
+        tokenizer = LayoutXLMTokenizer(vocab=vocab_scores)
         tokenizer.save_pretrained(cls.tmpdirname)
 
+    def convert_batch_encode_plus_format_to_encode_plus(self, batch_encode_plus_sequences):
+        """Helper method to convert batch_encode_plus output to list of encode_plus outputs"""
+        # Get the batch size
+        first_key = list(batch_encode_plus_sequences.keys())[0]
+        batch_size = len(batch_encode_plus_sequences[first_key])
+
+        # Convert to list of dicts
+        encode_plus_sequences = []
+        for i in range(batch_size):
+            single_sequence = {}
+            for key, value in batch_encode_plus_sequences.items():
+                if key != "encodings":  # Skip the encodings attribute
+                    single_sequence[key] = value[i]
+            encode_plus_sequences.append(single_sequence)
+
+        return encode_plus_sequences
+
     def get_input_output_texts(self, tokenizer):
         input_text = "UNwant\u00e9d,running"
         output_text = "unwanted, running"
@@ -143,17 +165,101 @@ def get_input_output_texts(self, tokenizer):
     def test_chat_template_batched(self):
         pass
 
+    def test_bos_token_with_add_bos_token_true(self):
+        # LayoutXLM requires pretokenized input with boxes
+        tokenizers = self.get_tokenizers()
+        for tokenizer in tokenizers:
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                words = ["hello", "world"]
+                boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
+
+                # LayoutXLM doesn't use add_bos_token, it uses post_processor
+                # Just verify it can encode without error
+                encoded = tokenizer.encode(words, boxes=boxes)
+                self.assertIsInstance(encoded, list)
+
+    def test_bos_token_with_add_bos_token_false(self):
+        # LayoutXLM requires pretokenized input with boxes
+        tokenizers = self.get_tokenizers()
+        for tokenizer in tokenizers:
+            with self.subTest(f"{tokenizer.__class__.__name__}"):
+                words = ["hello", "world"]
+                boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]
+
+                # LayoutXLM doesn't use add_bos_token, it uses post_processor
+                # Just verify it can encode without error
+                encoded = tokenizer.encode(words, boxes=boxes)
+                self.assertIsInstance(encoded, list)
+
+    def test_pad_token_initialization(self):
+        """Test that passing pad_token when creating a tokenizer works correctly."""
+        # LayoutXLM requires pretokenized input with boxes
+        tokenizer = self.get_tokenizer(pad_token="[PAD]")
+        # Verify the pad_token was set correctly
+        self.assertEqual(tokenizer.pad_token, "[PAD]")
+        self.assertIsNotNone(tokenizer.pad_token_id)
+
+        # Test with two sequences of different lengths to trigger padding
+        seq_0 = ["Test", "this", "method"]
+        seq_1 = ["With", "these", "inputs", "and", "some", "extra"]
+        boxes_0 = [[1, 2, 3, 4] for _ in seq_0]
+        boxes_1 = [[1, 2, 3, 4] for _ in seq_1]
+
+        # Test padding works with the custom pad_token
+        output_with_padding = tokenizer(
+            [seq_0, seq_1],
+            boxes=[boxes_0, boxes_1],
+            padding=True,
+        )
+
+        # Check padding was applied correctly
+        self.assertEqual(len(output_with_padding["input_ids"][0]), len(output_with_padding["input_ids"][1]))
+
+    def test_encode_basic_padding(self):
+        """Test basic left/right padding behavior using encode() method with max_length strategy."""
+        tokenizer = self.get_tokenizer(do_lower_case=False)
+        # LayoutXLM requires pretokenized input with boxes
+        words = ["Sequence"]
+        boxes = [[1, 2, 3, 4]]
+        padding_size = 10
+
+        # check correct behaviour if no pad_token_id exists and add it eventually
+        self._check_no_pad_token_padding(tokenizer, words)
+
+        padding_idx = tokenizer.pad_token_id
+
+        # Test right padding
+        encoded_sequence = tokenizer.encode(words, boxes=boxes)
+        sequence_length = len(encoded_sequence)
+        padded_sequence = tokenizer.encode(
+            words, boxes=boxes, max_length=sequence_length + padding_size, padding="max_length"
+        )
+        padded_sequence_length = len(padded_sequence)
+        assert sequence_length + padding_size == padded_sequence_length
+        assert encoded_sequence + [padding_idx] * padding_size == padded_sequence
+
+        # Test left padding
+        tokenizer.padding_side = "left"
+        encoded_sequence = tokenizer.encode(words, boxes=boxes)
+        sequence_length = len(encoded_sequence)
+        padded_sequence = tokenizer.encode(
+            words, boxes=boxes, max_length=sequence_length + padding_size, padding="max_length"
+        )
+        padded_sequence_length = len(padded_sequence)
+        assert sequence_length + padding_size == padded_sequence_length
+        assert [padding_idx] * padding_size + encoded_sequence == padded_sequence
+
     # override test in `test_tokenization_common.py` because of the required input format of the `__call__`` method of
     # this tokenizer
     def test_save_sentencepiece_tokenizer(self) -> None:
-        if not self.test_sentencepiece or not self.test_slow_tokenizer:
-            self.skipTest(reason="test_sentencepiece or test_slow_tokenizer is set to False")
+        if not self.test_sentencepiece:
+            self.skipTest(reason="test_sentencepiece is set to False")
         # We want to verify that we will be able to save the tokenizer even if the original files that were used to
         # build the tokenizer have been deleted in the meantime.
         words, boxes = self.get_words_and_boxes()
 
-        tokenizer_slow_1 = self.get_tokenizer()
-        encoding_tokenizer_slow_1 = tokenizer_slow_1(
+        tokenizer_1 = self.get_tokenizer()
+        encoding_tokenizer_1 = tokenizer_1(
             words,
             boxes=boxes,
         )
@@ -161,54 +267,45 @@ def test_save_sentencepiece_tokenizer(self) -> None:
         tmpdirname_1 = tempfile.mkdtemp()
         tmpdirname_2 = tempfile.mkdtemp()
 
-        tokenizer_slow_1.save_pretrained(tmpdirname_1)
-        tokenizer_slow_2 = self.tokenizer_class.from_pretrained(tmpdirname_1)
-        encoding_tokenizer_slow_2 = tokenizer_slow_2(
+        tokenizer_1.save_pretrained(tmpdirname_1)
+        tokenizer_2 = self.tokenizer_class.from_pretrained(tmpdirname_1)
+        encoding_tokenizer_2 = tokenizer_2(
             words,
             boxes=boxes,
         )
 
         shutil.rmtree(tmpdirname_1)
-        tokenizer_slow_2.save_pretrained(tmpdirname_2)
+        tokenizer_2.save_pretrained(tmpdirname_2)
 
-        tokenizer_slow_3 = self.tokenizer_class.from_pretrained(tmpdirname_2)
-        encoding_tokenizer_slow_3 = tokenizer_slow_3(
+        tokenizer_3 = self.tokenizer_class.from_pretrained(tmpdirname_2)
+        encoding_tokenizer_3 = tokenizer_3(
             words,
             boxes=boxes,
         )
         shutil.rmtree(tmpdirname_2)
 
-        self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_2)
-        self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_3)
+        self.assertEqual(encoding_tokenizer_1, encoding_tokenizer_2)
+        self.assertEqual(encoding_tokenizer_1, encoding_tokenizer_3)
 
     def test_split_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             special_token = ""
             special_sentence = f"Hey this is a {special_token} token"
-            _, _, boxes = self.get_question_words_and_boxes()
 
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_rust = self.get_rust_tokenizer(
-                    pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs
-                )
-                tokenizer_py = self.tokenizer_class.from_pretrained(
-                    pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs
+                tokenizer_r = self.get_tokenizer(
+                    pretrained_name, extra_special_tokens=[special_token], split_special_tokens=True, **kwargs
                 )
 
-                py_tokens_output = tokenizer_py.tokenize(special_sentence)
-                rust_tokens_output = tokenizer_rust.tokenize(special_sentence)
-
-                self.assertTrue(special_token not in py_tokens_output)
-                self.assertTrue(special_token not in rust_tokens_output)
+                # For LayoutXLM, tokenize works with strings (not requiring boxes)
+                r_tokens_output = tokenizer_r.tokenize(special_sentence)
+                self.assertTrue(special_token not in r_tokens_output)
 
-                py_tokens_output_unsplit = tokenizer_py.tokenize(special_sentence, split_special_tokens=False)
-                rust_tokens_output_unsplit = tokenizer_rust.tokenize(special_sentence, split_special_tokens=False)
-
-                self.assertTrue(special_token in py_tokens_output_unsplit)
-                self.assertTrue(special_token in rust_tokens_output_unsplit)
+                r_tokens_output_unsplit = tokenizer_r.tokenize(special_sentence, split_special_tokens=False)
+                self.assertTrue(special_token in r_tokens_output_unsplit)
 
                 tmpdirname = tempfile.mkdtemp()
-                tokenizer_py.save_pretrained(tmpdirname)
+                tokenizer_r.save_pretrained(tmpdirname)
                 fast_from_saved = self.tokenizer_class.from_pretrained(tmpdirname)
 
                 output_tokens_reloaded_split = fast_from_saved.tokenize(special_sentence)
@@ -237,7 +334,7 @@ def test_sequence_builders(self):
     def test_offsets_with_special_characters(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_r = self.get_tokenizer(pretrained_name, **kwargs)
 
                 words, boxes = self.get_words_and_boxes()
                 words[1] = tokenizer_r.mask_token
@@ -532,7 +629,7 @@ def test_number_of_added_tokens(self):
     def test_padding(self, max_length=50):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
+                tokenizer_r = self.get_tokenizer(pretrained_name, **kwargs)
                 tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
 
                 self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
@@ -712,49 +809,6 @@ def test_padding(self, max_length=50):
 
                 self.assert_batch_padded_input_match(input_r, input_p, max_length, pad_token_id)
 
-    def test_padding_warning_message_fast_tokenizer(self):
-        if not self.test_rust_tokenizer:
-            self.skipTest(reason="test_rust_tokenizer is set to False")
-
-        words, boxes = self.get_words_and_boxes_batch()
-
-        tokenizer_fast = self.get_rust_tokenizer()
-
-        encoding_fast = tokenizer_fast(
-            words,
-            boxes=boxes,
-        )
-
-        with self.assertLogs("transformers", level="WARNING") as cm:
-            tokenizer_fast.pad(encoding_fast)
-        self.assertEqual(len(cm.records), 1)
-        self.assertIn(
-            "Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to"
-            " encode the text followed by a call to the `pad` method to get a padded encoding.",
-            cm.records[0].message,
-        )
-
-        if not self.test_slow_tokenizer:
-            self.skipTest(reason="test_slow_tokenizer is set to False")
-
-        tokenizer_slow = self.get_tokenizer()
-
-        encoding_slow = tokenizer_slow(
-            words,
-            boxes=boxes,
-        )
-
-        with self.assertLogs(level="WARNING") as cm:
-            # We want to assert there are no warnings, but the 'assertLogs' method does not support that.
-            # Therefore, we are adding a dummy warning, and then we will assert it is the only warning.
-            logger.warning("Dummy warning")
-            tokenizer_slow.pad(encoding_slow)
-        self.assertEqual(len(cm.records), 1)
-        self.assertIn(
-            "Dummy warning",
-            cm.records[0].message,
-        )
-
     def test_call(self):
         # Tests that all call wrap to encode_plus and batch_encode_plus
         tokenizers = self.get_tokenizers(do_lower_case=False)
@@ -944,29 +998,20 @@ def test_tokenizer_slow_store_full_signature(self):
                 self.assertIn(parameter_name, tokenizer.init_kwargs)
 
     def test_build_inputs_with_special_tokens(self):
-        if not self.test_slow_tokenizer:
-            # as we don't have a slow version, we can't compare the outputs between slow and fast versions
-            self.skipTest(reason="test_slow_tokenizer is set to False")
-
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
+                tokenizer_r = self.get_tokenizer(pretrained_name, **kwargs)
 
                 # Input tokens id
                 words, boxes = self.get_words_and_boxes()
-                input_simple = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False)
-                input_pair = tokenizer_p.encode(words, boxes=boxes, add_special_tokens=False)
+                input_simple = tokenizer_r.encode(words, boxes=boxes, add_special_tokens=False)
+                input_pair = tokenizer_r.encode(words, boxes=boxes, add_special_tokens=False)
 
                 # Generate output
-                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple)
-                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple)
-                self.assertEqual(output_p, output_r)
+                _ = tokenizer_r.build_inputs_with_special_tokens(input_simple)
 
                 # Generate pair output
-                output_r = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
-                output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair)
-                self.assertEqual(output_p, output_r)
+                _ = tokenizer_r.build_inputs_with_special_tokens(input_simple, input_pair)
 
     def test_special_tokens_mask_input_pairs(self):
         tokenizers = self.get_tokenizers(do_lower_case=False)
@@ -1231,104 +1276,16 @@ def test_rust_and_python_full_tokenizers(self):
             # as we don't have a slow version, we can't compare the outputs between slow and fast versions
             self.skipTest(reason="test_slow_tokenizer is set to False")
 
-        tokenizer = self.get_tokenizer()
-        rust_tokenizer = self.get_rust_tokenizer()
-
-        words, boxes = self.get_words_and_boxes()
-
-        ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
-        rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=False)
-        self.assertListEqual(ids, rust_ids)
-
-        ids = tokenizer.encode(words, boxes=boxes, add_special_tokens=True)
-        rust_ids = rust_tokenizer.encode(words, boxes=boxes, add_special_tokens=True)
-        self.assertListEqual(ids, rust_ids)
-
     def test_tokenization_python_rust_equals(self):
         if not self.test_slow_tokenizer:
             # as we don't have a slow version, we can't compare the outputs between slow and fast versions
             self.skipTest(reason="test_slow_tokenizer is set to False")
 
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-
-                words, boxes = self.get_words_and_boxes()
-
-                # Ensure basic input match
-                input_p = tokenizer_p.encode_plus(words, boxes=boxes)
-                input_r = tokenizer_r.encode_plus(words, boxes=boxes)
-
-                for key in filter(
-                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
-                ):
-                    self.assertSequenceEqual(input_p[key], input_r[key])
-
-                input_pairs_p = tokenizer_p.encode_plus(words, boxes=boxes)
-                input_pairs_r = tokenizer_r.encode_plus(words, boxes=boxes)
-
-                for key in filter(
-                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
-                ):
-                    self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key])
-
-                words = ["hello" for _ in range(1000)]
-                boxes = [[1000, 1000, 1000, 1000] for _ in range(1000)]
-
-                # Ensure truncation match
-                input_p = tokenizer_p.encode_plus(words, boxes=boxes, max_length=512, truncation=True)
-                input_r = tokenizer_r.encode_plus(words, boxes=boxes, max_length=512, truncation=True)
-
-                for key in filter(
-                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
-                ):
-                    self.assertSequenceEqual(input_p[key], input_r[key])
-
-                # Ensure truncation with stride match
-                input_p = tokenizer_p.encode_plus(
-                    words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
-                )
-                input_r = tokenizer_r.encode_plus(
-                    words, boxes=boxes, max_length=512, truncation=True, stride=3, return_overflowing_tokens=True
-                )
-
-                for key in filter(
-                    lambda x: x in ["input_ids", "token_type_ids", "attention_mask", "bbox"], input_p.keys()
-                ):
-                    self.assertSequenceEqual(input_p[key], input_r[key][0])
-
     def test_embedded_special_tokens(self):
         if not self.test_slow_tokenizer:
             # as we don't have a slow version, we can't compare the outputs between slow and fast versions
             self.skipTest(reason="test_slow_tokenizer is set to False")
 
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-                words, boxes = self.get_words_and_boxes()
-                tokens_r = tokenizer_r.encode_plus(
-                    words,
-                    boxes=boxes,
-                    add_special_tokens=True,
-                )
-                tokens_p = tokenizer_p.encode_plus(
-                    words,
-                    boxes=boxes,
-                    add_special_tokens=True,
-                )
-
-                for key in tokens_p:
-                    self.assertEqual(tokens_r[key], tokens_p[key])
-
-                if "token_type_ids" in tokens_r:
-                    self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
-
-                tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
-                tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
-                self.assertSequenceEqual(tokens_r, tokens_p)
-
     def test_compare_add_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -1415,7 +1372,7 @@ def test_special_tokens_initialization(self):
                 added_tokens = [AddedToken("", lstrip=True)]
 
                 tokenizer_r = self.rust_tokenizer_class.from_pretrained(
-                    pretrained_name, additional_special_tokens=added_tokens, **kwargs
+                    pretrained_name, extra_special_tokens=added_tokens, **kwargs
                 )
                 words = "Hey this is a  token".split()
                 boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
@@ -1427,31 +1384,12 @@ def test_special_tokens_initialization(self):
 
                 self.assertTrue(special_token_id in r_output)
 
-                if self.test_slow_tokenizer:
-                    tokenizer_cr = self.rust_tokenizer_class.from_pretrained(
-                        pretrained_name, additional_special_tokens=added_tokens, **kwargs, from_slow=True
-                    )
-                    tokenizer_p = self.tokenizer_class.from_pretrained(
-                        pretrained_name, additional_special_tokens=added_tokens, **kwargs
-                    )
-
-                    words = "Hey this is a  token".split()
-                    boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
-
-                    p_output = tokenizer_p.encode(words, boxes=boxes)
-                    cr_output = tokenizer_cr.encode(words, boxes=boxes)
-
-                    self.assertEqual(p_output, r_output)
-                    self.assertEqual(cr_output, r_output)
-                    self.assertTrue(special_token_id in p_output)
-                    self.assertTrue(special_token_id in cr_output)
-
     def test_training_new_tokenizer(self):
         # This feature only exists for fast tokenizers
         if not self.test_rust_tokenizer:
             self.skipTest(reason="test_rust_tokenizer is set to False")
 
-        tokenizer = self.get_rust_tokenizer()
+        tokenizer = self.get_tokenizer()
         new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100)
 
         # Test we can use the new tokenizer with something not seen during training
@@ -1472,15 +1410,18 @@ def test_training_new_tokenizer(self):
         self.assertEqual(tokenizer.num_special_tokens_to_add(True), new_tokenizer.num_special_tokens_to_add(True))
 
         # Check we have the correct max_length for both pair and non-pair inputs.
-        self.assertEqual(tokenizer.max_len_single_sentence, new_tokenizer.max_len_single_sentence)
-        self.assertEqual(tokenizer.max_len_sentences_pair, new_tokenizer.max_len_sentences_pair)
-
-        # Assert the set of special tokens match as we didn't ask to change them
-        self.assertSequenceEqual(
-            tokenizer.all_special_tokens_extended,
-            new_tokenizer.all_special_tokens_extended,
+        # max_len_single_sentence = model_max_length - num_special_tokens_to_add(pair=False)
+        self.assertEqual(
+            tokenizer.model_max_length - tokenizer.num_special_tokens_to_add(pair=False),
+            new_tokenizer.model_max_length - new_tokenizer.num_special_tokens_to_add(pair=False),
+        )
+        # max_len_sentences_pair = model_max_length - num_special_tokens_to_add(pair=True)
+        self.assertEqual(
+            tokenizer.model_max_length - tokenizer.num_special_tokens_to_add(pair=True),
+            new_tokenizer.model_max_length - new_tokenizer.num_special_tokens_to_add(pair=True),
         )
 
+        # Assert the set of special tokens match as we didn't ask to change them
         self.assertDictEqual(tokenizer.special_tokens_map, new_tokenizer.special_tokens_map)
 
     def test_training_new_tokenizer_with_special_tokens_change(self):
@@ -1488,7 +1429,7 @@ def test_training_new_tokenizer_with_special_tokens_change(self):
         if not self.test_rust_tokenizer:
             self.skipTest(reason="test_rust_tokenizer is set to False")
 
-        tokenizer = self.get_rust_tokenizer()
+        tokenizer = self.get_tokenizer()
         # Test with a special tokens map
         class_signature = inspect.signature(tokenizer.__class__)
         if "cls_token" in class_signature.parameters:
@@ -1500,8 +1441,7 @@ def test_training_new_tokenizer_with_special_tokens_change(self):
             self.assertEqual(new_tokenizer.cls_token_id, cls_id)
 
         # Create a new mapping from the special tokens defined in the original tokenizer
-        special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy()
-        special_tokens_list.remove("additional_special_tokens")
+        special_tokens_list = PreTrainedTokenizerBase.SPECIAL_TOKENS_ATTRIBUTES.copy()
         special_tokens_map = {}
         for token in special_tokens_list:
             # Get the private one to avoid unnecessary warnings.
@@ -1528,12 +1468,25 @@ def test_training_new_tokenizer_with_special_tokens_change(self):
                 self.assertEqual(getattr(new_tokenizer, f"{token}_id"), new_id)
 
         # Check if the AddedToken / string format has been kept
-        for special_token in tokenizer.all_special_tokens_extended:
+        tokenizer_special_tokens = [
+            tok
+            for value in tokenizer._special_tokens_map.values()
+            if value
+            for tok in (value if isinstance(value, (list, tuple)) else [value])
+        ]
+        new_tokenizer_special_tokens = [
+            tok
+            for value in new_tokenizer._special_tokens_map.values()
+            if value
+            for tok in (value if isinstance(value, (list, tuple)) else [value])
+        ]
+
+        for special_token in tokenizer_special_tokens:
             if isinstance(special_token, AddedToken) and special_token.content not in special_tokens_map:
                 # The special token must appear identically in the list of the new tokenizer.
                 self.assertTrue(
-                    special_token in new_tokenizer.all_special_tokens_extended,
-                    f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}",
+                    special_token in new_tokenizer_special_tokens,
+                    f"'{special_token}' should be in {new_tokenizer_special_tokens}",
                 )
             elif isinstance(special_token, AddedToken):
                 # The special token must appear in the list of the new tokenizer as an object of type AddedToken with
@@ -1542,7 +1495,7 @@ def test_training_new_tokenizer_with_special_tokens_change(self):
                 new_special_token_str = special_tokens_map[special_token_str]
 
                 find = False
-                for candidate in new_tokenizer.all_special_tokens_extended:
+                for candidate in new_tokenizer_special_tokens:
                     if (
                         isinstance(candidate, AddedToken)
                         and candidate.content == new_special_token_str
@@ -1556,19 +1509,19 @@ def test_training_new_tokenizer_with_special_tokens_change(self):
                 self.assertTrue(
                     find,
                     f"'{new_special_token_str}' doesn't appear in the list "
-                    f"'{new_tokenizer.all_special_tokens_extended}' as an AddedToken with the same parameters as "
-                    f"'{special_token}' in the list {tokenizer.all_special_tokens_extended}",
+                    f"'{new_tokenizer_special_tokens}' as an AddedToken with the same parameters as "
+                    f"'{special_token}' in the list {tokenizer_special_tokens}",
                 )
             elif special_token not in special_tokens_map:
                 # The special token must appear identically in the list of the new tokenizer.
                 self.assertTrue(
-                    special_token in new_tokenizer.all_special_tokens_extended,
-                    f"'{special_token}' should be in {new_tokenizer.all_special_tokens_extended}",
+                    special_token in new_tokenizer_special_tokens,
+                    f"'{special_token}' should be in {new_tokenizer_special_tokens}",
                 )
 
             else:
                 # The special token must appear in the list of the new tokenizer as an object of type string.
-                self.assertTrue(special_tokens_map[special_token] in new_tokenizer.all_special_tokens_extended)
+                self.assertTrue(special_tokens_map[special_token] in new_tokenizer_special_tokens)
 
         # Test we can use the new tokenizer with something not seen during training
         words = [["this", "is"], ["hello", "🤗"]]
@@ -1586,7 +1539,7 @@ def test_prepare_for_model(self):
         tokenizers = self.get_tokenizers(do_lower_case=False)
         for tokenizer in tokenizers:
             # only test prepare_for_model for the slow tokenizer
-            if tokenizer.__class__.__name__ == "LayoutXLMTokenizerFast":
+            if tokenizer.__class__.__name__ == "LayoutXLMTokenizer":
                 continue
             with self.subTest(f"{tokenizer.__class__.__name__}"):
                 words, boxes = self.get_words_and_boxes()
@@ -1601,37 +1554,6 @@ def test_padding_different_model_input_name(self):
             # as we don't have a slow version, we can't compare the outputs between slow and fast versions
             self.skipTest(reason="test_slow_tokenizer is set to False")
 
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-                self.assertEqual(tokenizer_p.pad_token_id, tokenizer_r.pad_token_id)
-                pad_token_id = tokenizer_p.pad_token_id
-
-                words, boxes = self.get_words_and_boxes_batch()
-
-                input_r = tokenizer_r.batch_encode_plus(words, boxes=boxes)
-                input_p = tokenizer_r.batch_encode_plus(words, boxes=boxes)
-
-                # rename encoded batch to "inputs"
-                input_r["inputs"] = input_r[tokenizer_r.model_input_names[0]]
-                del input_r[tokenizer_r.model_input_names[0]]
-
-                input_p["inputs"] = input_p[tokenizer_p.model_input_names[0]]
-                del input_p[tokenizer_p.model_input_names[0]]
-
-                # Renaming `input_ids` to `inputs`
-                tokenizer_r.model_input_names = ["inputs"] + tokenizer_r.model_input_names[1:]
-                tokenizer_p.model_input_names = ["inputs"] + tokenizer_p.model_input_names[1:]
-
-                input_r = tokenizer_r.pad(input_r, padding="longest")
-                input_p = tokenizer_r.pad(input_p, padding="longest")
-
-                max_length = len(input_p["inputs"][0])
-                self.assert_batch_padded_input_match(
-                    input_r, input_p, max_length, pad_token_id, model_main_input_name="inputs"
-                )
-
     def test_batch_encode_dynamic_overflowing(self):
         """
         When calling batch_encode with multiple sequences, it can return different number of
@@ -1694,72 +1616,6 @@ def test_save_pretrained(self):
             # as we don't have a slow version, we can't compare the outputs between slow and fast versions
             self.skipTest(reason="test_slow_tokenizer is set to False")
 
-        self.tokenizers_list[0] = (self.rust_tokenizer_class, "hf-internal-testing/tiny-random-layoutxlm", {})
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-
-                tmpdirname2 = tempfile.mkdtemp()
-
-                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
-                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
-
-                # Checks it save with the same files + the tokenizer.json file for the fast one
-                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
-                tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
-                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
-
-                # Checks everything loads correctly in the same way
-                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
-                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
-
-                # Check special tokens are set accordingly on Rust and Python
-                for key in tokenizer_pp.special_tokens_map:
-                    self.assertTrue(hasattr(tokenizer_rp, key))
-                    # self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key))
-                    # self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id"))
-
-                shutil.rmtree(tmpdirname2)
-
-                # Save tokenizer rust, legacy_format=True
-                tmpdirname2 = tempfile.mkdtemp()
-
-                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
-                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
-
-                # Checks it save with the same files
-                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
-
-                # Checks everything loads correctly in the same way
-                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
-                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
-
-                # Check special tokens are set accordingly on Rust and Python
-                for key in tokenizer_pp.special_tokens_map:
-                    self.assertTrue(hasattr(tokenizer_rp, key))
-
-                shutil.rmtree(tmpdirname2)
-
-                # Save tokenizer rust, legacy_format=False
-                tmpdirname2 = tempfile.mkdtemp()
-
-                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
-                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
-
-                # Checks it saved the tokenizer.json file
-                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
-
-                # Checks everything loads correctly in the same way
-                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
-                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
-
-                # Check special tokens are set accordingly on Rust and Python
-                for key in tokenizer_pp.special_tokens_map:
-                    self.assertTrue(hasattr(tokenizer_rp, key))
-
-                shutil.rmtree(tmpdirname2)
-
     @unittest.skip(reason="TO DO: overwrite this very extensive test.")
     def test_alignment_methods(self):
         pass
@@ -1790,17 +1646,8 @@ def test_only_label_first_subword(self):
         boxes = [[1000, 1000, 1000, 1000] for _ in range(len(words))]
         word_labels = [0, 1]
 
-        # test slow tokenizer
-        tokenizer_p = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")
-        encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
-        self.assertListEqual(encoding.labels, [-100, 0, -100, 1, -100, -100])
-
-        tokenizer_p = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base", only_label_first_subword=False)
-        encoding = tokenizer_p(words, boxes=boxes, word_labels=word_labels)
-        self.assertListEqual(encoding.labels, [-100, 0, 0, 1, 1, -100])
-
         # test fast tokenizer
-        tokenizer_r = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base")
+        tokenizer_r = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")
         encoding = tokenizer_r(words, boxes=boxes, word_labels=word_labels)
         self.assertListEqual(encoding.labels, [-100, 0, -100, 1, -100, -100])
 
@@ -1810,8 +1657,7 @@ def test_only_label_first_subword(self):
 
     @slow
     def test_layoutxlm_integration_test(self):
-        tokenizer_p = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")
-        tokenizer_r = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base")
+        tokenizer_r = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")
 
         # There are 3 cases:
         # CASE 1: document image classification (training + inference), document image token classification (inference),
@@ -1828,9 +1674,7 @@ def test_layoutxlm_integration_test(self):
 
         expected_results = {'input_ids': [0, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}  # fmt: skip
 
-        encoding_p = tokenizer_p(words, boxes=boxes, padding="max_length", max_length=20)
         encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20)
-        self.assertDictEqual(dict(encoding_p), expected_results)
         self.assertDictEqual(dict(encoding_r), expected_results)
 
         # CASE 1: batched
@@ -1838,9 +1682,7 @@ def test_layoutxlm_integration_test(self):
 
         expected_results = {'input_ids': [[0, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 33600, 31, 759, 9351, 83, 21895, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [961, 885, 992, 912], [961, 885, 992, 912], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # fmt: skip
 
-        encoding_p = tokenizer_p(words, boxes=boxes, padding="max_length", max_length=20)
         encoding_r = tokenizer_r(words, boxes=boxes, padding="max_length", max_length=20)
-        self.assertDictEqual(dict(encoding_p), expected_results)
         self.assertDictEqual(dict(encoding_r), expected_results)
 
         # CASE 2: not batched
@@ -1849,9 +1691,7 @@ def test_layoutxlm_integration_test(self):
 
         expected_results = {'input_ids': [0, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'bbox': [[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'labels': [-100, 1, 2, -100, 3, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}  # fmt: skip
 
-        encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
         encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
-        self.assertDictEqual(dict(encoding_p), expected_results)
         self.assertDictEqual(dict(encoding_r), expected_results)
 
         # CASE 2: batched
@@ -1860,9 +1700,7 @@ def test_layoutxlm_integration_test(self):
 
         expected_results = {'input_ids': [[0, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 33600, 31, 759, 9351, 83, 21895, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'bbox': [[[0, 0, 0, 0], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [961, 885, 992, 912], [961, 885, 992, 912], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [34, 42, 66, 69], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], 'labels': [[-100, 1, 2, -100, 3, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], [-100, 2, -100, 46, 17, 22, 3, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}  # fmt: skip
 
-        encoding_p = tokenizer_p(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
         encoding_r = tokenizer_r(words, boxes=boxes, word_labels=word_labels, padding="max_length", max_length=20)
-        self.assertDictEqual(dict(encoding_p), expected_results)
         self.assertDictEqual(dict(encoding_r), expected_results)
 
         # CASE 3: not batched
@@ -1870,18 +1708,14 @@ def test_layoutxlm_integration_test(self):
 
         expected_results = {'input_ids': [0, 2367, 25, 7, 1919, 9351, 32, 2, 2, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], 'bbox': [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [1000, 1000, 1000, 1000], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]}  # fmt: skip
 
-        encoding_p = tokenizer_p(question, words, boxes, padding="max_length", max_length=20)
         encoding_r = tokenizer_r(question, words, boxes, padding="max_length", max_length=20)
-        self.assertDictEqual(dict(encoding_p), expected_results)
         self.assertDictEqual(dict(encoding_r), expected_results)
 
         # CASE 3: batched
         questions, words, boxes = self.get_question_words_and_boxes_batch()
 
         expected_results = {'input_ids': [[0, 2367, 25, 7, 1919, 9351, 32, 2, 2, 10, 179459, 538, 3034, 2, 1, 1, 1, 1, 1, 1], [0, 3642, 83, 764, 35839, 32, 2, 2, 2367, 10, 21, 3190, 53496, 19, 2, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]], 'bbox': [[[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [1000, 1000, 1000, 1000], [423, 237, 440, 251], [427, 272, 441, 287], [427, 272, 441, 287], [419, 115, 437, 129], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [1000, 1000, 1000, 1000], [1000, 1000, 1000, 1000], [256, 38, 330, 58], [256, 38, 330, 58], [336, 42, 353, 57], [336, 42, 353, 57], [34, 42, 66, 69], [34, 42, 66, 69], [1000, 1000, 1000, 1000], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]}  # fmt: skip
-        encoding_p = tokenizer_p(questions, words, boxes, padding="max_length", max_length=20)
         encoding_r = tokenizer_r(questions, words, boxes, padding="max_length", max_length=20)
-        self.assertDictEqual(dict(encoding_p), expected_results)
         self.assertDictEqual(dict(encoding_r), expected_results)
 
     @unittest.skip(reason="Doesn't support returning Numpy arrays")
diff --git a/tests/models/led/test_tokenization_led.py b/tests/models/led/test_tokenization_led.py
deleted file mode 100644
index 411b31d44f7a..000000000000
--- a/tests/models/led/test_tokenization_led.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import json
-import os
-import unittest
-from functools import cached_property
-
-from transformers import BatchEncoding, LEDTokenizer, LEDTokenizerFast
-from transformers.models.led.tokenization_led import VOCAB_FILES_NAMES
-from transformers.testing_utils import require_tokenizers, require_torch
-
-from ...test_tokenization_common import TokenizerTesterMixin
-
-
-@require_tokenizers
-class TestTokenizationLED(TokenizerTesterMixin, unittest.TestCase):
-    from_pretrained_id = "allenai/led-base-16384"
-    tokenizer_class = LEDTokenizer
-    rust_tokenizer_class = LEDTokenizerFast
-    test_rust_tokenizer = True
-
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-
-        vocab = [
-            "l",
-            "o",
-            "w",
-            "e",
-            "r",
-            "s",
-            "t",
-            "i",
-            "d",
-            "n",
-            "\u0120",
-            "\u0120l",
-            "\u0120n",
-            "\u0120lo",
-            "\u0120low",
-            "er",
-            "\u0120lowest",
-            "\u0120newer",
-            "\u0120wider",
-            "",
-        ]
-        vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
-        cls.special_tokens_map = {"unk_token": ""}
-
-        cls.vocab_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        cls.merges_file = os.path.join(cls.tmpdirname, VOCAB_FILES_NAMES["merges_file"])
-        with open(cls.vocab_file, "w", encoding="utf-8") as fp:
-            fp.write(json.dumps(vocab_tokens) + "\n")
-        with open(cls.merges_file, "w", encoding="utf-8") as fp:
-            fp.write("\n".join(merges))
-
-    @classmethod
-    def get_tokenizer(cls, pretrained_name=None, **kwargs):
-        kwargs.update(cls.special_tokens_map)
-        pretrained_name = pretrained_name or cls.tmpdirname
-        return cls.tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-
-    @classmethod
-    def get_rust_tokenizer(cls, pretrained_name=None, **kwargs):
-        kwargs.update(cls.special_tokens_map)
-        pretrained_name = pretrained_name or cls.tmpdirname
-        return cls.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)
-
-    def get_input_output_texts(self, tokenizer):
-        return "lower newer", "lower newer"
-
-    @cached_property
-    def default_tokenizer(self):
-        return LEDTokenizer.from_pretrained("allenai/led-base-16384")
-
-    @cached_property
-    def default_tokenizer_fast(self):
-        return LEDTokenizerFast.from_pretrained("allenai/led-base-16384")
-
-    @require_torch
-    def test_prepare_batch(self):
-        src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."]
-        expected_src_tokens = [0, 250, 251, 17818, 13, 39186, 1938, 4, 2]
-
-        for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
-            batch = tokenizer(src_text, max_length=len(expected_src_tokens), padding=True, return_tensors="pt")
-            self.assertIsInstance(batch, BatchEncoding)
-
-            self.assertEqual((2, 9), batch.input_ids.shape)
-            self.assertEqual((2, 9), batch.attention_mask.shape)
-            result = batch.input_ids.tolist()[0]
-            self.assertListEqual(expected_src_tokens, result)
-
-    @require_torch
-    def test_prepare_batch_empty_target_text(self):
-        src_text = ["A long paragraph for summarization.", "Another paragraph for summarization."]
-        for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
-            batch = tokenizer(src_text, padding=True, return_tensors="pt")
-            self.assertIn("input_ids", batch)
-            self.assertIn("attention_mask", batch)
-            self.assertNotIn("labels", batch)
-            self.assertNotIn("decoder_attention_mask", batch)
-
-    @require_torch
-    def test_tokenizer_as_target_length(self):
-        tgt_text = [
-            "Summary of the text.",
-            "Another summary.",
-        ]
-        for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
-            targets = tokenizer(text_target=tgt_text, max_length=32, padding="max_length", return_tensors="pt")
-            self.assertEqual(32, targets["input_ids"].shape[1])
-
-    @require_torch
-    def test_prepare_batch_not_longer_than_maxlen(self):
-        for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
-            batch = tokenizer(
-                ["I am a small frog" * 1024, "I am a small frog"], padding=True, truncation=True, return_tensors="pt"
-            )
-            self.assertIsInstance(batch, BatchEncoding)
-            self.assertEqual(batch.input_ids.shape, (2, 5122))
-
-    @require_torch
-    def test_special_tokens(self):
-        src_text = ["A long paragraph for summarization."]
-        tgt_text = [
-            "Summary of the text.",
-        ]
-        for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
-            inputs = tokenizer(src_text, return_tensors="pt")
-            targets = tokenizer(text_target=tgt_text, return_tensors="pt")
-            input_ids = inputs["input_ids"]
-            labels = targets["input_ids"]
-            self.assertTrue((input_ids[:, 0] == tokenizer.bos_token_id).all().item())
-            self.assertTrue((labels[:, 0] == tokenizer.bos_token_id).all().item())
-            self.assertTrue((input_ids[:, -1] == tokenizer.eos_token_id).all().item())
-            self.assertTrue((labels[:, -1] == tokenizer.eos_token_id).all().item())
-
-    @require_torch
-    def test_global_attention_mask(self):
-        for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
-            src_text = ["Summary of the text.", "Another summary."]
-            expected_global_attention_mask = [[0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, -1, -1]]
-
-            encoded_output = tokenizer(src_text, padding=False)
-            encoded_output["global_attention_mask"] = [[0] * len(x) for x in encoded_output["input_ids"]]
-            outputs = tokenizer.pad(encoded_output)
-            self.assertSequenceEqual(outputs["global_attention_mask"], expected_global_attention_mask)
-
-    @unittest.skip
-    def test_pretokenized_inputs(self):
-        pass
-
-    def test_embedded_special_tokens(self):
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-                sentence = "A,  AllenNLP sentence."
-                tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
-                tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True)
-                self.assertEqual(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"]))
-                self.assertEqual(
-                    sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]),
-                    sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]),
-                )
-
-                tokens_r_str = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"])
-                tokens_p_str = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"])
-                self.assertSequenceEqual(tokens_p["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
-                self.assertSequenceEqual(tokens_r["input_ids"], [0, 250, 6, 50264, 3823, 487, 21992, 3645, 4, 2])
-
-                self.assertSequenceEqual(
-                    tokens_p_str, ["", "A", ",", "", "ĠAllen", "N", "LP", "Ġsentence", ".", ""]
-                )
-                self.assertSequenceEqual(
-                    tokens_r_str, ["", "A", ",", "", "ĠAllen", "N", "LP", "Ġsentence", ".", ""]
-                )
diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py
index 0a5bbcf2edf9..360901205627 100644
--- a/tests/models/llama/test_tokenization_llama.py
+++ b/tests/models/llama/test_tokenization_llama.py
@@ -1,52 +1,12 @@
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import shutil
-import tempfile
 import unittest
 
-from datasets import load_dataset
-from huggingface_hub import hf_hub_download
-
-from transformers import (
-    SPIECE_UNDERLINE,
-    AddedToken,
-    AutoTokenizer,
-    LlamaTokenizer,
-    LlamaTokenizerFast,
-    PreTrainedTokenizerFast,
-)
-from transformers.convert_slow_tokenizer import convert_slow_tokenizer
+from tests.test_tokenization_common import TokenizerTesterMixin
+from transformers.models.llama.tokenization_llama import LlamaTokenizer
 from transformers.testing_utils import (
-    get_tests_dir,
-    nested_simplify,
-    require_read_token,
-    require_sentencepiece,
-    require_tiktoken,
     require_tokenizers,
-    require_torch,
-    slow,
 )
 
-from ...test_tokenization_common import TokenizerTesterMixin
-
-
-SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
 
-
-@require_sentencepiece
 @require_tokenizers
 class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
     from_pretrained_id = [
@@ -55,826 +15,23 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         "meta-llama/Meta-Llama-3-8B",
     ]
     tokenizer_class = LlamaTokenizer
-    rust_tokenizer_class = LlamaTokenizerFast
-
-    test_rust_tokenizer = False
-    test_sentencepiece = True
     from_pretrained_kwargs = {}
 
+    # Integration test data - expected outputs for the default input string
+    integration_expected_tokens = ["▁This", "▁is", "▁a", "▁test", "▁", "<0xF0>", "<0x9F>", "<0x98>", "<0x8A>", "<0x0A>", "I", "▁was", "▁born", "▁in", "▁", "9", "2", "0", "0", "0", ",", "▁and", "▁this", "▁is", "▁f", "als", "é", ".", "<0x0A>", "生", "活", "的", "真", "<0xE8>", "<0xB0>", "<0x9B>", "是", "<0x0A>", "Hi", "▁", "▁Hello", "<0x0A>", "Hi", "▁▁", "▁Hello", "<0x0A>", "<0x0A>", "▁", "<0x0A>", "▁▁", "<0x0A>", "▁Hello", "<0x0A>", "", "<0x0A>", "hi", "", "there", "<0x0A>", "The", "▁following", "▁string", "▁should", "▁be", "▁properly", "▁encoded", ":", "▁Hello", ".", "<0x0A>", "But", "▁", "ird", "▁and", "▁", "ป", "ี", "▁▁▁", "ird", "▁▁▁", "ด", "<0x0A>", "H", "ey", "▁how", "▁are", "▁you", "▁doing"]  # fmt: skip
+    integration_expected_token_ids = [910, 338, 263, 1243, 29871, 243, 162, 155, 141, 13, 29902, 471, 6345, 297, 29871, 29929, 29906, 29900, 29900, 29900, 29892, 322, 445, 338, 285, 1338, 29948, 29889, 13, 30486, 31704, 30210, 30848, 235, 179, 158, 30392, 13, 18567, 29871, 15043, 13, 18567, 259, 15043, 13, 13, 29871, 13, 259, 13, 15043, 13, 1, 13, 2918, 1, 12711, 13, 1576, 1494, 1347, 881, 367, 6284, 18511, 29901, 15043, 29889, 13, 6246, 29871, 1823, 322, 29871, 31010, 30691, 1678, 1823, 1678, 30718, 13, 29950, 1032, 920, 526, 366, 2599]  # fmt: skip
+    integration_expected_decoded_text = "This is a test 😊\nI was born in 92000, and this is falsé.\n生活的真谛是\nHi  Hello\nHi   Hello\n\n \n  \n Hello\n\nhithere\nThe following string should be properly encoded: Hello.\nBut ird and ปี   ird   ด\nHey how are you doing"
+
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
 
-        # We have a SentencePiece fixture for testing
-        tokenizer = LlamaTokenizer(SAMPLE_VOCAB, keep_accents=True)
+        from_pretrained_id = "hf-internal-testing/llama-tokenizer"
+
+        tokenizer = LlamaTokenizer.from_pretrained(from_pretrained_id)
         tokenizer.pad_token = tokenizer.eos_token
         tokenizer.save_pretrained(cls.tmpdirname)
 
     def get_tokenizers(self, **kwargs):
-        kwargs.update({"pad_token": ""})
+        kwargs.setdefault("pad_token", "")
         return super().get_tokenizers(**kwargs)
-
-    def test_full_tokenizer(self):
-        tokenizer = LlamaTokenizer(SAMPLE_VOCAB, keep_accents=True)
-
-        tokens = tokenizer.tokenize("This is a test")
-        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
-
-        self.assertListEqual(
-            tokenizer.convert_tokens_to_ids(tokens),
-            [285, 46, 10, 170, 382],
-        )
-
-        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
-        self.assertListEqual(
-            tokens,
-            [
-                SPIECE_UNDERLINE + "I",
-                SPIECE_UNDERLINE + "was",
-                SPIECE_UNDERLINE + "b",
-                "or",
-                "n",
-                SPIECE_UNDERLINE + "in",
-                SPIECE_UNDERLINE + "",
-                "9",
-                "2",
-                "0",
-                "0",
-                "0",
-                ",",
-                SPIECE_UNDERLINE + "and",
-                SPIECE_UNDERLINE + "this",
-                SPIECE_UNDERLINE + "is",
-                SPIECE_UNDERLINE + "f",
-                "al",
-                "s",
-                "é",
-                ".",
-            ],
-        )
-        ids = tokenizer.convert_tokens_to_ids(tokens)
-        self.assertListEqual(
-            ids,
-            [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4],
-        )
-
-        back_tokens = tokenizer.convert_ids_to_tokens(ids)
-        self.assertListEqual(
-            back_tokens,
-            [
-                SPIECE_UNDERLINE + "I",
-                SPIECE_UNDERLINE + "was",
-                SPIECE_UNDERLINE + "b",
-                "or",
-                "n",
-                SPIECE_UNDERLINE + "in",
-                SPIECE_UNDERLINE + "",
-                "",
-                "2",
-                "0",
-                "0",
-                "0",
-                ",",
-                SPIECE_UNDERLINE + "and",
-                SPIECE_UNDERLINE + "this",
-                SPIECE_UNDERLINE + "is",
-                SPIECE_UNDERLINE + "f",
-                "al",
-                "s",
-                "",
-                ".",
-            ],
-        )
-
-    @unittest.skip(reason="Let's wait for the fast tokenizer!")
-    def test_save_pretrained(self):
-        self.tokenizers_list += (self.rust_tokenizer_class, "hf-internal-testing/llama-tokenizer", {})
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                tokenizer_r = self.get_rust_tokenizer(pretrained_name, **kwargs)
-                tokenizer_p = self.get_tokenizer(pretrained_name, **kwargs)
-
-                tmpdirname2 = tempfile.mkdtemp()
-
-                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
-                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
-
-                # Checks it save with the same files + the tokenizer.json file for the fast one
-                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
-                tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
-                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
-
-                # Checks everything loads correctly in the same way
-                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
-                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
-
-                # Check special tokens are set accordingly on Rust and Python
-                for key in tokenizer_pp.special_tokens_map:
-                    self.assertTrue(hasattr(tokenizer_rp, key))
-
-                shutil.rmtree(tmpdirname2)
-
-                # Save tokenizer rust, legacy_format=True
-                tmpdirname2 = tempfile.mkdtemp()
-
-                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
-                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
-
-                # Checks it save with the same files
-                self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
-
-                # Checks everything loads correctly in the same way
-                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
-                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
-
-                # Check special tokens are set accordingly on Rust and Python
-                for key in tokenizer_pp.special_tokens_map:
-                    self.assertTrue(hasattr(tokenizer_rp, key))
-
-                shutil.rmtree(tmpdirname2)
-
-                # Save tokenizer rust, legacy_format=False
-                tmpdirname2 = tempfile.mkdtemp()
-
-                tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
-                tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
-
-                # Checks it saved the tokenizer.json file
-                self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
-
-                # Checks everything loads correctly in the same way
-                tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
-                tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
-
-                # Check special tokens are set accordingly on Rust and Python
-                for key in tokenizer_pp.special_tokens_map:
-                    self.assertTrue(hasattr(tokenizer_rp, key))
-
-                shutil.rmtree(tmpdirname2)
-
-    @require_torch
-    def test_batch_tokenization(self):
-        if not self.test_seq2seq:
-            self.skipTest(reason="test_seq2seq is set to False")
-
-        tokenizers = self.get_tokenizers()
-        for tokenizer in tokenizers:
-            with self.subTest(f"{tokenizer.__class__.__name__}"):
-                # Longer text that will definitely require truncation.
-                text = [
-                    " UN Chief Says There Is No Military Solution in Syria",
-                    " Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for"
-                    " Syria is that 'there is no military solution' to the nearly five-year conflict and more weapons"
-                    " will only worsen the violence and misery for millions of people.",
-                ]
-                try:
-                    batch = tokenizer(
-                        text=text,
-                        max_length=3,
-                        return_tensors="pt",
-                    )
-                except NotImplementedError:
-                    self.skipTest(reason="Encountered NotImplementedError when calling tokenizer")
-                self.assertEqual(batch.input_ids.shape[1], 3)
-                # max_target_length will default to max_length if not specified
-                batch = tokenizer(text, max_length=3, return_tensors="pt")
-                self.assertEqual(batch.input_ids.shape[1], 3)
-
-                batch_encoder_only = tokenizer(text=text, max_length=3, return_tensors="pt")
-                self.assertEqual(batch_encoder_only.input_ids.shape[1], 3)
-                self.assertEqual(batch_encoder_only.attention_mask.shape[1], 3)
-                self.assertNotIn("decoder_input_ids", batch_encoder_only)
-
-    @unittest.skip(reason="Unfortunately way too slow to build a BPE with SentencePiece.")
-    def test_save_slow_from_fast_and_reload_fast(self):
-        pass
-
-    def test_special_tokens_initialization(self):
-        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
-            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
-                added_tokens = [AddedToken("", lstrip=True)]
-
-                tokenizer_r = self.get_rust_tokenizer(
-                    pretrained_name, additional_special_tokens=added_tokens, **kwargs
-                )
-                r_output = tokenizer_r.encode("Hey this is a  token")
-
-                special_token_id = tokenizer_r.encode("", add_special_tokens=False)[0]
-
-                self.assertTrue(special_token_id in r_output)
-
-                if self.test_slow_tokenizer:
-                    tokenizer_cr = self.get_rust_tokenizer(
-                        pretrained_name,
-                        additional_special_tokens=added_tokens,
-                        **kwargs,  # , from_slow=True <- unfortunately too slow to convert
-                    )
-                    tokenizer_p = self.tokenizer_class.from_pretrained(
-                        pretrained_name, additional_special_tokens=added_tokens, **kwargs
-                    )
-
-                    p_output = tokenizer_p.encode("Hey this is a  token")
-
-                    cr_output = tokenizer_cr.encode("Hey this is a  token")
-
-                    self.assertEqual(p_output, r_output)
-                    self.assertEqual(cr_output, r_output)
-                    self.assertTrue(special_token_id in p_output)
-                    self.assertTrue(special_token_id in cr_output)
-
-    @slow
-    def test_tokenizer_integration(self):
-        expected_encoding = {'input_ids': [[1, 4103, 689, 414, 313, 24784, 368, 2998, 408, 282, 3637, 25350, 29899, 9067, 414, 322, 282, 3637, 25350, 29899, 1457, 3018, 1312, 29899, 2151, 29897, 8128, 2498, 29899, 15503, 4220, 6956, 1973, 313, 13635, 29911, 29892, 402, 7982, 29899, 29906, 29892, 1528, 13635, 29911, 29874, 29892, 1060, 26369, 29892, 6652, 309, 29933, 814, 29892, 1060, 29931, 6779, 11410, 363, 18385, 17088, 7634, 11235, 313, 25103, 29965, 29897, 322, 18385, 17088, 28203, 313, 25103, 29954, 29897, 411, 975, 29871, 29941, 29906, 29974, 758, 3018, 1312, 4733, 297, 29871, 29896, 29900, 29900, 29974, 10276, 322, 6483, 1006, 3372, 3097, 1546, 435, 1165, 29892, 10772, 29911, 25350, 322, 323, 6073, 17907, 29889], [1, 350, 20161, 338, 8688, 304, 758, 29899, 14968, 6483, 21000, 8684, 284, 22540, 515, 443, 29880, 24025, 1426, 491, 14002, 368, 4195, 292, 373, 1716, 2175, 322, 1492, 3030, 297, 599, 15359, 29889], [1, 450, 4996, 17354, 1701, 29916, 432, 17204, 975, 278, 17366, 11203, 29889]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}  # fmt: skip
-
-        self.tokenizer_integration_test_util(
-            expected_encoding=expected_encoding,
-            model_name="hf-internal-testing/llama-tokenizer",
-            revision="0984d03108b1a041ed679bd253b6519b7e1a4778",
-            padding=False,
-        )
-
-    @unittest.skip(reason="worker 'gw4' crashed on CI, passing locally.")
-    def test_subword_regularization_tokenizer(self):
-        pass
-
-    def test_add_prefix_space(self):
-        pretrained_name = "hf-internal-testing/llama-tokenizer-non-normalized"
-        inputs = "Hey how are you doing"
-        EXPECTED_WITH_SPACE = [1, 18637, 920, 526, 366, 2599]
-        EXPECTED_WO_SPACE = [1, 29950, 1032, 920, 526, 366, 2599]
-
-        slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=False, legacy=False)
-        fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=False, legacy=False)
-        self.assertEqual(slow_.encode(inputs), EXPECTED_WO_SPACE)
-        self.assertEqual(slow_.encode(inputs), fast_.encode(inputs))
-        self.assertEqual(slow_.tokenize(inputs), ["H", "ey", "▁how", "▁are", "▁you", "▁doing"])
-        self.assertEqual(slow_.decode(EXPECTED_WO_SPACE, skip_special_tokens=True), inputs)
-        self.assertEqual(
-            slow_.decode(EXPECTED_WO_SPACE, skip_special_tokens=True),
-            fast_.decode(EXPECTED_WO_SPACE, skip_special_tokens=True),
-        )
-
-        slow_ = self.get_tokenizer(pretrained_name, add_prefix_space=True, legacy=False)
-        fast_ = self.get_rust_tokenizer(pretrained_name, add_prefix_space=True, legacy=False)
-        self.assertEqual(slow_.encode(inputs), EXPECTED_WITH_SPACE)
-        self.assertEqual(slow_.encode(inputs), fast_.encode(inputs))
-        self.assertEqual(slow_.tokenize(inputs), ["▁Hey", "▁how", "▁are", "▁you", "▁doing"])
-        self.assertEqual(slow_.decode(EXPECTED_WITH_SPACE, skip_special_tokens=True), inputs)
-        self.assertEqual(
-            slow_.decode(EXPECTED_WITH_SPACE, skip_special_tokens=True),
-            fast_.decode(EXPECTED_WITH_SPACE, skip_special_tokens=True),
-        )
-
-    def test_load_tokenizer_with_model_file_only(self):
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            hf_hub_download(repo_id="huggyllama/llama-7b", filename="tokenizer.model", local_dir=tmp_dir)
-            tokenizer_fast = self.rust_tokenizer_class.from_pretrained(tmp_dir)
-            self.assertEqual(tokenizer_fast.encode("This is a test"), [1, 910, 338, 263, 1243])
-
-            tokenizer_slow = self.tokenizer_class.from_pretrained(tmp_dir)
-            self.assertEqual(tokenizer_slow.encode("This is a test"), [1, 910, 338, 263, 1243])
-
-
-@require_torch
-@require_sentencepiece
-@require_tokenizers
-class LlamaIntegrationTest(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        checkpoint_name = "hf-internal-testing/llama-tokenizer-non-normalized"
-        cls.tokenizer: LlamaTokenizer = LlamaTokenizer.from_pretrained(checkpoint_name)
-        cls.rust_tokenizer = LlamaTokenizerFast.from_pretrained(checkpoint_name)
-        return cls
-
-    @require_torch
-    def integration_tests(self):
-        inputs = self.tokenizer(
-            ["The following string should be properly encoded: Hello.", "But ird and ปี   ird   ด"],
-            return_tensors="pt",
-        )
-
-        self.assertEqual(
-            nested_simplify(inputs),
-            {
-                "input_ids": [
-                    [1, 450, 1494, 1347, 881, 367, 6284, 18511, 29901, 15043, 29889],
-                    [1, 1205, 29871, 1823, 322, 29871, 31010, 30691, 1678, 1823, 1678, 30718],
-                ],
-                "attention_mask": [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
-            },
-        )
-
-    def test_fast_special_tokens(self):
-        slow_tokenizer = self.tokenizer
-        fast_tokenizer = self.rust_tokenizer
-        slow = slow_tokenizer.encode("A sample test", add_special_tokens=True)
-        assert slow == [1, 319, 4559, 1243]
-
-        fast_tokenizer.add_eos_token = False
-        fast = fast_tokenizer.encode("A sample test", add_special_tokens=True)
-        assert fast == [1, 319, 4559, 1243]
-
-        fast_tokenizer.add_eos_token = True
-        print(fast_tokenizer.add_eos_token)
-        fast = fast_tokenizer.encode("A sample test", add_special_tokens=True)
-        assert fast == [1, 319, 4559, 1243, 2]
-
-        slow_tokenizer.add_eos_token = True
-        slow = slow_tokenizer.encode("A sample test", add_special_tokens=True)
-        assert slow == [1, 319, 4559, 1243, 2]
-
-        fast_tokenizer = LlamaTokenizerFast.from_pretrained(
-            "hf-internal-testing/llama-tokenizer", add_eos_token=True, add_bos_token=False
-        )
-        fast = fast_tokenizer.encode("A sample test", add_special_tokens=True)
-        assert fast == [319, 4559, 1243, 2]
-
-        slow_tokenizer = LlamaTokenizer.from_pretrained(
-            "hf-internal-testing/llama-tokenizer", add_eos_token=True, add_bos_token=False
-        )
-        slow = slow_tokenizer.encode("A sample test", add_special_tokens=True)
-        assert slow == [319, 4559, 1243, 2]
-
-        self.tokenizer.add_eos_token = False
-        self.rust_tokenizer.add_eos_token = False
-
-    # See internal discussion: https://huggingface.slack.com/archives/C01NE71C4F7/p1750680376085749?thread_ts=1750676268.233309&cid=C01NE71C4F7
-    @unittest.skip("failing, won't fix")
-    @slow
-    def test_conversion(self):
-        # This is excruciatingly slow since it has to recreate the entire merge
-        # list from the original vocabulary in spm
-        self.rust_tokenizer.save_pretrained("./out")
-        with tempfile.TemporaryDirectory() as dirname:
-            self.rust_tokenizer.save_pretrained(dirname)
-
-            with open(os.path.join(dirname, "tokenizer.json")) as f:
-                old_serialized = f.read()
-
-        new_tokenizer = convert_slow_tokenizer(self.tokenizer)
-        with tempfile.NamedTemporaryFile() as f:
-            new_tokenizer.save(f.name)
-            # Re-opening since `f` is in bytes.
-            new_serialized = open(f.name).read()
-            with open("out_tokenizer.json", "w") as g:
-                g.write(new_serialized)
-
-            self.assertEqual(old_serialized, new_serialized)
-
-    def test_simple_encode_decode(self):
-        pyth_tokenizer = self.tokenizer
-        rust_tokenizer = self.rust_tokenizer
-
-        self.assertEqual(pyth_tokenizer.encode("This is a test"), [1, 910, 338, 263, 1243])
-        self.assertEqual(rust_tokenizer.encode("This is a test"), [1, 910, 338, 263, 1243])
-        self.assertEqual(pyth_tokenizer.decode([1, 910, 338, 263, 1243], skip_special_tokens=True), "This is a test")
-        self.assertEqual(rust_tokenizer.decode([1, 910, 338, 263, 1243], skip_special_tokens=True), "This is a test")
-
-        # bytefallback showcase
-        self.assertEqual(pyth_tokenizer.encode("生活的真谛是"), [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392])  # fmt: skip
-        self.assertEqual(rust_tokenizer.encode("生活的真谛是"), [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392])  # fmt: skip
-        self.assertEqual(
-            pyth_tokenizer.decode(
-                [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392], skip_special_tokens=True
-            ),
-            "生活的真谛是",
-        )
-        self.assertEqual(
-            rust_tokenizer.decode(
-                [1, 29871, 30486, 31704, 30210, 30848, 235, 179, 158, 30392], skip_special_tokens=True
-            ),
-            "生活的真谛是",
-        )
-
-        # Inner spaces showcase
-        self.assertEqual(pyth_tokenizer.encode("Hi  Hello"), [1, 6324, 29871, 15043])
-        self.assertEqual(rust_tokenizer.encode("Hi  Hello"), [1, 6324, 29871, 15043])
-        self.assertEqual(pyth_tokenizer.decode([1, 6324, 29871, 15043], skip_special_tokens=True), "Hi  Hello")
-        self.assertEqual(rust_tokenizer.decode([1, 6324, 29871, 15043], skip_special_tokens=True), "Hi  Hello")
-
-        self.assertEqual(pyth_tokenizer.encode("Hi   Hello"), [1, 6324, 259, 15043])
-        self.assertEqual(rust_tokenizer.encode("Hi   Hello"), [1, 6324, 259, 15043])
-        self.assertEqual(pyth_tokenizer.decode([1, 6324, 259, 15043], skip_special_tokens=True), "Hi   Hello")
-        self.assertEqual(rust_tokenizer.decode([1, 6324, 259, 15043], skip_special_tokens=True), "Hi   Hello")
-
-        self.assertEqual(pyth_tokenizer.encode(""), [1])
-        self.assertEqual(rust_tokenizer.encode(""), [1])
-
-        self.assertEqual(pyth_tokenizer.encode(" "), [1, 259])
-        self.assertEqual(rust_tokenizer.encode(" "), [1, 259])
-
-        self.assertEqual(pyth_tokenizer.encode("  "), [1, 1678])
-        self.assertEqual(rust_tokenizer.encode("  "), [1, 1678])
-
-        self.assertEqual(pyth_tokenizer.encode(" Hello"), [1, 29871, 15043])
-        self.assertEqual(rust_tokenizer.encode(" Hello"), [1, 29871, 15043])
-
-    def test_no_differences_showcase(self):
-        pyth_tokenizer = self.tokenizer
-        rust_tokenizer = self.rust_tokenizer
-        self.assertEqual(pyth_tokenizer.encode(""), [1])
-        self.assertEqual(rust_tokenizer.encode(""), [1])
-
-        self.assertEqual(pyth_tokenizer.encode(" "), [1, 259])
-        self.assertEqual(rust_tokenizer.encode(" "), [1, 259])
-
-        self.assertEqual(pyth_tokenizer.encode("  "), [1, 1678])
-        self.assertEqual(rust_tokenizer.encode("  "), [1, 1678])
-
-        self.assertEqual(pyth_tokenizer.encode(" Hello"), [1, 29871, 15043])
-        self.assertEqual(rust_tokenizer.encode(" Hello"), [1, 29871, 15043])
-
-        self.assertEqual(pyth_tokenizer.encode(""), [1, 1])
-        self.assertEqual(rust_tokenizer.encode(""), [1, 1])
-
-    def test_no_differences_decode(self):
-        pyth_tokenizer = self.tokenizer
-        rust_tokenizer = self.rust_tokenizer
-
-        self.assertEqual(pyth_tokenizer.decode([869]), ".")
-        self.assertEqual(rust_tokenizer.decode([869]), ".")
-
-        self.assertEqual(pyth_tokenizer.decode([30112, 869]), "ا .")
-        self.assertEqual(rust_tokenizer.decode([30112, 869]), "ا .")
-
-    def test_no_differences_special_tokens(self):
-        pyth_tokenizer = self.tokenizer
-        rust_tokenizer = self.rust_tokenizer
-        self.assertEqual(pyth_tokenizer.encode(""), [1])
-        self.assertEqual(rust_tokenizer.encode(""), [1])
-
-        self.assertEqual(pyth_tokenizer.encode(""), [1, 1])
-        self.assertEqual(rust_tokenizer.encode(""), [1, 1])
-
-    @unittest.skipIf(
-        os.getenv("RUN_TOKENIZER_INTEGRATION", "0") == "0",
-        "RUN_TOKENIZER_INTEGRATION=1 to run tokenizer integration tests",
-    )
-    def test_integration_test_xnli(self):
-        import tqdm
-
-        pyth_tokenizer = self.tokenizer
-        rust_tokenizer = self.rust_tokenizer
-
-        dataset = load_dataset("google/code_x_glue_ct_code_to_text", "go")
-        for item in tqdm.tqdm(dataset["validation"]):
-            string = item["code"]
-            encoded1 = pyth_tokenizer.encode(string)
-            encoded2 = rust_tokenizer.encode(string)
-
-            self.assertEqual(encoded1, encoded2)
-
-            decoded1 = pyth_tokenizer.decode(encoded1, skip_special_tokens=True)
-            decoded2 = rust_tokenizer.decode(encoded2, skip_special_tokens=True)
-
-            self.assertEqual(decoded1, decoded2)
-
-        dataset = load_dataset("facebook/xnli", "all_languages")
-
-        for item in tqdm.tqdm(dataset["train"]):
-            for string in item["premise"].values():
-                encoded1 = pyth_tokenizer.encode(string)
-                encoded2 = rust_tokenizer.encode(string)
-
-                self.assertEqual(encoded1, encoded2)
-
-                decoded1 = pyth_tokenizer.decode(encoded1, skip_special_tokens=True)
-                decoded2 = rust_tokenizer.decode(encoded2, skip_special_tokens=True)
-
-                self.assertEqual(decoded1, decoded2)
-
-    def test_special_token_special_word(self):
-        # the word inform should be split as ['in', 'form']
-        tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", legacy=False, from_slow=True)
-        tokenizer.add_tokens([AddedToken("", rstrip=True, lstrip=True)], special_tokens=False)
-
-        example_inputs = tokenizer.tokenize("inform. Hey.       .")
-        self.assertEqual(example_inputs, ["", "in", "form", "", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."])
-
-        # Make sure dummy space is added if it is indeed the first word
-        example_inputs = tokenizer.tokenize("inform. Hey.       .")
-        self.assertEqual(example_inputs, ["▁inform", "", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."])
-        out1 = tokenizer.decode(
-            tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=False
-        )
-        self.assertEqual(out1, "inform")
-        out2 = tokenizer.decode(
-            tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=True
-        )
-        # decoding strips the added prefix space.
-        self.assertEqual(out2, "inform")
-        input_ids = tokenizer.encode("inform", add_special_tokens=False)
-        self.assertEqual(input_ids, [32000, 262, 689])  # 29871 is the spiece underline, '▁' added as it should
-
-        out2 = tokenizer.decode(
-            tokenizer.encode(" inform", add_special_tokens=False), spaces_between_special_tokens=False
-        )
-        # TODO @ArthurZ currently we strip left and right, so this will not keep the spaces
-        self.assertEqual(out2, "inform")
-
-        ### Let's make sure decoding does not add extra spaces here and there
-        # TODO @ArthurZ this should be affected by the lstrip/rstrip/single word /normalize refactoring
-        # Since currently we always strip left and right of the token, results are as such
-        input_ids = tokenizer.encode(" Hellohow", add_special_tokens=False)
-        self.assertEqual(input_ids, [1, 15043, 1, 3525])
-        tokens = tokenizer.tokenize(" Hellohow", add_special_tokens=False)
-        self.assertEqual(tokens, ["", "▁Hello", "", "how"])
-        decoded_tokens = tokenizer.decode(input_ids)
-        self.assertEqual(decoded_tokens, " Hellohow")
-
-        # Let's make sure that if there are any spaces, we don't remove them!
-        input_ids = tokenizer.encode("  Hello how", add_special_tokens=False)
-        self.assertEqual(input_ids, [29871, 1, 15043, 1, 920])
-        tokens = tokenizer.tokenize("  Hello how", add_special_tokens=False)
-        self.assertEqual(tokens, ["▁", "", "▁Hello", "", "▁how"])
-        decoded_tokens = tokenizer.decode(input_ids)
-        self.assertEqual(decoded_tokens, " Hello how")
-
-        # Let's make sure the space is preserved
-        input_ids = tokenizer.encode("hello", add_special_tokens=True)
-        self.assertEqual(input_ids, [1, 22172])
-        tokens = tokenizer.tokenize("hello")
-        self.assertEqual(tokens, ["▁hello"])
-        decoded_tokens = tokenizer.decode(input_ids)
-        self.assertEqual(decoded_tokens, " hello")
-
-        input_ids = tokenizer.encode("hello", add_special_tokens=False)
-        self.assertEqual(input_ids, [22172])
-        decoded_tokens = tokenizer.decode(input_ids)
-        self.assertEqual(decoded_tokens, "hello")
-
-    def test_no_prefix_space(self):
-        tokenizer_no_prefix_space = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", add_prefix_space=False)
-        no_prefix_space_tokens = tokenizer_no_prefix_space.tokenize("Hey")
-        self.assertEqual(no_prefix_space_tokens, ["H", "ey"])
-
-        tokenizer = LlamaTokenizerFast.from_pretrained(
-            "huggyllama/llama-7b", legacy=False, from_slow=True, add_prefix_space=False
-        )
-        tokenizer.add_tokens([AddedToken("", rstrip=True, lstrip=True)], special_tokens=False)
-
-        example_inputs = tokenizer.tokenize("inform. Hey.       .")
-        self.assertEqual(example_inputs, ["", "in", "form", "", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."])
-
-        # Make sure dummy space is added if it is indeed the first word
-        example_inputs = tokenizer.tokenize("inform. Hey.       .")
-        self.assertEqual(example_inputs, ["in", "form", "", ".", "▁Hey", ".", "▁▁▁▁▁▁", "▁."])
-        out1 = tokenizer.decode(
-            tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=False
-        )
-        self.assertEqual(out1, "inform")
-        out2 = tokenizer.decode(
-            tokenizer.encode("inform", add_special_tokens=False), spaces_between_special_tokens=True
-        )
-        # decoding strips the added prefix space.
-        self.assertEqual(out2, "inform")
-        input_ids = tokenizer.encode("inform", add_special_tokens=False)
-        self.assertEqual(input_ids, [32000, 262, 689])  # 29871 is the spiece underline, '▁' added as it should
-
-        out2 = tokenizer.decode(
-            tokenizer.encode(" inform", add_special_tokens=False), spaces_between_special_tokens=False
-        )
-        self.assertEqual(out2, "inform")
-
-        input_ids = tokenizer.encode(" Hellohow", add_special_tokens=False)
-        self.assertEqual(input_ids, [1, 15043, 1, 3525])
-        tokens = tokenizer.tokenize(" Hellohow", add_special_tokens=False)
-        self.assertEqual(tokens, ["", "▁Hello", "", "how"])
-        decoded_tokens = tokenizer.decode(input_ids)
-        self.assertEqual(decoded_tokens, " Hellohow")
-
-        # Let's make sure that if there are any spaces, we don't remove them!
-        input_ids = tokenizer.encode("  Hello how", add_special_tokens=False)
-        self.assertEqual(input_ids, [29871, 1, 15043, 1, 920])
-        tokens = tokenizer.tokenize("  Hello how", add_special_tokens=False)
-        self.assertEqual(tokens, ["▁", "", "▁Hello", "", "▁how"])
-        decoded_tokens = tokenizer.decode(input_ids)
-        self.assertEqual(decoded_tokens, "  Hello how")
-
-        # Let's make sure the space is preserved
-        input_ids = tokenizer.encode("hello", add_special_tokens=True)
-        self.assertEqual(input_ids, [1, 12199])
-        tokens = tokenizer.tokenize("hello")
-        self.assertEqual(tokens, ["hello"])
-        decoded_tokens = tokenizer.decode(input_ids)
-        self.assertEqual(decoded_tokens, "hello")
-
-        input_ids = tokenizer.encode("hello", add_special_tokens=False)
-        self.assertEqual(input_ids, [12199])
-        decoded_tokens = tokenizer.decode(input_ids)
-        self.assertEqual(decoded_tokens, "hello")
-
-    def test_some_edge_cases(self):
-        tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False)
-
-        sp_tokens = tokenizer.sp_model.encode(">", out_type=str)
-        self.assertEqual(sp_tokens, ["<", "s", ">>"])
-        tokens = tokenizer.tokenize(">")
-        self.assertNotEqual(sp_tokens, tokens)
-        self.assertEqual(tokens, ["", ">"])
-
-        tokens = tokenizer.tokenize("")
-        self.assertEqual(tokens, [])
-        self.assertEqual(tokens, tokenizer.sp_model.encode("", out_type=str))
-
-        tokens = tokenizer.tokenize(" ")
-        self.assertEqual(tokens, ["▁▁"])
-        # a dummy prefix space is not added by the sp_model as it was de-activated
-        self.assertEqual(tokens, tokenizer.sp_model.encode("  ", out_type=str))
-
-        tokens = tokenizer.tokenize("▁")
-        self.assertEqual(tokens, ["▁▁"])
-        # a dummy prefix space is not added by the sp_model as it was de-activated
-        self.assertEqual(tokens, tokenizer.sp_model.encode("▁▁", out_type=str))
-
-        tokens = tokenizer.tokenize(" ▁")
-        self.assertEqual(tokens, ["▁▁▁"])
-        # a dummy prefix space is not added by the sp_model as it was de-activated
-        self.assertEqual(tokens, tokenizer.sp_model.encode("▁▁▁", out_type=str))
-
-    def test_fast_post_processor(self):
-        tokenizer = LlamaTokenizerFast(
-            SAMPLE_VOCAB, eos_token=None, bos_token=None, add_bos_token=False, add_eos_token=False
-        )
-        tokenizer.encode(" Hey ")
-
-        with self.assertRaises(ValueError):
-            tokenizer = LlamaTokenizerFast(
-                SAMPLE_VOCAB, bos_token=None, eos_token="", add_bos_token=True, add_eos_token=False
-            )
-        with self.assertRaises(ValueError):
-            tokenizer = LlamaTokenizerFast(SAMPLE_VOCAB, eos_token=None, add_bos_token=True, add_eos_token=True)
-
-
-@require_sentencepiece
-@require_tokenizers
-class CommonSpmIntegrationTests(unittest.TestCase):
-    """
-    A class that regroups important test to make sure that we properly handle the special tokens.
-    """
-
-    @classmethod
-    def setUpClass(cls):
-        tokenizer = LlamaTokenizer(SAMPLE_VOCAB, extra_ids=0, add_bos_token=False, legacy=False)
-        tokenizer.add_special_tokens({"additional_special_tokens": [AddedToken("", rstrip=False, lstrip=False)]})
-        cls.tokenizer = tokenizer
-        return cls
-
-    def test_add_dummy_prefix(self):
-        # make sure `'▁'` is prepended, and outputs match sp_model's
-        # `sentencepiece.NormalizerSpec.add_dummy_prefix` attribute
-        input_ids = self.tokenizer.encode(". Hello")
-        self.assertEqual(input_ids, [7, 4, 156, 86, 20])
-        sp_encode = self.tokenizer.sp_model.encode(". Hello")
-        self.assertEqual(input_ids, [7] + sp_encode)
-        tokens = self.tokenizer.tokenize(". Hello")
-        self.assertEqual(tokens, ["▁", ".", "▁He", "ll", "o"])
-
-        tokens = self.tokenizer.tokenize("")
-        self.assertEqual(tokens, [])
-        self.assertEqual(tokens, self.tokenizer.sp_model.encode("", out_type=str))
-
-        tokens = self.tokenizer.tokenize(" ")
-        self.assertEqual(tokens, [])
-        self.assertEqual(tokens, self.tokenizer.sp_model.encode(" ", out_type=str))
-
-        tokens = self.tokenizer.tokenize("▁")
-        self.assertEqual(tokens, [])
-        self.assertEqual(tokens, self.tokenizer.sp_model.encode("▁", out_type=str))
-
-    def test_remove_extra_whitespaces(self):
-        # make sure the extra spaces are eaten. Since the sample vocab does not have
-        # `______`. sentencepiece.NormalizerSpec.remove_extra_whitespaces attribute is set to False
-
-        input_ids = self.tokenizer.encode("       . Hello")
-        self.assertEqual(input_ids, [7, 4, 156, 86, 20])
-        sp_encode = self.tokenizer.sp_model.encode("       . Hello")
-        self.assertEqual(input_ids, [7] + sp_encode)
-        tokens = self.tokenizer.tokenize(" . Hello")
-        self.assertEqual(tokens, ["▁", ".", "▁He", "ll", "o"])
-
-        # `'▁'` is also a whitespace
-        input_ids = self.tokenizer.encode("▁He is not")
-        self.assertEqual(input_ids, [156, 46, 44])
-        tokens = self.tokenizer.tokenize("▁He is not")
-        sp_encode = [
-            self.tokenizer.sp_model.piece_to_id("▁He"),
-            self.tokenizer.sp_model.piece_to_id("▁is"),
-            self.tokenizer.sp_model.piece_to_id("▁not"),
-        ]
-        self.assertEqual(input_ids, sp_encode)
-        self.assertEqual(tokens, ["▁He", "▁is", "▁not"])  # no extra space added
-
-        input_ids = self.tokenizer.encode("▁He is not             ▁He")
-        self.assertEqual(input_ids, [156, 46, 44, 1, 156])
-        tokens = self.tokenizer.tokenize("▁He is not              ▁He")
-        self.assertEqual(tokens, ["▁He", "▁is", "▁not", "", "▁He"])  # spaces are eaten by spm + our strip
-        # make sure that the output after the extra id is the same as if
-        # extra_id was not there
-        input_ids = self.tokenizer.encode("▁He is not             ▁He")
-        self.assertEqual(input_ids, [156, 46, 44, 156])
-        tokens = self.tokenizer.tokenize("▁He is not              ▁He")
-        self.assertEqual(tokens, ["▁He", "▁is", "▁not", "▁He"])  # spaces are eaten by spm even if not start
-
-    def test_character_after_special_token(self):
-        # Make sure that `tokenizer.tokenize` is similar to
-        # adding the equivalent special token to the vocab
-        input_ids = self.tokenizer.encode("Hey I")
-        self.assertEqual(input_ids, [156, 30, 1, 100])
-        sp_encode = self.tokenizer.sp_model.encode("Hey .I")
-        # the last token should be 100
-        self.assertEqual(input_ids[-1], sp_encode[-1])
-        tokens = self.tokenizer.tokenize("I")
-        self.assertEqual(tokens, ["", "I"])
-
-        input_ids = self.tokenizer.encode("Hello, ,")
-        self.assertEqual(input_ids, [156, 86, 20, 3, 1, 3])
-        tokens = self.tokenizer.tokenize("Hello, ,")
-        self.assertEqual(tokens, ["▁He", "ll", "o", ",", "", ","])
-
-    def test_special_tokens_strip(self):
-        input_ids = self.tokenizer.encode("  ,")
-        self.assertEqual(input_ids, [1, 7, 3])
-        tokens = self.tokenizer.tokenize("  ,")
-        # spaces are eaten by rstrip / lstrip + spm sp_model.encode("  ") = []
-        self.assertEqual(tokens, ["", "▁", ","])
-
-        input_ids = self.tokenizer.encode("No  ▁He")
-        self.assertEqual(input_ids, [284, 1, 156])
-        tokens = self.tokenizer.tokenize("No  ▁He")
-        self.assertEqual(tokens, ["▁No", "", "▁He"])  # spaces are eaten by rstrip / lstrip
-
-
-@require_tiktoken
-@require_read_token
-class TikTokenIntegrationTests(unittest.TestCase):
-    """
-    A class that regroups important test to make sure that we properly handle the special tokens.
-    """
-
-    def test_tiktoken_llama(self):
-        model_path = "hf-internal-testing/llama-3-8b-internal"
-        subfolder = "original"
-        test_text = "This is a test sentence."
-        test_tokens = [128000, 2028, 374, 264, 1296, 11914, 13, 128001]
-        num_reserved_special_tokens = 256
-        special_tokens = [
-            "<|begin_of_text|>",
-            "<|end_of_text|>",
-            "<|reserved_special_token_0|>",
-            "<|reserved_special_token_1|>",
-            "<|reserved_special_token_2|>",
-            "<|reserved_special_token_3|>",
-            "<|start_header_id|>",
-            "<|end_header_id|>",
-            "<|reserved_special_token_4|>",
-            "<|eot_id|>",
-            "<|python_tag|>",  # end of turn
-        ] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)]
-
-        tiktoken_tokenizer = PreTrainedTokenizerFast.from_pretrained(
-            model_path,
-            subfolder=subfolder,
-            additional_special_tokens=special_tokens,
-            bos_token="<|begin_of_text|>",
-            eos_token="<|end_of_text|>",
-        )
-        tokens = tiktoken_tokenizer.tokenize("<|begin_of_text|> " + test_text)
-        self.assertEqual(tokens[0], "<|begin_of_text|>")
-
-        tiktoken_tokenizer = AutoTokenizer.from_pretrained(
-            model_path,
-            subfolder=subfolder,
-            legacy=False,
-            additional_special_tokens=special_tokens,
-            bos_token="<|begin_of_text|>",
-            eos_token="<|end_of_text|>",
-            add_bos_token=True,
-            add_eos_token=True,
-        )
-        self.assertTrue(isinstance(tiktoken_tokenizer, PreTrainedTokenizerFast))
-
-        tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True)
-        self.assertEqual(tokens, test_tokens)
-
-        tmpdirname = tempfile.mkdtemp()
-        tiktoken_tokenizer.save_pretrained(tmpdirname)
-        tokenizer_reload = AutoTokenizer.from_pretrained(tmpdirname)
-
-        self.assertTrue(isinstance(tokenizer_reload, PreTrainedTokenizerFast))
-        tokens = tokenizer_reload.encode(test_text, add_special_tokens=True)
-        self.assertEqual(tokens, test_tokens)
-        shutil.rmtree(tmpdirname)
-
-        tiktoken_tokenizer = AutoTokenizer.from_pretrained(
-            model_path,
-            subfolder=subfolder,
-            additional_special_tokens=special_tokens,
-            bos_token="<|begin_of_text|>",
-            eos_token="<|end_of_text|>",
-            from_slow=True,
-            add_bos_token=True,
-            add_eos_token=True,
-        )
-        tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True)
-        self.assertEqual(tokens, test_tokens)
diff --git a/tests/models/llava_onevision/test_processing_llava_onevision.py b/tests/models/llava_onevision/test_processing_llava_onevision.py
index 57523170944a..b3dcf04685c9 100644
--- a/tests/models/llava_onevision/test_processing_llava_onevision.py
+++ b/tests/models/llava_onevision/test_processing_llava_onevision.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import json
+import os
 import unittest
 
 import torch
@@ -34,12 +35,60 @@
 class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     processor_class = LlavaOnevisionProcessor
 
+    @classmethod
+    def setUpClass(cls):
+        # Ensure local assets are used instead of remote URLs to avoid network access in tests
+        from tests.test_processing_common import MODALITY_INPUT_DATA
+        from transformers import video_processing_utils, video_utils
+
+        repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+        local_image = os.path.join(repo_root, "coco_sample.png")
+        if not os.path.isfile(local_image):
+            import numpy as np
+            from PIL import Image
+
+            Image.fromarray((np.random.rand(64, 64, 3) * 255).astype("uint8")).save(local_image)
+
+        local_tiny_video = os.path.join(repo_root, "tiny_video.mp4")
+        if not os.path.isfile(local_tiny_video):
+            try:
+                import torchvision
+
+                frames = (torch.rand(8, 64, 64, 3) * 255).byte()
+                torchvision.io.write_video(local_tiny_video, frames, fps=4)
+            except Exception:
+                local_tiny_video = None
+
+        local_videos = [
+            os.path.join(repo_root, "Big_Buck_Bunny_720_10s_10MB.mp4"),
+            os.path.join(repo_root, "sample_demo_1.mp4"),
+        ]
+        cls.local_tiny_video = local_tiny_video
+        MODALITY_INPUT_DATA["images"] = [local_image, local_image]
+        MODALITY_INPUT_DATA["videos"] = local_videos
+
+        # Force video decoding to use torchvision backend to avoid torchcodec dependency during tests
+        video_processing_utils.is_torchcodec_available = lambda: False  # type: ignore
+        video_utils.is_torchcodec_available = lambda: False  # type: ignore
+        super().setUpClass()
+
     @classmethod
     def _setup_tokenizer(cls):
         tokenizer_class = cls._get_component_class_from_processor("tokenizer")
-        print("tokenizer_class", tokenizer_class)
-        tokenizer = tokenizer_class.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
+        vocab_tokens = [
+            ("", 0.0),
+            ("", 0.0),
+            ("", 0.0),
+            ("[PAD]", 0.0),
+            ("", 0.0),
+            ("