
Commit b8d7aa6

cyyevergante authored and committed
Fix Optional type annotation (huggingface#36841)
* Fix annotation
* Update src/transformers/generation/candidate_generator.py
  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Update src/transformers/generation/utils.py
  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
* Update src/transformers/generation/utils.py
  Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
---------
Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
1 parent 15b8768 commit b8d7aa6

21 files changed (+65, -57 lines)
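Every change in this commit follows the same pattern: a parameter or field whose default is `None` but whose annotation claims a non-Optional type (e.g. `int = None`) is rewritten as `Optional[...] = None`, so the annotation matches the actual contract. PEP 484 no longer endorses treating a bare `None` default as implicitly Optional, and strict type checkers (for example mypy with implicit Optional disabled, its default in recent releases) flag the old form. A minimal before/after sketch, using a hypothetical helper that is not part of the diff:

from typing import Optional

# Before (implicit Optional): the annotation says `int`, but the default is None,
# so the declared type and the actual default disagree.
#     def truncate(text: str, max_len: int = None) -> str: ...

# After: Optional[int] states the "may be None" contract explicitly.
def truncate(text: str, max_len: Optional[int] = None) -> str:
    # `truncate` is a hypothetical example, not code from this commit.
    if max_len is None:
        return text
    return text[:max_len]

print(truncate("hello world"))     # hello world
print(truncate("hello world", 5))  # hello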

src/transformers/agents/agents.py

Lines changed: 2 additions & 2 deletions
@@ -217,7 +217,7 @@ def tools(self) -> Dict[str, Tool]:
        """Get all tools currently in the toolbox"""
        return self._tools

-    def show_tool_descriptions(self, tool_description_template: str = None) -> str:
+    def show_tool_descriptions(self, tool_description_template: Optional[str] = None) -> str:
        """
        Returns the description of all tools in the toolbox

@@ -891,7 +891,7 @@ def direct_run(self, task: str):

        return final_answer

-    def planning_step(self, task, is_first_step: bool = False, iteration: int = None):
+    def planning_step(self, task, is_first_step: bool = False, iteration: Optional[int] = None):
        """
        Used periodically by the agent to plan the next steps to reach the objective.

src/transformers/audio_utils.py

Lines changed: 1 addition & 1 deletion
@@ -1125,7 +1125,7 @@ def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int =
    return frames


-def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = None):
+def stft(frames: np.array, windowing_function: np.array, fft_window_size: Optional[int] = None):
    """
    Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same results
    as `torch.stft`.

src/transformers/cache_utils.py

Lines changed: 7 additions & 7 deletions
@@ -1183,8 +1183,8 @@ class StaticCache(Cache):
    def __init__(
        self,
        config: PretrainedConfig,
-        batch_size: int = None,
-        max_cache_len: int = None,
+        batch_size: Optional[int] = None,
+        max_cache_len: Optional[int] = None,
        device: torch.device = None,
        dtype: torch.dtype = torch.float32,
        max_batch_size: Optional[int] = None,

@@ -1367,8 +1367,8 @@ class SlidingWindowCache(StaticCache):
    def __init__(
        self,
        config: PretrainedConfig,
-        batch_size: int = None,
-        max_cache_len: int = None,
+        batch_size: Optional[int] = None,
+        max_cache_len: Optional[int] = None,
        device: torch.device = None,
        dtype: torch.dtype = torch.float32,
        max_batch_size: Optional[int] = None,

@@ -1674,8 +1674,8 @@ class HybridCache(Cache):
    def __init__(
        self,
        config: PretrainedConfig,
-        batch_size: int = None,
-        max_cache_len: int = None,
+        batch_size: Optional[int] = None,
+        max_cache_len: Optional[int] = None,
        device: Union[torch.device, str] = None,
        dtype: torch.dtype = torch.float32,
        max_batch_size: Optional[int] = None,

@@ -1877,7 +1877,7 @@ class MambaCache:
    def __init__(
        self,
        config: PretrainedConfig,
-        batch_size: int = None,
+        batch_size: Optional[int] = None,
        dtype: torch.dtype = torch.float16,
        device: Optional[Union[torch.device, str]] = None,
        max_batch_size: Optional[int] = None,

src/transformers/data/processors/squad.py

Lines changed: 3 additions & 2 deletions
@@ -16,6 +16,7 @@
import os
from functools import partial
from multiprocessing import Pool, cpu_count
+from typing import Optional

import numpy as np
from tqdm import tqdm

@@ -800,8 +801,8 @@ def __init__(
        start_position,
        end_position,
        is_impossible,
-        qas_id: str = None,
-        encoding: BatchEncoding = None,
+        qas_id: Optional[str] = None,
+        encoding: Optional[BatchEncoding] = None,
    ):
        self.input_ids = input_ids
        self.attention_mask = attention_mask

src/transformers/generation/candidate_generator.py

Lines changed: 2 additions & 2 deletions
@@ -914,9 +914,9 @@ class PromptLookupCandidateGenerator(CandidateGenerator):

    def __init__(
        self,
-        eos_token_id: torch.Tensor = None,
+        eos_token_id: Optional[torch.Tensor] = None,
        num_output_tokens: int = 10,
-        max_matching_ngram_size: int = None,
+        max_matching_ngram_size: Optional[int] = None,
        max_length: int = 20,
    ):
        self.num_output_tokens = num_output_tokens

src/transformers/generation/flax_utils.py

Lines changed: 5 additions & 3 deletions
@@ -171,8 +171,8 @@ def _prepare_encoder_decoder_kwargs_for_generation(self, input_ids, params, mode
    def _prepare_decoder_input_ids_for_generation(
        self,
        batch_size: int,
-        decoder_start_token_id: int = None,
-        bos_token_id: int = None,
+        decoder_start_token_id: Optional[int] = None,
+        bos_token_id: Optional[int] = None,
        model_kwargs: Optional[Dict[str, jnp.ndarray]] = None,
    ) -> jnp.ndarray:
        if model_kwargs is not None and "decoder_input_ids" in model_kwargs:

@@ -183,7 +183,9 @@ def _prepare_decoder_input_ids_for_generation(
        decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id)
        return jnp.array(decoder_start_token_id, dtype="i4").reshape(1, -1).repeat(batch_size, axis=0)

-    def _get_decoder_start_token_id(self, decoder_start_token_id: int = None, bos_token_id: int = None) -> int:
+    def _get_decoder_start_token_id(
+        self, decoder_start_token_id: Optional[int] = None, bos_token_id: Optional[int] = None
+    ) -> int:
        # retrieve decoder_start_token_id for encoder-decoder models
        # fall back to bos_token_id if necessary
        decoder_start_token_id = (

src/transformers/generation/tf_utils.py

Lines changed: 5 additions & 3 deletions
@@ -1077,8 +1077,8 @@ def _prepare_decoder_input_ids_for_generation(
        batch_size: int,
        model_input_name: str,
        model_kwargs: Dict[str, tf.Tensor],
-        decoder_start_token_id: int = None,
-        bos_token_id: int = None,
+        decoder_start_token_id: Optional[int] = None,
+        bos_token_id: Optional[int] = None,
    ) -> Tuple[tf.Tensor, Dict[str, tf.Tensor]]:
        """Prepares `decoder_input_ids` for generation with encoder-decoder models"""
        # 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming,

@@ -1111,7 +1111,9 @@ def _prepare_decoder_input_ids_for_generation(

        return decoder_input_ids, model_kwargs

-    def _get_decoder_start_token_id(self, decoder_start_token_id: int = None, bos_token_id: int = None) -> int:
+    def _get_decoder_start_token_id(
+        self, decoder_start_token_id: Optional[int] = None, bos_token_id: Optional[int] = None
+    ) -> int:
        # retrieve decoder_start_token_id for encoder-decoder models
        # fall back to bos_token_id if necessary
        decoder_start_token_id = (

src/transformers/generation/utils.py

Lines changed: 7 additions & 7 deletions
@@ -157,7 +157,7 @@ class GenerateDecoderOnlyOutput(ModelOutput):
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
    """

-    sequences: torch.LongTensor = None
+    sequences: torch.LongTensor
    scores: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None

@@ -202,7 +202,7 @@ class GenerateEncoderDecoderOutput(ModelOutput):
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
    """

-    sequences: torch.LongTensor = None
+    sequences: torch.LongTensor
    scores: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None

@@ -247,7 +247,7 @@ class GenerateBeamDecoderOnlyOutput(ModelOutput):
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
    """

-    sequences: torch.LongTensor = None
+    sequences: torch.LongTensor
    sequences_scores: Optional[torch.FloatTensor] = None
    scores: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None

@@ -301,7 +301,7 @@ class GenerateBeamEncoderDecoderOutput(ModelOutput):
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
    """

-    sequences: torch.LongTensor = None
+    sequences: torch.LongTensor
    sequences_scores: Optional[torch.FloatTensor] = None
    scores: Optional[Tuple[torch.FloatTensor]] = None
    logits: Optional[Tuple[torch.FloatTensor]] = None

@@ -699,7 +699,7 @@ def _prepare_decoder_input_ids_for_generation(
        model_input_name: str,
        model_kwargs: Dict[str, torch.Tensor],
        decoder_start_token_id: torch.Tensor,
-        device: torch.device = None,
+        device: Optional[torch.device] = None,
    ) -> Tuple[torch.LongTensor, Dict[str, torch.Tensor]]:
        """Prepares `decoder_input_ids` for generation with encoder-decoder models"""
        # 1. Check whether the user has defined `decoder_input_ids` manually. To facilitate in terms of input naming,

@@ -923,7 +923,7 @@ def _get_logits_processor(
        encoder_input_ids: torch.LongTensor,
        prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]],
        logits_processor: Optional[LogitsProcessorList],
-        device: str = None,
+        device: Optional[str] = None,
        model_kwargs: Optional[Dict[str, Any]] = None,
        negative_prompt_ids: Optional[torch.Tensor] = None,
        negative_prompt_attention_mask: Optional[torch.Tensor] = None,

@@ -4833,7 +4833,7 @@ def _ranking_fast(
    return selected_idx


-def _split(data, full_batch_size: int, split_size: int = None):
+def _split(data, full_batch_size: int, split_size: int):
    """
    Takes care of three cases:
    1. data is a tensor: e.g. last_hidden_state, pooler_output etc. split them on the batch_size dim
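Two changes in this file differ from the plain Optional rewrites: the `sequences` field of each `Generate*Output` class drops its `= None` default, so the bare `torch.LongTensor` annotation now reflects a field that is always provided, and `_split` takes `split_size` as a required argument instead of defaulting it to `None`. A minimal sketch of the required-vs-optional dataclass field distinction, with hypothetical field types standing in for the real ones:

from dataclasses import dataclass
from typing import List, Optional

@dataclass
class FakeGenerateOutput:
    # Required field: no default, so callers must always supply it,
    # and the annotation does not need Optional.
    sequences: List[List[int]]
    # Genuinely optional fields keep Optional[...] with a None default.
    scores: Optional[List[float]] = None

out = FakeGenerateOutput(sequences=[[1, 2, 3]])
print(out.sequences, out.scores)  # [[1, 2, 3]] None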

src/transformers/generation/watermarking.py

Lines changed: 1 addition & 1 deletion
@@ -257,7 +257,7 @@ class BayesianDetectorConfig(PretrainedConfig):
            Prior probability P(w) that a text is watermarked.
    """

-    def __init__(self, watermarking_depth: int = None, base_rate: float = 0.5, **kwargs):
+    def __init__(self, watermarking_depth: Optional[int] = None, base_rate: float = 0.5, **kwargs):
        self.watermarking_depth = watermarking_depth
        self.base_rate = base_rate
        # These can be set later to store information about this detector.

src/transformers/hf_argparser.py

Lines changed: 3 additions & 3 deletions
@@ -63,11 +63,11 @@ def make_choice_type_function(choices: list) -> Callable[[str], Any]:

def HfArg(
    *,
-    aliases: Union[str, list[str]] = None,
-    help: str = None,
+    aliases: Optional[Union[str, list[str]]] = None,
+    help: Optional[str] = None,
    default: Any = dataclasses.MISSING,
    default_factory: Callable[[], Any] = dataclasses.MISSING,
-    metadata: dict = None,
+    metadata: Optional[dict] = None,
    **kwargs,
) -> dataclasses.Field:
    """Argument helper enabling a concise syntax to create dataclass fields for parsing with `HfArgumentParser`.
