From 2f968ded4c0beee8c8d4b151d0e16a7545732ae8 Mon Sep 17 00:00:00 2001
From: August Moharrami
Date: Sun, 7 Sep 2025 12:22:47 +0000
Subject: [PATCH 1/2] removed warning

---
 src/transformers/models/gemma3/modeling_gemma3.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/transformers/models/gemma3/modeling_gemma3.py b/src/transformers/models/gemma3/modeling_gemma3.py
index a7a805ebff50..f8cbfcf1de77 100644
--- a/src/transformers/models/gemma3/modeling_gemma3.py
+++ b/src/transformers/models/gemma3/modeling_gemma3.py
@@ -654,11 +654,6 @@ def forward(
         "What is your favorite condiment?"
         ```"""
 
-        if self.training and self.config._attn_implementation != "eager":
-            logger.warning_once(
-                "It is strongly recommended to train Gemma3 models with the `eager` attention implementation "
-                f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
-            )
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states

From 7d7169eb2a49e6ea196b8c0af64c0a0dd9295009 Mon Sep 17 00:00:00 2001
From: August Moharrami
Date: Mon, 8 Sep 2025 12:23:20 +0000
Subject: [PATCH 2/2] removed remaining warnings

---
 src/transformers/models/gemma2/modeling_gemma2.py   |  5 -----
 src/transformers/models/gemma2/modular_gemma2.py    |  5 -----
 src/transformers/models/gemma3n/modeling_gemma3n.py |  5 -----
 src/transformers/models/t5gemma/modeling_t5gemma.py | 11 +----------
 src/transformers/models/t5gemma/modular_t5gemma.py  | 10 ----------
 5 files changed, 1 insertion(+), 35 deletions(-)

diff --git a/src/transformers/models/gemma2/modeling_gemma2.py b/src/transformers/models/gemma2/modeling_gemma2.py
index d4427a08e23e..3d088cfc52cf 100644
--- a/src/transformers/models/gemma2/modeling_gemma2.py
+++ b/src/transformers/models/gemma2/modeling_gemma2.py
@@ -534,11 +534,6 @@ def forward(
         "What is your favorite condiment?"
         ```"""
 
-        if self.training and self.config._attn_implementation != "eager":
-            logger.warning_once(
-                "It is strongly recommended to train Gemma2 models with the `eager` attention implementation "
-                f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
-            )
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states

diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py
index 7f101ff1ec0a..2a3e05e4754e 100644
--- a/src/transformers/models/gemma2/modular_gemma2.py
+++ b/src/transformers/models/gemma2/modular_gemma2.py
@@ -520,11 +520,6 @@ def forward(
         "What is your favorite condiment?"
         ```"""
 
-        if self.training and self.config._attn_implementation != "eager":
-            logger.warning_once(
-                "It is strongly recommended to train Gemma2 models with the `eager` attention implementation "
-                f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
-            )
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states

diff --git a/src/transformers/models/gemma3n/modeling_gemma3n.py b/src/transformers/models/gemma3n/modeling_gemma3n.py
index 737d288f82b5..198ed7c0fcce 100644
--- a/src/transformers/models/gemma3n/modeling_gemma3n.py
+++ b/src/transformers/models/gemma3n/modeling_gemma3n.py
@@ -1808,11 +1808,6 @@ def forward(
         "What is your favorite condiment?"
         ```"""
 
-        if self.training and self.config._attn_implementation != "eager":
-            logger.warning_once(
-                "It is strongly recommended to train Gemma3n models with the `eager` attention implementation "
-                f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
-            )
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states

diff --git a/src/transformers/models/t5gemma/modeling_t5gemma.py b/src/transformers/models/t5gemma/modeling_t5gemma.py
index 790431caedee..4628614ba363 100644
--- a/src/transformers/models/t5gemma/modeling_t5gemma.py
+++ b/src/transformers/models/t5gemma/modeling_t5gemma.py
@@ -41,7 +41,7 @@
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torchdynamo_compiling, logging
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
 from ...utils.deprecation import deprecate_kwarg
 from ...utils.generic import OutputRecorder, check_model_inputs
 from .configuration_t5gemma import T5GemmaConfig, T5GemmaModuleConfig
@@ -1064,15 +1064,6 @@ def forward(
             config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
             ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
         """
-        if self.training and self.config._attn_implementation != "eager":
-            msg = (
-                "It is strongly recommended to train T5Gemma models with the `eager` attention implementation "
-                f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
-            )
-            if is_torchdynamo_compiling():
-                raise ValueError(msg)
-            else:
-                logger.warning_once(msg)
 
         if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
             # get decoder inputs from shifting lm labels to the right

diff --git a/src/transformers/models/t5gemma/modular_t5gemma.py b/src/transformers/models/t5gemma/modular_t5gemma.py
index 930c353f05a0..924ddaa6871d 100644
--- a/src/transformers/models/t5gemma/modular_t5gemma.py
+++ b/src/transformers/models/t5gemma/modular_t5gemma.py
@@ -37,7 +37,6 @@
     TransformersKwargs,
     auto_docstring,
     can_return_tuple,
-    is_torchdynamo_compiling,
     logging,
 )
 from ...utils.deprecation import deprecate_kwarg
@@ -921,15 +920,6 @@ def forward(
             config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
             ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
""" - if self.training and self.config._attn_implementation != "eager": - msg = ( - "It is strongly recommended to train T5Gemma models with the `eager` attention implementation " - f"instead of `{self.config._attn_implementation}`. Use `eager` with `AutoModelForCausalLM.from_pretrained('', attn_implementation='eager')`." - ) - if is_torchdynamo_compiling(): - raise ValueError(msg) - else: - logger.warning_once(msg) if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: # get decoder inputs from shifting lm labels to the right