huggingface · ArthurZucker · May 20, 2024 · Apr 29, 2024 · May 3, 2024 · May 6, 2024
diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py
@@ -592,6 +592,11 @@ def forward(
 
  query_states = self._shape(query_states, tgt_len, bsz)
 
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+ # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
+ is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False
+
  # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
  # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
  attn_output = torch.nn.functional.scaled_dot_product_attention(
@@ -600,8 +605,7 @@ def forward(
  value_states,
  attn_mask=attention_mask,
  dropout_p=self.dropout if self.training else 0.0,
- # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
- is_causal=self.is_causal and attention_mask is None and tgt_len > 1,
+ is_causal=is_causal,
  )
 
  if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):

diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py
@@ -428,9 +428,11 @@ def forward(
  key_layer = key_layer.contiguous()
  value_layer = value_layer.contiguous()
 
- # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal
- # mask in case tgt_len == 1.
- is_causal = self.is_decoder and attention_mask is None and tgt_len > 1
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+ # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create
+ # a causal mask in case tgt_len == 1.
+ is_causal = True if self.is_decoder and attention_mask is None and tgt_len > 1 else False
 
  attn_output = torch.nn.functional.scaled_dot_product_attention(
  query_layer,

diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py
@@ -588,8 +588,8 @@ def forward(
  key_states = key_states.contiguous()
  value_states = value_states.contiguous()
 
- # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an
- # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True`
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
  is_causal = True if causal_mask is None and q_len > 1 else False
 
  attn_output = torch.nn.functional.scaled_dot_product_attention(

diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py
@@ -788,6 +788,11 @@ def forward(
 
  query_states = self._shape(query_states, tgt_len, bsz)
 
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+ # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
+ is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False
+
  # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
  # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
  attn_output = torch.nn.functional.scaled_dot_product_attention(
@@ -796,8 +801,7 @@ def forward(
  value_states,
  attn_mask=attention_mask,
  dropout_p=self.dropout if self.training else 0.0,
- # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
- is_causal=self.is_causal and attention_mask is None and tgt_len > 1,
+ is_causal=is_causal,
  )
 
  if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):

diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py
@@ -441,16 +441,19 @@ def forward(
 
  if alibi is None:
  if self._use_sdpa and not output_attentions:
- attn_output = F.scaled_dot_product_attention(
+ # We dispatch to SDPA's Flash Attention or Efficient kernels by computing `is_causal` in its own statement
+ # rather than inline in the SDPA call, to support both torch.compile's `dynamic=True` and `fullgraph=True`.
+ # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not
+ # create a causal mask in case query_length == 1.
+ is_causal = True if self.is_causal and attention_mask is None and query_length > 1 else False
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
  query_layer,
  key_layer,
  value_layer,
- attention_mask,
- 0.0,
- # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1.
- is_causal=self.is_causal and attention_mask is None and query_length > 1,
+ attn_mask=attention_mask,
+ dropout_p=0.0,
+ is_causal=is_causal,
  )
-
  attention_scores = None
  else:
  attention_scores = query_layer @ key_layer.transpose(-1, -2)
@@ -473,13 +476,16 @@ def forward(
 
  else:
  if self._use_sdpa and not output_attentions and head_mask is None:
- attn_output = F.scaled_dot_product_attention(
+ # We dispatch to SDPA's Flash Attention or Efficient kernels by computing `is_causal` in its own statement
+ # rather than inline in the SDPA call, to support both torch.compile's `dynamic=True` and `fullgraph=True`.
+ is_causal = True if self.is_causal and attention_mask is None and query_length > 1 else False
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
  query_layer,
  key_layer,
  value_layer,
  attn_mask=attention_mask,
  dropout_p=self.attention_dropout.p if self.training else 0.0,
- is_causal=self.is_causal and attention_mask is None and query_length > 1,
+ is_causal=is_causal,
  )
  attn_output = attn_output.transpose(1, 2)
  attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim)

diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py
@@ -568,8 +568,8 @@ def forward(
  key_states = key_states.contiguous()
  value_states = value_states.contiguous()
 
- # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an
- # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True`
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
  is_causal = True if causal_mask is None and q_len > 1 else False
 
  attn_output = torch.nn.functional.scaled_dot_product_attention(

diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
@@ -549,14 +549,19 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None):
  key = key.contiguous()
  value = value.contiguous()
 
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+ # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not
+ # create a causal mask in case query_length == 1.
+ is_causal = True if self.is_causal and attention_mask is None and query_length > 1 else False
+
  sdpa_result = torch.nn.functional.scaled_dot_product_attention(
  query,
  key,
  value,
  attn_mask=attention_mask,
  dropout_p=self.attn_pdrop if self.training else 0.0,
- # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1.
- is_causal=self.is_causal and attention_mask is None and query_length > 1,
+ is_causal=is_causal,
  scale=scale,
  )
 

diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py
@@ -852,6 +852,11 @@ def forward(
 
  query_states = self._shape(query_states, tgt_len, bsz)
 
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+ # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
+ is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False
+
  # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
  # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
  attn_output = torch.nn.functional.scaled_dot_product_attention(
@@ -860,8 +865,7 @@ def forward(
  value_states,
  attn_mask=attention_mask,
  dropout_p=self.dropout if self.training else 0.0,
- # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
- is_causal=self.is_causal and attention_mask is None and tgt_len > 1,
+ is_causal=is_causal,
  )
 
  if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):

diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py
@@ -660,14 +660,18 @@ def forward(
  key_states = key_states.contiguous()
  value_states = value_states.contiguous()
 
- attn_output = nn.functional.scaled_dot_product_attention(
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+ # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+ is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False
+
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
  query_states,
  key_states,
  value_states,
  attn_mask=attention_mask,
- dropout_p=self.dropout,
- # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
- is_causal=self.is_causal and attention_mask is None and q_len > 1,
+ dropout_p=self.dropout if self.training else 0.0,
+ is_causal=is_causal,
  )
 
  if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):

diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py
@@ -666,8 +666,8 @@ def forward(
  key_states = key_states.contiguous()
  value_states = value_states.contiguous()
 
- # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an
- # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True`
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
  is_causal = True if causal_mask is None and q_len > 1 else False
 
  attn_output = torch.nn.functional.scaled_dot_product_attention(

diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py
@@ -685,14 +685,18 @@ def forward(
  key_states = key_states.contiguous()
  value_states = value_states.contiguous()
 
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+ # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+ is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False
+
  attn_output = torch.nn.functional.scaled_dot_product_attention(
  query_states,
  key_states,
  value_states,
  attn_mask=attention_mask,
  dropout_p=self.attention_dropout if self.training else 0.0,
- # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
- is_causal=self.is_causal and attention_mask is None and q_len > 1,
+ is_causal=is_causal,
  )
 
  attn_output = attn_output.transpose(1, 2).contiguous()

diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py
@@ -762,14 +762,18 @@ def forward(
  key_states = key_states.contiguous()
  value_states = value_states.contiguous()
 
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+ # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+ is_causal = True if self.is_causal and attention_mask is None and q_len > 1 else False
+
  attn_output = torch.nn.functional.scaled_dot_product_attention(
  query_states,
  key_states,
  value_states,
  attn_mask=attention_mask,
  dropout_p=self.attention_dropout if self.training else 0.0,
- # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
- is_causal=self.is_causal and attention_mask is None and q_len > 1,
+ is_causal=is_causal,
  )
 
  attn_output = attn_output.transpose(1, 2).contiguous()

diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py
@@ -618,6 +618,11 @@ def forward(
 
  query_states = self._shape(query_states, tgt_len, bsz)
 
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+ # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
+ is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False
+
  # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
  # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
  attn_output = torch.nn.functional.scaled_dot_product_attention(
@@ -626,8 +631,7 @@ def forward(
  value_states,
  attn_mask=attention_mask,
  dropout_p=self.dropout if self.training else 0.0,
- # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
- is_causal=self.is_causal and attention_mask is None and tgt_len > 1,
+ is_causal=is_causal,
  )
 
  if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):

diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py
@@ -634,6 +634,11 @@ def forward(
 
  query_states = self._shape(query_states, tgt_len, bsz)
 
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+ # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
+ is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False
+
  # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
  # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
  attn_output = torch.nn.functional.scaled_dot_product_attention(
@@ -642,8 +647,7 @@ def forward(
  value_states,
  attn_mask=attention_mask,
  dropout_p=self.dropout if self.training else 0.0,
- # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
- is_causal=self.is_causal and attention_mask is None and tgt_len > 1,
+ is_causal=is_causal,
  )
 
  if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):

diff --git a/src/transformers/models/olmo/modeling_olmo.py b/src/transformers/models/olmo/modeling_olmo.py
@@ -641,8 +641,8 @@ def forward(
  key_states = key_states.contiguous()
  value_states = value_states.contiguous()
 
- # We dispatch to SDPA's Flash Attention or Efficient kernels via this if statement instead of an
- # inline conditional assignment to support both torch.compile's `dynamic=True` and `fullgraph=True`
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+ # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
  is_causal = True if causal_mask is None and q_len > 1 else False
 
  attn_output = torch.nn.functional.scaled_dot_product_attention(