
    qi                        d dl Z ddlmZmZ ddlmZ  ej                  e      Z e       Z	de j                  de j                  j                  de j                  fdZ	 	 	 	 	 dde j                  j                  de j                  d	e j                  d
e j                  de j                  dz  dededz  dedz  dedz  dedz  dee j                  df   fdZy)    N   )_flash_attention_forward!flash_attn_supports_top_left_mask)loggingquerymodulereturnc                 R   | j                   t        j                  k(  rt        j                  d      rt        j                  d      S t        |j                  d      r|j                  j                   S t        d |j                         D              j                  j                   S y)ziIf the query is in float32, return a target dtype compatible with flash attention. Return None otherwise.cuda_is_quantizedc              3   j   K   | ]+  }t        |t        j                  j                        s(| - y w)N)
isinstancetorchnnLinear).0layers     [/opt/pipecat/venv/lib/python3.12/site-packages/transformers/integrations/flash_attention.py	<genexpr>z#get_target_dtype.<locals>.<genexpr>   s&     b%z%QVQYQYQ`Q`?abs   )33N)
dtyper   float32is_autocast_enabledget_autocast_dtypehasattrconfignextmodulesweight)r   r   s     r   get_target_dtyper      sv    {{emm#$$V,++F33V]]O4==&&&b6>>+;bbiiooo    keyvalueattention_maskdropoutscalingsliding_windowsoftcap	is_causalc
                    |
j                  dd      rt        j                  d       |j                  d   }t	        d |j                  D              rt        d      |j                  dd      }|j                  dd      }|j                  dd      }t        ||       }|	|	n| j                  }	t        ||||f||	||||t        || j                  j                  t        | d      r| j                  nd d	
|
}|d fS )
Noutput_attentionsFzFlash Attention does not support `output_attentions=True`. Please set your attention to `eager` if you want any of these features.r   c              3   &   K   | ]	  }|d k(    yw)r   N )r   dims     r   r   z*flash_attention_forward.<locals>.<genexpr>/   s     
+3!8
+s   zTensor query has shape  with a zero dimension.
FlashAttention does not support inputs with dim=0.
Please check your input shapes or use SDPA instead.   	layer_idx)
query_lengthr(   r$   softmax_scaler&   r'   use_top_left_masktarget_dtypeattn_implementationr/   )getloggerwarning_onceshapeany
ValueError	transposer   r(   r   _use_top_left_maskr   _attn_implementationr   r/   )r   r   r!   r"   r#   r$   r%   r&   r'   r(   kwargsseq_lenr3   attn_outputs                 r   flash_attention_forwardrA      s    zz%u-W	
 kk!nG

+u{{
++B
 	
 OOAq!E
--1
COOAq!E $E62L '2	8H8HI*	
 %,!"MM>>&-fk&B&"" K$ r    )g        NNNN)r   modeling_flash_attention_utilsr   r   utilsr   
get_logger__name__r6   r<   Tensorr   Moduler   r   floatintbooltuplerA   r,   r    r   <module>rL      s    h  
		H	%68 
ELL 
%((// 
ekk 
&  !% !=HHOO=<<= 
= <<	=
 LL4'= = T\= $J= T\= d{= 5<<=r    