
    qi4                        d Z ddlmZ ddlZddlmZ ddlmZmZ ddl	m
Z
mZmZ  e       rddlmZ dd	lmZmZmZ  ej&                  e      Z G d
 d      Z	 d dej.                  dej.                  dej.                  dej.                  eej.                  ej.                  f   z  fdZej.                  ez  Z	 	 	 	 	 d!dej.                  dedz  deeef   dz  dedz  ddf
dZdej.                  dedej.                  fdZ	 	 	 d"dej>                  j@                  dej.                  dej.                  dej.                  deej.                  df   de!dz  de!dz  dej.                  dz  deej.                  ej.                  dz  f   fdZ"y)#a7  
Partially inspired by torchtune's flex attention implementation

Citation:
@software{torchtune,
  title = {torchtune: PyTorch's finetuning library},
  author = {torchtune maintainers and contributors},
  url = {https://github.com/pytorch/torchtune},
  license = {BSD-3-Clause},
  month = apr,
  year = {2024}
}
"""

from typing import Union

import torch
from packaging import version

from ..utils import is_torch_flex_attn_available, logging
from ..utils.import_utils import get_torch_version, is_torch_less_or_equal, is_torchdynamo_compiling


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import _DEFAULT_SPARSE_BLOCK_SIZE as flex_default_block_size
    from torch.nn.attention.flex_attention import BlockMask, create_block_mask, flex_attention


logger = logging.get_logger(__name__)


class WrappedFlexAttention:
    """
    We are doing a singleton class so that flex attention is compiled once when it's first called.
    """

    _instance = None
    _is_flex_compiled = False
    _compiled_flex_attention = None

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            # Create a new instance if one doesn't already exist
            cls._instance = super().__new__(cls)
        return cls._instance

    @torch.compiler.disable(recursive=False)
    def __init__(self, training):
        """
        Initialize or update the singleton instance.
        """
        if not self._is_flex_compiled or training != self.training:
            self.training = training
            # Pick the compile settings based on the installed torch version and whether we are training
            if is_torch_less_or_equal("2.5.1"):
                self._compiled_flex_attention = torch.compile(flex_attention, dynamic=False)
            elif version.parse(get_torch_version()).base_version == "2.6.0" and training:
                self._compiled_flex_attention = torch.compile(
                    flex_attention, dynamic=False, mode="max-autotune-no-cudagraphs"
                )
            else:
                self._compiled_flex_attention = torch.compile(flex_attention)

            self._is_flex_compiled = True

    def __call__(self):
        return self._compiled_flex_attention


def compile_friendly_flex_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    training=False,
    **kwargs,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
    # If we are already tracing under torch.compile, use the plain flex_attention kernel:
    # the surrounding compilation takes care of it.
    flex_attention_compiled = WrappedFlexAttention(training)() if not is_torchdynamo_compiling() else flex_attention
    return flex_attention_compiled(query, key, value, **kwargs)


Offset = Union[torch.Tensor, int]


def make_flex_block_causal_mask(
    attention_mask_2d: torch.Tensor,
    attention_chunk_size: int | None = None,
    query_length=None,
    key_length=None,
    offsets: tuple[Offset, Offset] | None = None,
    is_causal: bool | None = True,
) -> "BlockMask":
    """
    IMPORTANT NOTICE: This function is deprecated in favor of using the mask primitives in `masking_utils.py`,
    and will be removed in a future version without warnings. New code should not use it. It is only kept here
    for BC for now, while models using it are being patched accordingly.

    Create a block (causal) document mask for a batch of sequences, both packed and unpacked.
    Create the block (causal) mask logic and pass it into :func:`torch.nn.attention.flex_attention.create_block_mask`.
    The resultant BlockMask is a compressed representation of the full (causal) block
    mask. BlockMask is essential for performant computation of flex attention.
    See: https://pytorch.org/blog/flexattention/

    Args:
        attention_mask_2d (torch.Tensor): Attention mask for packed and padded sequences
        of shape (batch_size, total_seq_len). e.g.

        For unpacked sequence:
        [[1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 0, 0]]

        For packed sequence:
        [[1, 1, 1, 2, 2, 2, 0],
         [1, 1, 2, 2, 2, 3, 3]]
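
        For example, the first packed row [1, 1, 1, 2, 2, 2, 0] corresponds (conceptually) to the
        following causal document mask, in which a query position may only attend to earlier key
        positions of the same document and padding positions attend to nothing:

        [[1, 0, 0, 0, 0, 0, 0],
         [1, 1, 0, 0, 0, 0, 0],
         [1, 1, 1, 0, 0, 0, 0],
         [0, 0, 0, 1, 0, 0, 0],
         [0, 0, 0, 1, 1, 0, 0],
         [0, 0, 0, 1, 1, 1, 0],
         [0, 0, 0, 0, 0, 0, 0]]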

    Returns:
        BlockMask
    """
    batch_size, total_seq_len = attention_mask_2d.shape
    if not key_length:
        key_length = total_seq_len
    if not query_length:
        query_length = total_seq_len
    # Pad the key dimension up to a multiple of the sparse block size expected by flex attention
    pad_len = (key_length // flex_default_block_size + 1) * flex_default_block_size
    attention_mask_2d = torch.nn.functional.pad(attention_mask_2d, value=0, pad=[0, pad_len - key_length])
    device = attention_mask_2d.device
    document_ids = attention_mask_2d.clone()

    if attention_chunk_size is not None:
        # Build an arange and floor-divide by the chunk size to get chunk indices, e.g. [0, 0, 1, 1, 2, 2, ...]
        chunk_idxs = (document_ids.clone().fill_(1).cumsum(-1) - 1) // attention_chunk_size

    def causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Defines the logic of a block causal mask by combining both a standard causal mask
        and a block diagonal document mask.
        See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
        for an illustration.
        """
        causal_mask = q_idx >= kv_idx
        document_mask = document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
        padding_mask = attention_mask_2d[batch_idx, q_idx] > 0
        final_mask = causal_mask & padding_mask & document_mask
        return final_mask

    def chunk_causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Combines the chunk mask with the causal mask for chunked attention.
        """
        chunk_mask = chunk_idxs[batch_idx, q_idx] == chunk_idxs[batch_idx, kv_idx]
        causal_doc_mask = causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx)
        return chunk_mask & causal_doc_mask

    def default_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Utilizes default attention mask to enable encoder and encoder-decoder
        attention masks.
        """
        document_mask = document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
        # The kv index is used as the padding reference, since there is no causal restriction here
        padding_mask = attention_mask_2d[batch_idx, kv_idx] > 0
        final_mask = padding_mask & document_mask
        return final_mask

    if not is_causal:
        mask_mod_maybe_combined = default_mask_mod
    else:
        mask_mod_maybe_combined = causal_mask_mod if attention_chunk_size is None else chunk_causal_mask_mod

    if offsets is not None:
        q_offset = offsets[0].to(device)
        kv_offset = offsets[1].to(device)

        def mask_mod(batch_idx, head_idx, q_idx, kv_idx):
            offset_q = q_idx + q_offset
            offset_kv = kv_idx + kv_offset
            return mask_mod_maybe_combined(batch_idx, head_idx, offset_q, offset_kv)
    else:
        mask_mod = mask_mod_maybe_combined

    return create_block_mask(
        mask_mod=mask_mod,
        B=batch_size,
        H=None,  # broadcast over all attention heads
        Q_LEN=query_length,
        KV_LEN=key_length,
        device=device,
        _compile=True,
    )


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def flex_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Union[torch.Tensor, "BlockMask"],
    scaling: float | None = None,
    softcap: float | None = None,
    s_aux: torch.Tensor | None = None,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor | None]:
    if kwargs.get("dropout", 0.0) > 0:
        raise ValueError(
            "`flex_attention` does not support `dropout`. Please use it with inference only (`model.eval()`) or turn"
            " off the attention dropout in the respective config."
        )

    block_mask = None
    score_mask = None
    if isinstance(attention_mask, BlockMask):
        block_mask = attention_mask
    else:
        score_mask = attention_mask

    if score_mask is not None:
        score_mask = score_mask[:, :, :, : key.shape[-2]]

    def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
        if softcap is not None:
            score = softcap * torch.tanh(score / softcap)
        if score_mask is not None:
            score = score + score_mask[batch_idx][0][q_idx][kv_idx]
        return score

    enable_gqa = True
    num_local_query_heads = query.shape[1]
    # When the number of query heads is not a power of two (e.g. under tensor parallelism), flex
    # attention's GQA path cannot be used, so the key/value heads are repeated explicitly instead.
    if (num_local_query_heads & (num_local_query_heads - 1)) != 0:
        key = repeat_kv(key, query.shape[1] // key.shape[1])
        value = repeat_kv(value, query.shape[1] // value.shape[1])
        enable_gqa = False

    kernel_options = kwargs.get("kernel_options", None)
    # The CPU kernel cannot return the log-sum-exp, which the attention-sink renormalization needs
    return_lse = query.device.type != "cpu"
    if not return_lse and s_aux is not None:
        raise ValueError(
            "Attention sinks cannot be run on CPU with flex attention. Please switch to a different device, e.g. CUDA"
        )

    flex_attention_output = compile_friendly_flex_attention(
        query,
        key,
        value,
        score_mod=score_mod,
        block_mask=block_mask,
        enable_gqa=enable_gqa,
        scale=scaling,
        kernel_options=kernel_options,
        return_lse=return_lse,
        training=module.training,
    )
    if return_lse:
        attention_output, lse = flex_attention_output  # lse is returned in float32
        lse = lse.to(value.dtype)

        if s_aux is not None:
            # Renormalize the outputs so that the attention sink logits take part in the softmax normalization
            batch_size, num_heads, seq_len_q, _ = attention_output.shape
            sinks = s_aux.view(1, -1, 1, 1).expand(batch_size, -1, seq_len_q, -1)
            lse_expanded = lse.unsqueeze(-1)
            combined_lse = torch.logsumexp(torch.cat([lse_expanded, sinks], dim=-1), dim=-1, keepdim=True)
            renorm_factor = torch.exp(lse_expanded - combined_lse)
            attention_output = attention_output * renorm_factor
    else:
        attention_output = flex_attention_output
        lse = None

    attention_output = attention_output.transpose(1, 2).contiguous()
    return attention_output, lse