
    qiO6                       d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	Z	ddl
mc mZ ddl	mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(m)Z)m*Z*  e%jV                  e,      Z-d Z.d Z/d Z0de	jb                  de	jb                  fdZ2e e#d       G d de!                    Z3e e#d       G d de!                    Z4ee# G d d e!                    Z5 G d! d"ejl                        Z7 G d# d$ejl                        Z8 G d% d&ejl                        Z9 G d' d(ejl                        Z: G d) d*ejl                        Z; G d+ d,ejl                        Z< G d- d.ejl                        Z= G d/ d0ejl                        Z> G d1 d2ejl                        Z? G d3 d4e      Z@ G d5 d6ejl                        ZA G d7 d8ejl                        ZB G d9 d:ejl                        ZC G d; d<ejl                        ZD	 dcd=ejl                  d>e	jb                  d?e	jb                  d@e	jb                  dAe	jb                  dz  dBeEdCeEfdDZF G dE dFejl                        ZG G dG dHejl                        ZH G dI dJejl                        ZI G dK dLejl                        ZJ G dM dNejl                        ZK G dO dPe      ZL G dQ dRejl                        ZM G dS dTejl                        ZNe# G dU dVe             ZO G dW dXeO      ZP e#dY       G dZ d[eO             ZQe# G d\ d]eO             ZRe# G d^ d_eO             ZSe# G d` daeO             ZTg dbZUy)dzPyTorch CLAP model.    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int   )ClapAudioConfig
ClapConfigClapTextConfigc                     | j                   \  }}}| dddddddf   j                  dd|d      }|j                  |||z  |      }|S )ae  
    Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN.

    Args:
        hidden_states (`torch.FloatTensor` of shape (batch_size, time_length, classes_num)):
            Input hidden states
        ratio (`int`):
            The ratio of the length of the output to the length of the input.
    Nr   )shaperepeatreshape)hidden_statesratio
batch_sizetime_lengthclasses_num	upsampleds         X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/clap/modeling_clap.pyinterpolater'   -   sX     .;-@-@*ZkaD!m,33Aq%CI!!*kE.A;OI    c                     | j                   \  }}}}| j                  |||z  |||z  ||      } | j                  dddddd      j                         j                  d|||      }|S )aR  
    Returns the resized hidden states. The output shape should be `(batch_size * num_windows, window_size, window_size,
    num_channels)`

    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch_size, height, width, num_channels)`):
            Input hidden states
        window_size (`int`):
            Window size
    r   r   r            r   viewpermute
contiguous)r    window_sizer"   heightwidthnum_channelswindowss          r&   window_partitionr7   >   s}     /<.A.A+J|!&&Fk);8Lk[gM ##Aq!Q15@@BGGKYdfrsGNr(   c                     | j                   d   }| j                  d||z  ||z  |||      } | j                  dddddd      j                         j                  d|||      } | S )a  
    Merges windows to produce higher resolution features.
    Args:
        windows (`torch.FloatTensor` of shape `(num_windows * batch_size, window_size, window_size, num_channels)`):
            Input windows
        window_size (`int`):
            Window size
        height (`int`):
            Height of the resized audio
        width (`int`):
            Width of the resized audio
    r-   r   r   r   r*   r+   r,   r.   )r6   r2   r3   r4   r5   s        r&   window_reverser9   S   sn     ==$Lll2v4e{6JKYdfrsGooaAq!Q/::<AA"feUabGNr(   logitsreturnc                     t        j                  t        |       | j                        }t        j
                  j                  | |      S )Ndevice)torcharangelenr>   r   
functionalcross_entropy)r:   labelss     r&   contrastive_lossrE   h   s1    \\#f+fmm<F==&&vv66r(   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)ClapTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedslast_hidden_state.r    
attentions)__name__
__module____qualname____doc__rI   r?   FloatTensor__annotations__rJ   r    tuplerK    r(   r&   rH   rH   m   sr    
 -1K""T)026u((4/6:>M5**C/047>7;Je'',-4;r(   rH   zT
    ClapAudio model output to mimic the output of the original implementation.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)ClapAudioModelOutputz
    audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        The Audio embeddings obtained by applying the projection layer to the pooler_output.
    Naudio_embedsrJ   .r    rK   )rL   rM   rN   rO   rV   r?   rP   rQ   rJ   r    rR   rK   rS   r(   r&   rU   rU      sr    
 .2L%##d*126u((4/6:>M5**C/047>7;Je'',-4;r(   rU   c                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZeed<   dZeed	<   d
ee   fdZy)
ClapOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for audio-text similarity.
    logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`):
        The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`):
        The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`].
    audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`ClapTextModel`].
    audio_model_output (`BaseModelOutputWithPooling`):
        The output of the [`ClapAudioModel`].
    Nlosslogits_per_audiologits_per_textrI   rV   text_model_outputaudio_model_outputr;   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r\   r]   N)getattrto_tuple).0kselfs     r&   	<genexpr>z&ClapOutput.to_tuple.<locals>.<genexpr>   s=      
  KKDGQXY]_`QaQjQjQll
s   -0)rR   keysrd   s   `r&   ra   zClapOutput.to_tuple   s#     
YY[
 
 	
r(   )rL   rM   rN   rO   rY   r?   rP   rQ   rZ   r[   rI   rV   r\   r   r]   rR   r   ra   rS   r(   r&   rX   rX      s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*148185929
%* 
r(   rX   c                   *     e Zd ZdZd fd	Zd Z xZS )ClapDropPathz
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is a slightly
    refactored version of the `SwinDropPath` implementation.
    c                 0    t         |           || _        y N)super__init__	drop_prob)rd   rn   	__class__s     r&   rm   zClapDropPath.__init__   s    "r(   c                 J   | j                   dk(  s| j                  s|S d| j                   z
  }|j                  d   fd|j                  dz
  z  z   }|t	        j
                  ||j                  |j                        z   }|j                          |j                  |      |z  }|S )N        r   r   )r   dtyper>   )
rn   trainingr   ndimr?   randrs   r>   floor_div)rd   r    	keep_probr   random_tensoroutputs         r&   forwardzClapDropPath.forward   s    >>S   &	$$Q')DM4F4F4J,KK!EJJuM<O<OXeXlXl$mm""9-=r(   rk   )rL   rM   rN   rO   rm   r|   __classcell__ro   s   @r&   ri   ri      s    
#r(   ri   c                   .     e Zd ZdZdef fdZd Z xZS )ClapAudioAFFBlockz
    ATTENTIONAL FEATURE FUSION Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to implement
    the 1D version.
    configc                    t         |           |j                  }|j                  }t	        ||z        }t        j                  t        j                  ||ddd      t        j                  |      t        j                  d      t        j                  ||ddd      t        j                  |            | _
        t        j                  t        j                  d      t        j                  ||ddd      t        j                  |      t        j                  d      t        j                  ||ddd      t        j                  |            | _        t        j                         | _        y )Nr   r   kernel_sizestridepaddingT)inplace)rl   rm   patch_embeds_hidden_sizeaff_block_rintr   
SequentialConv2dBatchNorm2dReLU	local_attAdaptiveAvgPool2d
global_attSigmoidsigmoid)rd   r   channelsdownsize_ratiointer_channelsro   s        r&   rm   zClapAudioAFFBlock.__init__   s   22++X78IIhAaQRSNN>*GGD!IInhAaQRSNN8$
 --  #IIhAaQRSNN>*GGD!IInhAaQRSNN8$
 zz|r(   c                     ||z   }| j                  |      | j                  |      z   }| j                  |      }d|z  |z  d|z  d|z
  z  z   }|S )Nr*   r   )r   r   r   )rd   r    residualattention_inputfused_layer_outputr{   s         r&   r|   zClapAudioAFFBlock.forward   sb    '(2!^^O<t?__!\\*<=]"%77!h,!N`J`:aar(   rL   rM   rN   rO   r   rm   r|   r}   r~   s   @r&   r   r      s    
$ $0r(   r   c                   0     e Zd ZdZdef fdZddZ xZS )ClapAudioPatchEmbedz
    This module converts the hidden states reshaped as an image to patch embeddings ready to be passed to the
    Transformer block.
    r   c                    t         |           t        |j                  t              r|j                  |j                  fn|j                  }t        |j
                  t              r|j
                  |j
                  fn|j
                  }t        |j                  t              r|j                  |j                  fn|j                  }|| _        || _        |d   |d   z  |d   |d   z  f| _        | j                  d   | j                  d   z  | _	        |j                  | _        |j                  | _        |d   |d   z
  dz  |d   |d   z
  dz  f}| j                  r|j                  dk(  rdnd}t        j                  |j                   |z  |j"                  |||      | _        |j&                  rt        j(                  |j"                        nt        j*                         | _        | j                  rZt/        |      | _        t        j                  |j                   |j"                  |d   |d   dz  f|d   |d   dz  f|      | _        y y )Nr   r   r*   channel_mapr+   r   r   )rl   rm   
isinstance	spec_sizer   
patch_sizepatch_strideimg_size	grid_sizenum_patchesflatten_patch_embedsflattenenable_fusionfusion_typer   r   patch_embed_input_channelsr   projenable_patch_layer_norm	LayerNormIdentitynormr   fusion_model
mel_conv2d)rd   r   r   r   r   r   scale_factorro   s          r&   rm   zClapAudioPatchEmbed.__init__   s+   ;EfFVFVX[;\F$$f&6&67bhbrbr6@ARARTW6XV 1 12^d^o^o 	 ;EVEXEXZ]:^V  &"5"56djdwdw 	 !("1+a8(1+VW:XY>>!,t~~a/@@22#11qMLO39JqMLYZO<[`a;ab!//f6H6HM6Yq`aII--<++"
	 FLEcEcBLL!@!@Aikititiv	 1& 9D ii11//']JqMA,=>$Qa1)<=DO r(   c                    | j                   r|d d ddd d d d f   }|j                  \  }}}}|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d	      | j	                  |      }|j                  d      }t        |      dkD  r||dd d d d d f   j                         }	|	j                  \  }}}}|	j                  ||z  d||      }	| j                  |	      }	|	j                  \  }
}}}|	j                  |||||      }	|	j                  d      j                         j                  d	      }	|	j                  d      }t        j                  j                  j                  |	d||z
  fd
d      }	| j!                  ||   |	      ||<   |}nx|j                  \  }
}
}}|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d	      | j	                  |      }| j                  r!|j                  d      j#                  dd      }| j%                  |      }|S )Nr   r   zInput audio size (*z) doesn't match model (z).r-   )r   r*   r   r   r+   r   constantr*   )r   r   r   
ValueErrorr   sizerA   r1   r/   r   r0   r   r?   r   rB   padr   	transposer   )rd   r    is_longer_idxglobal_hidden_statesr"   r5   r3   r4   output_widthlocal_hidden_states_featureslocal_widths                r&   r|   zClapAudioPatchEmbed.forward*  s   #0AaCA#>  7K6P6P3Jfeq))UdmmA6F-F (%8OPTP]P]^_P`Oaabcgcpcpqrcsbttvw  $(99-A#B /44R8L=!A%&3M12q!4K&L&W&W&Y#:M:S:S7
L&%&9&>&>zL?XZ[]cej&k#&*oo6I&J#-@-F-F*8VU&9&>&>z<Yacikp&q#&9&A&A/&R&]&]&_&g&ghi&j#166r:&+hh&9&9&=&='!\K-G)H*VW'# 7;6G6G(79L7$]3 1M"/"5"5Aq&%q))UdmmA6F-F (%8OPTP]P]^_P`Oaabcgcpcpqrcsbttvw  !IIm4M<<)11!4>>q!DM		-0r(   rk   r   r~   s   @r&   r   r      s    
( (T/r(   r   c            
            e Zd Z fdZ	 	 d	dej
                  dej                  dz  dedz  deej
                     fdZ	d Z
 xZS )
ClapAudioSelfAttentionc                    t         |           ||z  dk7  rt        d| d| d      || _        t	        ||z        | _        | j                  | j
                  z  | _        t        |t        j                  j                        r|n||f| _        t        j                  t        j                  d| j                  d   z  dz
  d| j                  d   z  dz
  z  |            | _        | j#                  d| j%                                t        j&                  | j                  | j                  |j(                        | _        t        j&                  | j                  | j                  |j(                        | _        t        j&                  | j                  | j                  |j(                        | _        t        j0                  |j2                        | _        y )	Nr   The hidden size (6) is not a multiple of the number of attention heads ()r*   r   relative_position_indexbias)rl   rm   r   num_attention_headsr   attention_head_sizeall_head_sizer   collectionsabcIterabler2   r   	Parameterr?   zerosrelative_position_bias_tableregister_buffercreate_relative_position_indexLinearqkv_biasquerykeyvalueDropoutattention_probs_dropout_probdropoutrd   r   dim	num_headsr2   ro   s        r&   rm   zClapAudioSelfAttention.__init__^  s   ?a#C5(^_h^iijk  $- #&sY#7 !558P8PP%k;??3K3KLKS^`kRl 	 -/LLKKT--a0014T=M=Ma=P9PST9TUW`a-
) 	68[8[8]^YYt1143E3EFOO\
99T//1C1C&//ZYYt1143E3EFOO\
zz&"E"EFr(   Nr    attention_maskoutput_attentionsr;   c                    |j                   \  }}}||d| j                  f}| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
t        j                  ||	j	                  dd            }|t        j                  | j                        z  }| j                  | j                  j                  d         }|j                  | j                  d   | j                  d   z  | j                  d   | j                  d   z  d      }|j                  ddd      j                         }||j!                  d      z   }|r|j                   d   }|j                  ||z  || j"                  ||      }||j!                  d      j!                  d      z   }|j                  d| j"                  ||      }t$        j&                  j)                  |d      }| j+                  |      }t        j                  ||
      }|j                  dddd      j                         }|j-                         d d | j.                  fz   }|j                  |      }|r||f}|S |f}|S )Nr-   r   r*   r   r   r   )r   r   r   r/   r   r   r   r?   matmulmathsqrtr   r   r2   r0   r1   	unsqueezer   r   rB   softmaxr   r   r   )rd   r    r   r   r"   r   r5   hidden_shapequery_layer	key_layervalue_layerattention_scoresrelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputss                     r&   r|   zClapAudioSelfAttention.forwardx  s    )6(;(;%
C"CT-E-EFjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR !<<Y5H5HR5PQ+dii8P8P.QQ!%!B!B4C_C_CdCdegCh!i!7!<!<Q$"2"21"55t7G7G7JTM]M]^_M`7`bd"
 "8!?!?1a!H!S!S!U+.D.N.Nq.QQ%'--a0J/44j(*d6N6NPSUX   0.2J2J12M2W2WXY2ZZ/44R9Q9QSVX[\ --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2 O\M]r(   c                    t        j                  | j                  d         }t        j                  | j                  d         }t        j                  t        j                  ||gd            }t        j
                  |d      }|d d d d d f   |d d d d d f   z
  }|j                  ddd      j                         }|d d d d dfxx   | j                  d   dz
  z  cc<   |d d d d dfxx   | j                  d   dz
  z  cc<   |d d d d dfxx   d| j                  d   z  dz
  z  cc<   |j                  d      }|S )Nr   r   ij)indexingr*   r-   )	r?   r@   r2   stackmeshgridr   r0   r1   sum)rd   coords_hcoords_wcoordscoords_flattenrelative_coordsr   s          r&   r   z5ClapAudioSelfAttention.create_relative_position_index  s-   << 0 0 34<< 0 0 34U^^Xx,@4PQvq1(At4~aqj7QQ)11!Q:EEG1a D$4$4Q$7!$;; 1a D$4$4Q$7!$;; 1a A(8(8(;$;a$?? "1"5"5b"9&&r(   NF)rL   rM   rN   rm   r?   TensorrP   boolrR   r|   r   r}   r~   s   @r&   r   r   ]  s^    G: 48).	1||1 ))D01  $;	1
 
u||	1f'r(   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ClapAudioSelfOutputc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y rk   )rl   rm   r   r   denser   r   r   rd   r   r   ro   s      r&   rm   zClapAudioSelfOutput.__init__  s6    YYsC(
zz&"E"EFr(   r    input_tensorr;   c                 J    | j                  |      }| j                  |      }|S rk   r  r   rd   r    r	  s      r&   r|   zClapAudioSelfOutput.forward  s$    

=1]3r(   rL   rM   rN   rm   r?   r  r|   r}   r~   s   @r&   r  r    s2    G
U\\  RWR^R^ r(   r  c            
            e Zd Z fdZ	 	 ddej
                  dej                  dz  dedz  deej
                     fdZ	 xZ
S )	ClapAudioAttentionc                 j    t         |           t        ||||      | _        t	        ||      | _        y rk   )rl   rm   r   rd   r  r{   r   s        r&   rm   zClapAudioAttention.__init__  s.    *63	;O	)&#6r(   Nr    r   r   r;   c                 h    | j                  |||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   rd   r{   )rd   r    r   r   self_outputsattention_outputr   s          r&   r|   zClapAudioAttention.forward  sE     yy@QR;;|AF#%QR(88r(   r  )rL   rM   rN   rm   r?   r  rP   r  rR   r|   r}   r~   s   @r&   r  r    sW    7 48).		||	 ))D0	  $;		
 
u||		r(   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapAudioIntermediatec                    t         |           t        j                  |t	        |j
                  |z              | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y rk   )rl   rm   r   r   r   	mlp_ratior  r   
hidden_actstrr	   intermediate_act_fnr  s      r&   rm   zClapAudioIntermediate.__init__  sa    YYsC(8(83(>$?@
f''-'-f.?.?'@D$'-'8'8D$r(   r    r;   c                 J    | j                  |      }| j                  |      }|S rk   r  r  rd   r    s     r&   r|   zClapAudioIntermediate.forward  &    

=100?r(   r  r~   s   @r&   r  r    #    9U\\ ell r(   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapAudioOutputc                     t         |           t        j                  t	        |j
                  |z        |      | _        t        j                  |j                        | _	        y rk   )
rl   rm   r   r   r   r  r  r   hidden_dropout_probr   r  s      r&   rm   zClapAudioOutput.__init__  sF    YYs6#3#3c#9:C@
zz&"<"<=r(   r    r;   c                 J    | j                  |      }| j                  |      }|S rk   r  r  s     r&   r|   zClapAudioOutput.forward  s$    

=1]3r(   r  r~   s   @r&   r"  r"    s#    >
U\\ ell r(   r"  c                        e Zd Zd fd	Zd Zd Zd Z	 	 ddej                  de	e
e
f   dedz  d	edz  d
e	ej                  ej                  f   f
dZ xZS )ClapAudioLayerc                    t         |           |j                  | _        || _        |j                  | _        || _        t        j                  ||j                        | _	        t        |||| j                        | _        |dkD  rt        |      nt        j                         | _        t        j                  ||j                        | _        t!        ||      | _        t%        ||      | _        y )Neps)r2   rq   )rl   rm   chunk_size_feed_forward
shift_sizer2   input_resolutionr   r   layer_norm_epslayernorm_beforer  	attentionri   r   	drop_pathlayernorm_afterr  intermediater"  r{   )rd   r   r   r-  r   drop_path_rater,  ro   s          r&   rm   zClapAudioLayer.__init__  s    '-'E'E$$!-- 0 "Sf6K6K L+FCPTP`P`a9G#9Mn5SUS^S^S`!||CV5J5JK1&#>%fc2r(   c                    t        |      | j                  k  rgt        d      | _        t        j
                  j                         r(t	        j                   t	        j                  |            n
t        |      | _        y y Nr   )minr2   r   r,  r?   jit
is_tracingtensor)rd   r-  s     r&   set_shift_and_window_sizez(ClapAudioLayer.set_shift_and_window_size  s\     D$4$44'lDO=BYY=Q=Q=S		%,,'789Y\]mYn  5r(   c           	         | j                   dkD  rht        j                  d||df||      }t        d| j                         t        | j                   | j                          t        | j                    d       f}t        d| j                         t        | j                   | j                          t        | j                    d       f}d}|D ]  }	|D ]  }
||d d |	|
d d f<   |dz  }  t        || j                        }|j                  d| j                  | j                  z        }|j                  d      |j                  d      z
  }|j                  |dk7  d      j                  |dk(  d      }|S d }|S )Nr   r   rr   r-   r*   g      Yrq   )	r,  r?   r   slicer2   r7   r/   r   masked_fill)rd   r3   r4   rs   r>   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windows	attn_masks                r&   get_attn_maskzClapAudioLayer.get_attn_mask  s   ??Q{{Avua#8fUHa$***+t'''$//)9:t&-M a$***+t'''$//)9:t&-L
 E - #/ K@EHQk1<=QJE
 ,Hd6F6FGL',,R1A1ADDTDT1TUL$..q1L4J4J14MMI!--i1nfEQQR[_`R`befI  Ir(   c                     | j                   || j                   z  z
  | j                   z  }| j                   || j                   z  z
  | j                   z  }ddd|d|f}t        j                  j                  ||      }||fS r6  )r2   r   rB   r   )rd   r    r3   r4   	pad_right
pad_bottom
pad_valuess          r&   	maybe_padzClapAudioLayer.maybe_pad+  s    %%0@0@(@@DDTDTT	&&$2B2B)BBdFVFVV
Ay!Z8
))-Dj((r(   r    input_dimensionsr   Nalways_partitionr;   c                    |s| j                  |       n	 |\  }}|j                         \  }}}	|}
| j                  |      }|j                  ||||	      }| j	                  |||      \  }}|j
                  \  }}}}| j                  dkD  r1t        j                  || j                   | j                   fd      }n|}t        || j                        }|j                  d| j                  | j                  z  |	      }| j                  |||j                  |j                        }| j                  |||      }|d   }|j                  d| j                  | j                  |	      }t        || j                  ||      }| j                  dkD  r/t        j                  || j                  | j                  fd      }n|}|d   dkD  xs |d   dkD  }|r|d d d |d |d d f   j!                         }|j                  |||z  |	      }|
| j#                  |      z   }| j%                  |      }| j'                  |      }|| j)                  |      z   }|r	||d	   f}|S |f}|S )
Nr   )r   r*   )shiftsdimsr-   rr   )r   r   r,   r   )r;  r   r/  r/   rL  r   r,  r?   rollr7   r2   rG  rs   r>   r0  r9   r1   r1  r2  r3  r{   )rd   r    rM  r   rN  r3   r4   r"   r   r   shortcutrK  
height_pad	width_padshifted_hidden_stateshidden_states_windowsrF  attention_outputsr  attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputss                           r&   r|   zClapAudioLayer.forward2  s     **+;<("/"4"4"6
Ax --m<%**:vuhO %)NN=&%$P!z&3&9&9#:y!??Q$)JJ}tFVY]YhYhXhEipv$w!$1! !11FHXHX Y 5 : :2t?O?ORVRbRb?bdl m&&	)<)<EZEaEa ' 
	 !NN+@)_pNq,Q/,11"d6F6FHXHXZbc():D<L<LjZcd ??Q %

?DOOUYUdUdCelr s /]Q&;*Q-!*;
 1!WfWfufa2G H S S U-22:v~xX 4>>2C#DD++M:((6$t{{<'@@@Q'8';< YeWfr(   )rq   r   FF)rL   rM   rN   rm   r;  rG  rL  r?   r  rR   r   r  r|   r}   r~   s   @r&   r'  r'    sz    38) */(->||>  S/>  $;	>
 +> 
u||U\\)	*>r(   r'  c                        e Zd Z fdZ	 	 d	dej
                  deeef   dedz  dedz  deej
                     f
dZ	 xZ
S )
ClapAudioStagec                 h   t         	|           || _        || _        t	        j
                  t        |      D cg c]-  }t        ||||||   |dz  dk(  rdn|j                  dz        / c}      | _	        |& |||t        j                        | _        d| _        y d | _        d| _        y c c}w )Nr*   r   )r   r   r-  r   r4  r,  )r   
norm_layerF)rl   rm   r   r   r   
ModuleListranger'  r2   blocksr   
downsamplepointing)
rd   r   r   r-  depthr   r1  rf  iro   s
            r&   rm   zClapAudioStage.__init__u  s    mm u
  !%5'#,Q<%&UaZqf6H6HA6M

 !()9sr||\DO  #DO'
s   2B/r    rM  r   NrN  r;   c                    |\  }}t        | j                        D ]  \  }} |||||      }	|	d   } |}
| j                  )|dz   dz  |dz   dz  }}||||f}| j                  |
|      }n||||f}||
|f}|r|	dd  z  }|S )Nr   r   r*   )	enumeratere  rf  )rd   r    rM  r   rN  r3   r4   ri  layer_moduler]  !hidden_states_before_downsamplingheight_downsampledwidth_downsampledoutput_dimensionsstage_outputss                  r&   r|   zClapAudioStage.forward  s     )(5 	-OA|(8HJ[]mnM)!,M	-
 -:)??&5;aZA4EPQ	VWGW 1!'0BDU V OO,MO_`M!' >&(IK\]]12..Mr(   r^  )rL   rM   rN   rm   r?   r  rR   r   r  r|   r}   r~   s   @r&   r`  r`  t  sb    < */(-||  S/  $;	
 + 
u||	r(   r`  c            	            e Zd ZdZej
                  fdee   dedej                  ddf fdZ	d Z
d	ej                  d
eeef   dej                  fdZ xZS )ClapAudioPatchMerginga'  
    Patch Merging Layer.

    Args:
        input_resolution (`tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    r-  r   rb  r;   Nc                     t         |           || _        || _        t	        j
                  d|z  d|z  d      | _         |d|z        | _        y )Nr+   r*   Fr   )rl   rm   r-  r   r   r   	reductionr   )rd   r-  r   rb  ro   s       r&   rm   zClapAudioPatchMerging.__init__  sI     01s7AG%@q3w'	r(   c                     |dz  dk(  xs |dz  dk(  }|r.ddd|dz  d|dz  f}t         j                  j                  ||      }|S )Nr*   r   r   )r   rB   r   )rd   input_featurer3   r4   
should_padrK  s         r&   rL  zClapAudioPatchMerging.maybe_pad  sU    qjAo:519>
Q519a!<JMM--mZHMr(   rw  rM  c                    |\  }}|j                   \  }}}|j                  ||||      }| j                  |||      }|d d dd ddd dd d f   }|d d dd ddd dd d f   }	|d d dd ddd dd d f   }
|d d dd ddd dd d f   }t        j                  ||	|
|gd      }|j                  |dd|z        }| j                  |      }| j                  |      }|S )Nr   r*   r   r-   r+   )r   r/   rL  r?   catr   ru  )rd   rw  rM  r3   r4   r"   r   r5   input_feature_0input_feature_1input_feature_2input_feature_3s               r&   r|   zClapAudioPatchMerging.forward  s   ((5(;(;%
C%**:vulS}feD'14a4Aq(89'14a4Aq(89'14a4Aq(89'14a4Aq(89		?O_Ve"fhjk%**:r1|;KL		-0}5r(   )rL   rM   rN   rO   r   r   rR   r   Modulerm   rL  r?   r  r|   r}   r~   s   @r&   rs  rs    sr    
 XZWcWc (s (# (299 (hl (U\\ U3PS8_ Y^YeYe r(   rs  c                        e Zd Z fdZd Z	 	 	 	 	 	 ddej                  dz  dedz  dedz  dedz  dedz  d	edz  d
ee	z  fdZ
 xZS )ClapAudioEncoderc                    t         |           t        |j                        | _        || _        t        |      | _        |j                  | _        | j                  j                  | _	        |j                  | _
        |j                  |j                  z  | _        t        |j                  d| j                  dz
  z  z        | _        t!        j"                  d|j$                  t'        |j                        d      D cg c]  }|j)                          }}| j                  j*                  }t-        | j                        D cg c]  }|d   d|z  z  |d   d|z  z  f c}| _        t1        j2                  t-        | j                        D cg c]  }t5        |t        |j                  d|z  z        | j.                  |   |j                  |   |j6                  |   |t'        |j                  d |       t'        |j                  d |dz           || j                  dz
  k  rt8        nd        c}      | _        d| _        t1        j>                  |j                        | _         t1        jB                  | j                        | _"        |j                  | _        t1        jF                  d      | _$        y c c}w c c}w c c}w )Nr*   r   r   cpur=   )r   r   r-  rh  r   r1  rf  F)%rl   rm   rA   depths
num_layersr   r   patch_embedr   r   r   num_mel_bins
freq_ratior   r   num_featuresr?   linspacer4  r   itemr   rd  input_resolutionsr   rc  r`  r   rs  layersgradient_checkpointingr   
batch_normr   r   AdaptiveAvgPool1davgpool)rd   r   xr4  r   ri  i_layerro   s          r&   rm   zClapAudioEncoder.__init__  sW   fmm,.v6#11 ,,99)) **f.A.AA ? ?!Z[H[B\ \],1NN1f>S>SUXY_YfYfUgpu,vwq!&&(ww$$..	\abfbqbq\r!sWX9Q<AqD#99Q<AqD;Q"R!smm  %T__5  !F;;ajHI%)%;%;G%D --0$88A,Sx1H-ICPVP]P]^k`gjk`kPlLmn9@4??UVCV9V4]a
 ',#..)<)<=LL!2!23	mm++A.3 x "ts   J<KB#Kc                    |j                   \  }}}}t        | j                  | j                  z        }| j                  | j                  z  }||kD  s||kD  rt	        d      ||k  r%t
        j                  j                  |||fdd      }||k  r%t
        j                  j                  |||fdd      }|j                   \  }}}	}
|j                  ||| j                  z  |	| j                  z  |
      }|j                  dddd      j                         }|j                  |||
| j                  z  |	| j                  z        }|S )	z
        The input is 4 normalized log mel spectrograms. It is reshape to the common shape of images. Each channel
        should represent 1 of the 4 crops of the spectrogram. For more details, refer to the [`ClapFeatureExtractor`].
        z@the wav size should be less than or equal to the swin input sizebicubicT)modealign_cornersr   r   r   r*   )r   r   r   r  r   r   rB   r'   r   r0   r1   )rd   normalized_input_featuresr   r#   freq_length
spec_widthspec_heightbatchr   timefreqs              r&   reshape_mel2imgz ClapAudioEncoder.reshape_mel2img  s`   
 *C)H)H&1k;$//9:
nn7#{['@_`` #(*(A(A)J+D9dh )B )% $(*(A(A)K+EIei )B )% '@&E&E#xt %>$E$E8doo-tt/F%
! %>$E$EaAq$Q$\$\$^!$=$E$E8TDOO3TT__5L%
! )(r(   N	is_longerr   output_hidden_states(output_hidden_states_before_downsamplingrN  return_dictr;   c                    |j                  dd      }| j                  |      }|j                  dd      }d }	| j                  r6|j                  |j                        }
t        j                  |
dk(        d   }	| j                  |      }|j                  d   }| j                  ||	      }|rdnd }|rdnd }|rdnd }| j                  d   }|rE|j                  \  }}} |j                  |g|| }|j                  dddd      }||fz  }||fz  }t        | j                        D ]  \  }}| j                  |   } |||||      }|d   }|d   }|d   }|d   |d   f}|rP|rN|j                  \  }}} |j                  |g|d   |d   f| }|j                  dddd      }||fz  }||fz  }nI|rG|sE|j                  \  }}} |j                  |g|| }|j                  dddd      }||fz  }||fz  }|s||dd  z  } | j                  |      }|j                  \  }}}|dt!        | j"                        dz
  z  z  | j$                  d   z  }|dt!        | j"                        dz
  z  z  | j$                  d   z  }|j                  ddd      j'                         j)                  ||||      }|j                  \  }}}}|| j*                  z  } |j)                  |||| z  | |      }|j                  ddddd      j'                         j)                  ||| d      }| j-                  t        j.                  |d            }!t        j.                  |!d      }!|st1        d	 ||!||fD              S t3        ||!||
      S )Nr   r   r   r*   rS   r   r-   r+   c              3   $   K   | ]  }|| 
 y wrk   rS   )rb   vs     r&   re   z+ClapAudioEncoder.forward.<locals>.<genexpr>  s      	 = 	s   rJ   pooler_outputr    rK   )r   r  r   tor>   r?   wherer  r   r  r  r/   r0   rk  r  r   rA   r  r   r1   r   r  r  r   rR   r   )"rd   input_featuresr  r   r  r  rN  r  r  is_longer_list_idxis_longer_listr    
frames_numall_hidden_statesall_reshaped_hidden_statesall_self_attentionsrM  r"   r   hidden_sizereshaped_hidden_stateri  rl  r]  rm  rp  rJ   
n_channels
freq_shapetemporal_shapen_frequenciesn_temp
c_freq_binlatent_outputs"                                     r&   r|   zClapAudioEncoder.forward/  sh    (11!Q7$(OON$C!$=$G$G1$M!!&\\.*?*?@N!&^q-@!A!!D,,-FG"((+
((8JK"6BD+?RT"$5b411!4)6)<)<&J;$6M$6$6z$bDT$bVa$b!$9$A$A!Q1$M!-!11&+@*BB&(5 	9OA|#55a8(8HJ[]mnM)!,M0=a0@- -a 0 1" 57H7LM#(P-N-T-T*
A{ )O(I(N(N)"3A"68I!8L!M)OZ)% )>(E(EaAq(Q%!&G%II!*/D.FF*%.V-:-@-@*
A{(:(:(::(fHX(fZe(f%(=(E(EaAq(Q%!m%55!*/D.FF* #}QR'88#?	9B !IIm4$5$;$;!
AzA#dkk*:Q*>$?@DDUDUVWDXX
#c$++.>.B(CDHYHYZ[H\\ %%aA.99;CCJPZ\fhvw 	 9J8O8O5
Jv"doo5
-55
MZ$?V
 %%aAq!4??AII*V`blnpq 	 U]]3Da%HImQ7 	 &!.'		 	 	 */'4*	
 	
r(   )NFFFFT)rL   rM   rN   rm   r  r?   rP   r  rR   rU   r|   r}   r~   s   @r&   r  r    s    &/P")N /3).,1@E(-#'p
 $$t+p
  $;	p

 #Tkp
 37+p
 +p
 D[p
 
%	%p
r(   r  c                   0     e Zd Zdeez  f fdZd Z xZS )ClapProjectionLayerr   c                     t         |           || _        |j                  }|j                  }t        j                  ||      | _        t        |j                     | _
        t        j                  ||      | _        y rk   )rl   rm   r   r  projection_dimr   r   linear1r	   projection_hidden_act
activationlinear2)rd   r   r  r  ro   s       r&   rm   zClapProjectionLayer.__init__  sa    ((..yyn= !=!=>yy@r(   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rk   )r  r  r  r  s     r&   r|   zClapProjectionLayer.forward  s2    ]36]3r(   )rL   rM   rN   r   r   rm   r|   r}   r~   s   @r&   r  r    s    A? Ar(   r  c                        e Zd ZdZ fdZ	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  ded	ej                  fd
Z
ed        Zedd       Z xZS )ClapTextEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 T   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j
                  |j                        | _
        t        j                  |j                        | _        | j                  dt!        j"                  |j$                        j'                  d      d       | j                  dt!        j(                  | j*                  j-                         t         j.                        d       |j                  | _        t        j                  |j$                  |j
                  | j0                        | _        y )	N)padding_idxr)  position_idsr   r-   T)
persistenttoken_type_ids)rs   )rl   rm   r   	Embedding
vocab_sizer  pad_token_idword_embeddingstype_vocab_sizetoken_type_embeddingsr   r.  r   r$  r   r   r?   r@   max_position_embeddingsexpandr   r  r   longr  position_embeddingsrd   r   ro   s     r&   rm   zClapTextEmbeddings.__init__  s4   !||F,=,=v?Q?Q_e_r_rs%'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXei 	 	
 	ekk$*;*;*@*@*B%**Ubf 	 	
 "..#%<<**F,>,>DL\L\$
 r(   N	input_idsr  r  inputs_embedspast_key_values_lengthr;   c                    |<|| j                  || j                  |      }n| j                  || j                        }||j                         }n|j                         d d }|\  }}|t	        | d      rT| j
                  j                  |j                  d   d      }	t        j                  |	d|      }	|	j                  ||      }n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j!                  |      }||z   }| j#                  |      }| j%                  |      }|S )Nr-   r  r   r   )r   indexrr   )"create_position_ids_from_input_idsr  &create_position_ids_from_inputs_embedsr   hasattrr  r  r   r?   gatherr   r  r  r>   r  r  r  r   r   )rd   r  r  r  r  r  input_shaper"   
seq_lengthbuffered_token_type_idsr  
embeddingsr  s                r&   r|   zClapTextEmbeddings.forward  sn    $#FFt//1G   $JJ=Z^ZjZjk #..*K',,.s3K!,
J
 !t-.*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
J!W!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
"66|D"55
^^J/
\\*-
r(   c                     | j                         dd }|d   }t        j                  |dz   ||z   dz   t        j                  | j                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr-   r   rr   r   )r   r?   r@   r  r>   r   r  )r  r  r  sequence_lengthr  s        r&   r  z9ClapTextEmbeddings.create_position_ids_from_inputs_embeds  sp     $((*3B/%a.||!O_{:Q>ejjYfYmYm
 %%a(//<<r(   c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
        are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:

        Returns: torch.Tensor
        r   r   )ner   r?   cumsumtype_asr  )r  r  r  maskincremental_indicess        r&   r  z5ClapTextEmbeddings.create_position_ids_from_input_ids  sW     ||K(,,.$||Da8@@FI__cgg"'')K77r(   )NNNNr   )r   )rL   rM   rN   rO   rm   r?   
LongTensorrP   r   r  r|   staticmethodr  r  r}   r~   s   @r&   r  r    s    Q
, .2260426&'.##d*. ((4/. &&-	.
 ((4/. !$. 
.` = =" 8 8r(   r  moduler   r   r   r   scalingr   c                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr*   r   r-   )r   rs   )prt   r   )r?   r   r   r   rB   r   float32r  rs   r   rt   r1   )
r  r   r   r   r   r  r   kwargsattn_weightsattn_outputs
             r&   eager_attention_forwardr     s     <<s}}Q':;gEL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r(   c                        e Zd Z fdZ	 	 d	dej
                  dej                  dz  dedz  dee	   de
ej
                     f
dZ xZS )
ClapTextSelfAttentionc                 $   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                         | _        |j                   | _        | j                  dz  | _        y )Nr   embedding_sizer   r   r         )rl   rm   r  r   r  r   r   r   r   r   r   r   r   r   r   r   r   r   attention_dropoutr  r  s     r&   rm   zClapTextSelfAttention.__init__8  sC    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF!'!D!D//5r(   Nr    r   r   r  r;   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	t        j                  | j                  j                  t              }
 |
| |||	|f| j                  sdn| j                  | j                  d|\  }} |j                  g |d j!                         }|r||f}|S |f}|S )Nr-   r   r*   rq   )r   r  )r   r   r   r/   r   r   r   r   get_interfacer   _attn_implementationr  rt   r  r  r   r1   )rd   r    r   r   r  r  r   query_states
key_statesvalue_statesattention_interfacer  r  r   s                 r&   r|   zClapTextSelfAttention.forwardM  sT    $))#2.CCbC$*B*BCzz-055lCMMaQRSXXm,11,?II!QO
zz-055lCMMaQRS(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFH1B;- JUr(   r  rL   rM   rN   rm   r?   r  rP   r  r   r   rR   r|   r}   r~   s   @r&   r  r  7  sg    60 48).	|| ))D0  $;	
 +, 
u||	r(   r  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ClapTextSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr)  )rl   rm   r   r   r  r  r   r.  r   r$  r   r  s     r&   rm   zClapTextSelfOutput.__init__q  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r(   r    r	  r;   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S rk   r  r   r   r  s      r&   r|   zClapTextSelfOutput.forwardw  7    

=1]3}|'CDr(   r  r~   s   @r&   r  r  p  1    >U\\  RWR^R^ r(   r  c                        e Zd Z fdZ	 	 d	dej
                  dej                  dz  dedz  dee	   de
ej
                     f
dZ xZS )
ClapTextAttentionc                 b    t         |           t        |      | _        t	        |      | _        y rk   )rl   rm   r  rd   r  r{   r  s     r&   rm   zClapTextAttention.__init__  s&    )&1	(0r(   Nr    r   r   r  r;   c                 n     | j                   |f||d|}| j                  |d   |      }|f|dd  z   }|S N)r   r   r   r   r  )rd   r    r   r   r  r  r  r   s           r&   r|   zClapTextAttention.forward  s\     !tyy
)/
 	
  ;;|AF#%QR(88r(   r  r  r~   s   @r&   r  r    sg    1 48).	|| ))D0  $;	
 +, 
u||	r(   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapTextIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y rk   )rl   rm   r   r   r  intermediate_sizer  r   r  r  r	   r  r  s     r&   rm   zClapTextIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r(   r    r;   c                 J    | j                  |      }| j                  |      }|S rk   r  r  s     r&   r|   zClapTextIntermediate.forward  r  r(   r  r~   s   @r&   r  r    r   r(   r  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ClapTextOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r  )rl   rm   r   r   r  r  r  r   r.  r   r$  r   r  s     r&   rm   zClapTextOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r(   r    r	  r;   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S rk   r
  r  s      r&   r|   zClapTextOutput.forward  r  r(   r  r~   s   @r&   r  r    r  r(   r  c                        e Zd Z fdZ	 	 d
dej
                  dej                  dz  dedz  dee	   de
ej
                     f
dZd	 Z xZS )ClapTextLayerc                     t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        y )Nr   )
rl   rm   r+  seq_len_dimr  r0  r  r3  r  r{   r  s     r&   rm   zClapTextLayer.__init__  sI    '-'E'E$*6208$V,r(   Nr    r   r   r  r;   c                      | j                   |f||d|}|d   }|dd  }t        | j                  | j                  | j                  |      }|f|z   }|S r  )r0  r   feed_forward_chunkr+  r  )	rd   r    r   r   r  self_attention_outputsr  r   r\  s	            r&   r|   zClapTextLayer.forward  s     "0"
)/"
 	"
 2!4(,0##T%A%A4CSCSUe
  /G+r(   c                 L    | j                  |      }| j                  ||      }|S rk   )r3  r{   )rd   r  intermediate_outputr\  s       r&   r   z ClapTextLayer.feed_forward_chunk  s,    "//0@A{{#68HIr(   r  )rL   rM   rN   rm   r?   r  rP   r  r   r   rR   r|   r   r}   r~   s   @r&   r  r    sl    - 48).	|| ))D0  $;	
 +, 
u||	.r(   r  c                        e Zd Z fdZe	 	 	 	 ddej                  dej                  dz  dedz  dedz  dedz  de	e
   d	eej                     ez  fd
       Z xZS )ClapTextEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r  )
rl   rm   r   r   rc  rd  num_hidden_layersr  layerr  )rd   r   ri  ro   s      r&   rm   zClapTextEncoder.__init__  sN    ]]5IaIaCb#caM&$9#cd
&+# $ds   A#Nr    r   r   r  r  r  r;   c                     |rdnd }|rdnd }t        | j                        D ])  \  }	}
|r||fz   } |
|||fi |}|d   }|s!||d   fz   }+ |r||fz   }t        |||      S )NrS   r   r   )rJ   r    rK   )rk  r(  r   )rd   r    r   r   r  r  r  r  r  ri  rl  r]  s               r&   r|   zClapTextEncoder.forward  s     #7BD$5b4(4 	POA|#$58H$H!(! 	M *!,M &9]1=M<O&O#	P   1]4D D++*
 	
r(   )NFFT)rL   rM   rN   rm   r   r?   r  rP   r  r   r   rR   r   r|   r}   r~   s   @r&   r%  r%    s    ,  48).,1#'"
||"
 ))D0"
  $;	"

 #Tk"
 D["
 +,"
 
u||		."
 "
r(   r%  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapTextPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y rk   )rl   rm   r   r   r  r  Tanhr  r  s     r&   rm   zClapTextPooler.__init__  s9    YYv1163E3EF
'')r(   r    r;   c                 \    |d d df   }| j                  |      }| j                  |      }|S r6  )r  r  )rd   r    first_token_tensorpooled_outputs       r&   r|   zClapTextPooler.forward  s6     +1a40

#566r(   r  r~   s   @r&   r+  r+    s#    $
U\\ ell r(   r+  c                   l    e Zd ZU eed<   dZdZdZ ej                         de
j                  fd       Zy)ClapPreTrainedModelr   clap)audiotextFr  c                    | j                   j                  }t        |t              rt	        j
                  |j                  j                  d|dz         t	        j
                  |j                  j                  d|dz         t	        j                  |j                  t        j                  |j                  j                  d         j                  d             t	        j                  |j                          yt        |t"              rt	        j$                  |j&                  t)        j*                  | j                   j,                               t	        j$                  |j.                  t)        j*                  | j                   j,                               yt        |t0        j2                        r&t	        j
                  |j                  d|dz         yt        |t0        j4                  t0        j6                  f      rt	        j                  |j8                         t	        j:                  |j                         t=        |dd      ^t	        j                  |j>                         t	        j:                  |j@                         t	        j                  |jB                         yyt        |t0        jD                  t0        jF                  f      r| j                   jH                  dz  d	| j                   jJ                  z  dz  z  |z  }t	        j
                  |j                  |
       |j8                   t	        j                  |j8                         yyt        |tL              rNt	        j                  |jN                         t	        j                  |jP                  |jS                                yy)zInitialize the weightsrq   g{Gz?)meanstdr-   r  running_meanNr  r*   )r8  )*r   initializer_factorr   r  initnormal_r  weightr  copy_r  r?   r@   r   r  zeros_r  	ClapModel	constant_logit_scale_ar   loglogit_scale_init_valuelogit_scale_tr   r  r   r   r   ones_r`   r9  running_varnum_batches_trackedr   r   r  r'  r   r   r   r   )rd   r  factorin_proj_stds       r&   _init_weightsz!ClapPreTrainedModel._init_weights"  sj    //f01LL33::&SW-XLL55<<3FUYMZJJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--.	*NN6//$++:\:\1]^NN6//$++:\:\1]^-LLSftmDr~~ >?KK$JJv}}%v~t4@F//0

6--.F667 A BII 67;;22D8a$++B_B_>_dh=hilrrKLLK8{{&FKK( ' 67KK;;<JJv55v7\7\7^_ 8r(   N)rL   rM   rN   r   rQ   base_model_prefixinput_modalitiessupports_gradient_checkpointingr?   no_gradr   r  rK  rS   r(   r&   r2  r2    sB    (&+#U]]_`BII ` `r(   r2  c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
e	 	 	 	 	 ddej                  dz  dej                  dz  d	edz  d
edz  dedz  deez  fd       Z xZS )ClapAudioModelr   r  r4  c                 d    t         |   |       t        |      | _        | j	                          y rk   )rl   rm   r  audio_encoder	post_initr  s     r&   rm   zClapAudioModel.__init__G  s'     -f5r(   r;   c                 B    | j                   j                  j                  S rk   )rS  r  r   rg   s    r&   get_input_embeddingsz#ClapAudioModel.get_input_embeddingsM  s    !!--222r(   Nr  r   r  r  c                     ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |||||      S )ad  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapAudioModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapAudioModel.from_pretrained("laion/clap-htsat-fused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> inputs = processor(audio=audio_sample, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```r  r  r   r  r  )r   use_return_dictr   r  rS  )rd   r  r  r   r  r  r  s          r&   r|   zClapAudioModel.forwardP  sy    @ &1%<k$++B]B]1B1N-TXT_T_TqTq$8$D $++JjJj 	 !!)/!5# " 
 	
r(   NNNNN)rL   rM   rN   r   rQ   main_input_namerM  rm   r   r  rV  r   r?   rP   
BoolTensorr  rR   r   r|   r}   r~   s   @r&   rQ  rQ  B  s    &O 3bii 3  48-1)-,0#'+
))D0+
 ##d*+
  $;	+

 #Tk+
 D[+
 
+	++
 +
r(   rQ  a0  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    c                   B    e Zd ZU eed<   dZd fd	Zd Zd Ze	e
	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dedz  dedz  dedz  deej                     ez  fd              Z xZS )ClapTextModelr   r5  c                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
rl   rm   r   r  r  r%  encoderr+  poolerrT  )rd   r   add_pooling_layerro   s      r&   rm   zClapTextModel.__init__  sM    
 	 ,V4&v.0AnV,t 	r(   c                 .    | j                   j                  S rk   r  r  rg   s    r&   rV  z"ClapTextModel.get_input_embeddings  s    ...r(   c                 &    || j                   _        y rk   re  rd   r   s     r&   set_input_embeddingsz"ClapTextModel.set_input_embeddings  s    */'r(   Nr  r   r  r  r  r   r  r  r;   c	                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      |#| j                  ||       |j                         }
n!||j                         d d }
nt	        d      |
\  }}||j                  n|j                  }|t        j                  ||f|      }|pt        | j                  d      r4| j                  j                  d d d |f   }|j                  ||      }|}n&t        j                  |
t        j                  |      }| j!                  ||
      }| j                  ||||      }| j#                  ||||d	      }|d
   }| j$                  | j%                  |      nd }t'        |||j(                  |j*                        S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer-   z5You have to specify either input_ids or inputs_embedsr=   r  rr   )r  r  r  r  T)r   r   r  r  r   r  )r   r   r  rY  r   %warn_if_padding_and_no_attention_maskr   r>   r?   onesr  r  r  r  r   r  get_extended_attention_maskra  rb  r   r    rK   )rd   r  r   r  r  r  r   r  r  r  r  r"   r  r>   r   buffered_token_type_ids_expandedextended_attention_maskembedding_outputencoder_outputssequence_outputr0  s                        r&   r|   zClapTextModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_al0m??%)'	 + 
 ,,2/!5 ' 
 *!,8<8OO4UY)-')77&11	
 	
r(   )T)NNNNNNNN)rL   rM   rN   r   rQ   rM  rm   rV  rh  r   r   r?   r  r  rR   r   r|   r}   r~   s   @r&   r^  r^    s      /0  *..2.2,0-1)-,0#'C
<<$&C
 t+C
 t+	C

 llT)C
 ||d*C
  $;C
 #TkC
 D[C
 
u||	K	KC
  C
r(   r^  c                   <    e Zd ZU eed<   def fdZee	 	 ddej                  dej                  dz  dej                  dz  de
e   deez  f
d	              Zee	 	 dd
ej                  dej                  dz  dej                  dz  de
e   deez  f
d              Zee	 	 	 	 	 	 	 	 	 ddej                   dz  d
ej"                  dz  dej$                  dz  dej                  dz  dej                   dz  dedz  dedz  dedz  dedz  deez  fd              Z xZS )r@  r   c                 .   t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }t        j                  t        j                  t        j                  |j                                    | _        t        j                  t        j                  t        j                  |j                                    | _        |j$                  | _        t'        |      | _        t+        |      | _        t/        |      | _        t+        |      | _        | j5                          y )NzKconfig.text_config is expected to be of type ClapTextConfig but is of type .zMconfig.audio_config is expected to be of type ClapAudioConfig but is of type )rl   rm   r   text_configr   	TypeErrortypeaudio_configr   r   r   r?   r:  r   rC  rD  rB  rE  r  r^  
text_modelr  text_projectionrQ  audio_modelaudio_projectionrT  )rd   r   ru  rx  ro   s       r&   rm   zClapModel.__init__  s=    &,,n=++,-Q0 
 &--?,,-.a1 
 ((**\\%,,txx@]@]7^*_`\\%,,txx@]@]7^*_`$33'42;?),7 3L A 	r(   Nr  r   r  r  r;   c                      | j                   d|||dd|}| j                  |j                        }t        j                  |d      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, ClapModel

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```T)r  r   r  r  r-   r   rS   )ry  rz  r  F	normalize)rd   r  r   r  r  text_outputstext_featuress          r&   get_text_featureszClapModel.get_text_features  sa    . 4C4?? 4
)%	4

 4
 ,,\-G-GH%&[[B%G"r(   r  r  c                      | j                   d||dd|}| j                  |j                        }t        j                  |d      |_        |S )a  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, ClapModel

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
        >>> random_audio = torch.rand((16_000))

        >>> inputs = feature_extractor(random_audio, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     audio_features = model.get_audio_features(**inputs)
        ```T)r  r  r  r-   r   rS   )r{  r|  r  r~  r  )rd   r  r  r   r  audio_outputsaudio_featuress          r&   get_audio_featureszClapModel.get_audio_features6  s\    8 5ED4D4D 5
)YD5
TZ5
 ..}/J/JK&'kk.b&I#r(   return_lossr   r  r  c
           	      l   ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	| j	                  ||||d      }| j                  |||||d      }|	s|d   n|j                  }| j                  |      }|	s|d   n|j                  }| j                  |      }||j                  ddd      z  }||j                  ddd      z  }| j                  j                         }| j                  j                         }t        j                  ||j                               |z  }t        j                  ||j                               |z  }d}|r,t!        |      }t!        |j                               }||z   d	z  }t#        |||||||
      S )a  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused")

        >>> input_text = ["Sound of a dog", "Sound of vacuum cleaner"]

        >>> inputs = processor(text=input_text, audio=audio_sample, return_tensors="pt", padding=True)

        >>> outputs = model(**inputs)
        >>> logits_per_audio = outputs.logits_per_audio  # this is the audio-text similarity score
        >>> probs = logits_per_audio.softmax(dim=-1)  # we can take the softmax to get the label probabilities
        ```NTrX  r  r   r  r   r  r  r   r*   r-   )r  r   keepdimg       @)rY   rZ   r[   rI   rV   r\   r]   )r   r   r  rY  r{  ry  r  r|  rz  r   rE  exprB  r?   r   trE   rX   )rd   r  r  r  r   r  r  r   r  r  r  r  r  rV   rI   logit_scale_textlogit_scale_audior[   rZ   rY   caption_loss
audio_losss                         r&   r|   zClapModel.forwardZ  s   V 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B](()/!5 ) 
 )%/!5 ' 
 0;}Q'@[@[,,\:-8l1ol>X>X**;7 $l&7&7!T&7&RR!K$4$4qb$$4$OO  --113 ..224,,{LNN4DEHXX <<kmmoFIZZ+O<L)*:*<*<*>?J :-4D-+#%*,
 	
r(   )NN)	NNNNNNNNN)rL   rM   rN   r   rQ   rm   r   r   r?   r  r   r   rR   r   r  r  r  rP   r\  r  rX   r|   r}   r~   s   @r&   r@  r@    s   z @  /3,0	<< t+ llT)	
 +, 
+	+  B  *..2	   <<$&  t+	 
 +,  
+	+    D  .237-1.204#')-,0#'^
##d*^
 ))D0^
 ##d*	^

 t+^
 &&-^
 D[^
  $;^
 #Tk^
 D[^
 
	^
  ^
r(   r@  c                       e Zd ZU eed<   dZdef fdZdej                  fdZ	d Z
ee	 	 	 	 	 	 ddej                  dz  d	ej                  dz  d
ej                  dz  dedz  dedz  dedz  deez  fd              Z xZS )ClapTextModelWithProjectionr   r_  c                     t         |   |       t        |      | _        t	        |      | _        | j                          y rk   )rl   rm   r^  ry  r  rz  rT  r  s     r&   rm   z$ClapTextModelWithProjection.__init__  s3     '/26:r(   r;   c                 B    | j                   j                  j                  S rk   ry  r  r  rg   s    r&   rV  z0ClapTextModelWithProjection.get_input_embeddings  s    ))999r(   c                 :    || j                   j                  _        y rk   r  rg  s     r&   rh  z0ClapTextModelWithProjection.set_input_embeddings  s    5:""2r(   Nr  r   r  r   r  r  c                    ||n| j                   j                  }| j                  |||||d      }|s|d   n|j                  }	| j	                  |	      }
t        |
|j                  |j                  |j                        S )a  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, ClapTextModelWithProjection

        >>> model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["a sound of a cat", "a sound of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```Tr  r   )rI   rJ   r    rK   )	r   rY  ry  r  rz  rH   rJ   r    rK   )rd   r  r   r  r   r  r  r  r  r0  rI   s              r&   r|   z#ClapTextModelWithProjection.forward  s    4 &1%<k$++B]B])%/!5 ' 
 0;Q@Z@Z**=9"#*<<&44#..	
 	
r(   )NNNNNN)rL   rM   rN   r   rQ   rM  rm   r   r  rV  rh  r   r   r?   r  r  rR   rH   r|   r}   r~   s   @r&   r  r    s     ~ :bii :;  *..2,0)-,0#',
<<$&,
 t+,
 llT)	,

  $;,
 #Tk,
 D[,
 
$	$,
  ,
r(   r  c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
ee	 	 	 	 	 ddej                  dz  dej                  dz  d	edz  d
edz  dedz  deez  fd              Z xZS )ClapAudioModelWithProjectionr   r  r4  c                     t         |   |       t        |      | _        t	        |      | _        | j                          y rk   )rl   rm   rQ  r{  r  r|  rT  r  s     r&   rm   z%ClapAudioModelWithProjection.__init__  s4     )&1 3F ;r(   r;   c                 V    | j                   j                  j                  j                  S rk   )r{  rS  r  r   rg   s    r&   rV  z1ClapAudioModelWithProjection.get_input_embeddings  s     --99>>>r(   Nr  r   r  r  c                 l   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||||d      }|s|d   n|j
                  }| j                  |      }	t        |	|j                  |j                  |j                        S )au  
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import ClapAudioModelWithProjection, ClapProcessor

        >>> model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused")
        >>> processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> inputs = processor(audio=audio_sample, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> audio_embeds = outputs.audio_embeds
        ```TrX  r   )rV   rJ   rK   r    )r   rY  r   r  r{  r  r|  rU   rJ   rK   r    )
rd   r  r  r   r  r  r  r  r0  rV   s
             r&   r|   z$ClapAudioModelWithProjection.forward  s    @ &1%<k$++B]B]1B1N-TXT_T_TqTq$8$D $++JjJj 	 (()/!5 ) 
 1<a(A\A\,,];#%+==$//'55	
 	
r(   rZ  )rL   rM   rN   r   rQ   r[  rM  rm   r   r  rV  r   r   r?   rP   r\  r  rR   rU   r|   r}   r~   s   @r&   r  r     s    &O ?bii ?  48-1)-,0#'5
))D05
 ##d*5
  $;	5

 #Tk5
 D[5
 
%	%5
  5
r(   r  )r@  r2  r^  r  rQ  r  )rq   )VrO   r   r   collections.abcr   dataclassesr   typingr   r?   torch.nn.functionalr   rB   r~   r   r;  activationsr	   modeling_layersr
   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   r   configuration_clapr   r   r   
get_loggerrL   loggerr'   r7   r9   r  rE   rH   rU   rX   r  ri   r   r   r   r  r  r  r"  r'  r`  rs  r  r  r  floatr  r  r  r  r  r  r  r%  r+  r2  rQ  r^  r@  r  r  __all__rS   r(   r&   <module>r     s      $ !      & ! 9 
 G & 6 j j K K 
		H	%"**7U\\ 7ell 7
 	<+ 	< 	< 
	<; 	< 	<  
  
   
H299 2%		 %P_")) _FZ'RYY Z'|
")) 
 &BII  	bii 	wRYY wv4/ 4p3BII 3l}
ryy }
@")) &g8 g8d %II%<<% 
% <<	%
 LL4'% % %.5BII 5r 		 2299  RYY #. #N*
bii *
\RYY  #`/ #` #`L:
( :
z _
' _
_
D J
# J
 J
Z ?
"5 ?
 ?
D F
#6 F
 F
Rr(   