
    qiR                        d Z ddlZddlmZ ddlZddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z& ddl'm(Z(  e!jR                  e*      Z+d Z,d Z-d Z.d Z/d Z0 G d dej
                  jb                        Z2 G d dejb                        Z3 G d dejb                        Z4	 	 dHdejb                  dejj                  dejj                  d ejj                  d!ejj                  dz  d"e6dz  d#e6d$ee   fd%Z7 G d& d'ejb                        Z8 G d( d)ejb                        Z9 G d* d+ejb                        Z: G d, d-ejb                        Z; G d. d/ejb                        Z< G d0 d1e      Z= G d2 d3ejb                        Z> G d4 d5ejb                        Z?e G d6 d7e             Z@e G d8 d9e@             ZAe G d: d;e@             ZB G d< d=ejb                        ZC ed>?       G d@ dAe@             ZDe G dB dCe@             ZE G dD dEejb                        ZFdF ZGg dGZHy)IzPyTorch ESM model.    N)Callable)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentionsMaskedLMOutputSequenceClassifierOutputTokenClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)OutputRecordercapture_outputs   )	EsmConfigc                 b    | j                  dd      \  }}t        j                  | |fd      S )N   dim)chunktorchcat)xx1x2s      V/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/esm/modeling_esm.pyrotate_halfr*   -   s/    WWQBWFB99rc2YB''    c                     |d d d d d | j                   d   d d f   }|d d d d d | j                   d   d d f   }| |z  t        |       |z  z   S )N)shaper*   )r&   cossins      r)   apply_rotary_pos_embr1   2   sX    
aMaggbkM1$
%C
aMaggbkM1$
%CGA,--r+   c                 j    | dz  dt        j                  | t        j                  d      z        z   z  S )zo
    This is the gelu implementation from the original ESM repo. Using F.gelu yields subtly wrong results.
    g      ?      ?g       @)r$   erfmathsqrtr&   s    r)   gelur8   9   s.     s7cEIIa$))C.&899::r+   c                 ,    | | j                  dd      z   S )zJMake layer symmetric in final two dimensions, used for contact prediction.r    r-   )	transposer7   s    r)   
symmetrizer;   @   s    q{{2r"""r+   c                     | j                  dd      }| j                  dd      }| j                  dd      }||z  }|j                  |       | |z
  }|S )z=Perform average product correct, used for contact prediction.r    T)keepdimsr-   )r    r-   )sumdiv_)r&   a1a2a12avg
normalizeds         r)   average_product_correctrE   E   s[    	
rD	!B	
rD	!B
%%4%
(C
r'CHHSMSJr+   c                        e Zd ZU dZej
                  ed<   def fdZd
dZ	dej
                  dej
                  de
ej
                  ej
                  f   fd	Z xZS )RotaryEmbeddingz
    Rotary position embeddings based on those in
    [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation
    matrices which depend on their relative positions.
    inv_freqr"   c                     t         |           || _        ddt        j                  d|dt        j
                        j                         |z  z  z  }| j                  d|       d | _        d | _	        d | _
        y )Nr3   '  r   r   dtyperH   )super__init__r"   r$   arangeint64floatregister_buffer_seq_len_cached_cos_cached_sin_cached)selfr"   rH   	__class__s      r)   rN   zRotaryEmbedding.__init__Z   sn    %ELLC%++$N$T$T$VY\$\]^Z2#r+   c                 t   |j                   |   }|| j                  k7  s#| j                  j                  |j                  k7  r|| _        t	        j
                  |j                   |   |j                        j                  | j                        }t	        j                  || j                        }t	        j                  ||fd      j                  |j                        }|j                         d d d d d d f   | _        |j                         d d d d d d f   | _        | j                  | j                  fS )Ndevicer    r!   )r.   rS   rT   rZ   r$   rO   type_asrH   outerr%   tor/   r0   rU   )rV   r&   seq_dimensionseq_lentfreqsembs          r)   _update_cos_sin_tablesz&RotaryEmbedding._update_cos_sin_tablese   s    ''-( d***d.>.>.E.E.Q#*D QWW]3AHHEMMdmm\AKK4==1E))UEN366qxx@C"wwytQ)9:D"wwytQ)9:D!1!111r+   qkreturnc                 .   | j                  |d      \  | _        | _        t        || j                  | j                        j	                  |j
                        t        || j                  | j                        j	                  |j
                        fS )Nr-   )r^   rK   )rc   rT   rU   r1   r]   rL   )rV   rd   re   s      r)   forwardzRotaryEmbedding.forwardu   s    -1-H-HZ\-H-]*$* !D$4$4d6F6FGJJQRQXQXJY D$4$4d6F6FGJJQRQXQXJY
 	
r+   )r   )__name__
__module____qualname____doc__r$   Tensor__annotations__intrN   rc   tuplerh   __classcell__rW   s   @r)   rG   rG   Q   sY     ll	 C 	 2 
 
%,, 
5u||A[;\ 
r+   rG   c                   8     e Zd ZdZ	 	 ddedef fdZd Z xZS )EsmContactPredictionHeadzWPerforms symmetrization, apc, and computes a logistic regression on the output featuresin_featureseos_idxc                     t         |           || _        || _        t	        j
                  |d|      | _        t	        j                         | _        y )Nr   )	rM   rN   ru   rv   r   Linear
regressionSigmoid
activation)rV   ru   biasrv   rW   s       r)   rN   z!EsmContactPredictionHead.__init__   s@     	&))KD9**,r+   c                 X   |j                  | j                        j                  |      }|j                  d      |j                  d      z  }||d d d d d d d d f   z  }|dd dd df   }|ddd dd f   }|j	                         \  }}}}}|j                  |||z  ||      }|j                  | j                  j                  j                        }t        t        |            }|j                  dddd      }| j                  | j                  |      j                  d            S )Nr   r   .r    r   r   )nerv   r]   	unsqueezesizeviewry   weightrZ   rE   r;   permuter{   squeeze)	rV   tokens
attentionseos_mask
batch_sizelayersheadsseqlen_s	            r)   rh   z EsmContactPredictionHead.forward   s!   99T\\*--j9%%a(8+=+=a+@@(1dD!Q+>"??
SbS#2#.
QR,
/9/@,
FE61__Z%P
  ]]OO""))

 -Z
-CD
''1a3
tz:BB1EFFr+   )Tr   )ri   rj   rk   rl   ro   rN   rh   rq   rr   s   @r)   rt   rt   ~   s+    a
 	
'
' 	
'Gr+   rt   c                   8     e Zd ZdZ fdZ	 	 	 	 ddZd Z xZS )EsmEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    t         |           t        j                  |j                  |j
                  |j                        | _        |j                  r1t        j                  |j
                  |j                        | _        nd | _        t        j                  |j                        | _        t        |dd      | _        | j#                  dt%        j&                  |j(                        j+                  d      d       |j                  | _        | j                   dk(  r;t        j                  |j(                  |j
                  | j,                        | _        |j0                  | _        |j2                  | _        y )	N)padding_idxepsposition_embedding_typeabsoluteposition_idsr   r    F)
persistent)rM   rN   r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsemb_layer_norm_before	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropoutgetattrr   rR   r$   rO   max_position_embeddingsexpandr   position_embeddingstoken_dropoutmask_token_idrV   configrW   s     r)   rN   zEsmEmbeddings.__init__   s*   !||F,=,=v?Q?Q_e_r_rs'' ll6+=+=6CXCXYDO"DOzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 "..'':5')||..0B0BPTP`P`(D$ $11#11r+   c                    |*|t        || j                        }n| j                  |      }|| j                  |      }|}| j                  r||j                  || j                  k(  j                  d      d      }d}||j                  d      n|j                  d   }|| j                  k(  j                  d      j                         |z  }|d|z
  z  d|z
  d d d d f   z  j                  |j                        }| j                  dk(  r| j                  |      }	||	z   }| j                  | j                  |      }|-||j                  d      z  j                  |j                        }|S )Nr            gQ?r   r   )"create_position_ids_from_input_idsr   &create_position_ids_from_inputs_embedsr   r   masked_fillr   r   r>   r.   rQ   r]   rL   r   r   r   )
rV   	input_idsattention_maskr   inputs_embeds
embeddingsmask_ratio_trainsrc_lengthsmask_ratio_observedr   s
             r)   rh   zEsmEmbeddings.forward   s    $A)TM]M]^#JJ=Y  00;M #
 )"7#//d>P>P1P0[0[\^0_adeJ)4B4N.,,R0T]TcTcdeTfK#,0B0B#B"G"G"K"Q"Q"SVa"a$,<(<=EXAXZ[]acgZg@hhll  J '':5"&":":<"H#&99J??&4J%$~'?'?'CCGG
HXHXYJ r+   c                    |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr    r   )rL   rZ   r   )r   r$   rO   r   longrZ   r   r   )rV   r   input_shapesequence_lengthr   s        r)   r   z4EsmEmbeddings.create_position_ids_from_inputs_embeds   s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r+   )NNNN)ri   rj   rk   rl   rN   rh   r   rq   rr   s   @r)   r   r      s&    22 /b=r+   r   modulequerykeyvaluer   scalingr   kwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr          r   r   r!   )ptrainingr   )
r   r$   matmulr:   r   
functionalsoftmaxr   r   
contiguous)
r   r   r   r   r   r   r   r   attn_weightsattn_outputs
             r)   eager_attention_forwardr     s     **R.D( <<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r+   c                        e Zd Zd
 fd	Z	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  dee   de	ej
                     fd	Z
 xZS )EsmSelfAttentionNc                    t         |           || _        |j                  |j                  z  dk7  r2t        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        |j                  | _        d | _        |xs t%        |dd      | _        | j&                  dk(  rt)        | j                  	      | _        d
| _        |j,                  | _        || _        | j,                  xr | | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r   r   rotaryr!   r3   )rM   rN   r   r   num_attention_headshasattr
ValueErrorro   attention_head_sizeall_head_sizer   rx   r   r   r   attention_probs_dropout_probr   rotary_embeddingsr   r   rG   r   
is_decoder	layer_idx	is_causal)rV   r   r   r   is_cross_attentionrW   s        r)   rN   zEsmSelfAttention.__init__  s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
::!%'> (
'-zC
$ ''83%49Q9Q%RD" ++"C1C-Cr+   hidden_statesr   encoder_hidden_statesencoder_attention_maskr   rf   c                    |j                   d d \  }}||d| j                  f}| j                  |      j                  |      j	                  dd      }	|d u}
|
r|n|}|
r|n|}| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }|	| j                  dz  z  }	| j                  dk(  r| j                  |	|      \  }	}t        j                  | j                  j                  t              } || |	|||f| j                  sdn| j                  | j                   d|\  }}|j#                  ||d      j%                         }||fS )Nr    r   r   r   r   r   )r   r   )r.   r   r   r   r:   r   r   r   r   r   get_interfacer   _attn_implementationr   r   r   r   reshaper   )rV   r   r   r   r   r   r   
seq_lengthhidden_shapequery_layerr   current_states	key_layervalue_layerattention_interfacer   r   s                    r)   rh   zEsmSelfAttention.forward>  s    "/!4!4Sb!9
J"JD4L4LMjj/44\BLLQPQR2$>2D.-3E/>HH^,11,?II!QO	jj055lCMMaQRS "D$<$<d$BB''83%)%;%;K%S"K(?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
!\ "))*j"EPPRL((r+   )NNFNNN)ri   rj   rk   rN   r$   rm   FloatTensorr   r   rp   rh   rq   rr   s   @r)   r   r     s    DF 48:>;?,)||,) ))D0,)  %0047	,)
 !& 1 1D 8,) +,,) 
u||	,)r+   r   c                   $     e Zd Z fdZd Z xZS )EsmSelfOutputc                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y N)	rM   rN   r   rx   r   denser   r   r   r   s     r)   rN   zEsmSelfOutput.__init__n  sB    YYv1163E3EF
zz&"<"<=r+   c                 T    | j                  |      }| j                  |      }||z   }|S r   r   r   rV   r   input_tensors      r)   rh   zEsmSelfOutput.forwards  .    

=1]3%4r+   ri   rj   rk   rN   rh   rq   rr   s   @r)   r   r   m      >
r+   r   c                   :     e Zd Zd fd	Z	 	 	 ddee   fdZ xZS )EsmAttentionc                     t         |           t        |||      | _        t	        |      | _        t        j                  |j                  |j                        | _        y )N)r   r   r   )
rM   rN   r   rV   r   outputr   r   r   r   )rV   r   r   r   rW   s       r)   rN   zEsmAttention.__init__{  sI    $VyUgh	#F+f&8&8f>S>STr+   r   c                     | j                  |      } | j                  |f|||d|\  }}| j                  ||      }|S )Nr   r   r   )r   rV   r   )	rV   r   r   r   r   r   hidden_states_lnr   r   s	            r)   rh   zEsmAttention.forward  sZ      >>-8"
)"7#9	

 
Q kk+}=r+   )NFr   )ri   rj   rk   rN   r   r   rh   rq   rr   s   @r)   r   r   z  s)    U "# +,r+   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )EsmIntermediatec                     t         |           t        j                  |j                  |j
                        | _        y r   )rM   rN   r   rx   r   intermediate_sizer   r   s     r)   rN   zEsmIntermediate.__init__  s,    YYv1163K3KL
r+   r   rf   c                 >    | j                  |      }t        |      }|S r   )r   r8   )rV   r   s     r)   rh   zEsmIntermediate.forward  s     

=1]+r+   ri   rj   rk   rN   r$   rm   rh   rq   rr   s   @r)   r  r    s$    MU\\ ell r+   r  c                   $     e Zd Z fdZd Z xZS )	EsmOutputc                     t         |           t        j                  |j                  |j
                        | _        t        j                  |j                        | _	        y r   )
rM   rN   r   rx   r  r   r   r   r   r   r   s     r)   rN   zEsmOutput.__init__  sB    YYv779K9KL
zz&"<"<=r+   c                 T    | j                  |      }| j                  |      }||z   }|S r   r   r   s      r)   rh   zEsmOutput.forward  r   r+   r   rr   s   @r)   r  r    r   r+   r  c                   >     e Zd Z fdZ	 	 	 ddee   fdZd Z xZS )EsmLayerc                    t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        | j                  r,| j                  st        |  d      t	        |d      | _	        t        |      | _        t        |      | _        t        j                  |j                   |j"                        | _        y )Nr   z> should be used as a decoder model if cross attention is addedT)r   r   )rM   rN   chunk_size_feed_forwardseq_len_dimr   	attentionr   add_cross_attentionRuntimeErrorcrossattentionr  intermediater  r   r   r   r   r   r   s     r)   rN   zEsmLayer.__init__  s    '-'E'E$%f- ++#)#=#= ##??"dV+i#jkk".v$"OD+F3'f&8&8f>S>STr+   r   c                      | j                   |fd|i|}| j                  r4|2t        | d      st        d|  d       | j                  |f|||d|}| j                  |      }|S )Nr   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )r  r   r   AttributeErrorr  feed_forward_chunk)rV   r   r   r   r   r   attention_outputlayer_outputs           r)   rh   zEsmLayer.forward  s     *4>>
)
 
 ??4@4!12$=dV D` ` 
  3t22  -&;'=	 
   ../?@r+   c                 n    | j                  |      }| j                  |      }| j                  ||      }|S r   )r   r  r   )rV   r  attention_output_lnintermediate_outputr  s        r)   r  zEsmLayer.feed_forward_chunk  s<    "nn-=>"//0CD{{#68HIr+   r   )	ri   rj   rk   rN   r   r   rh   r  rq   rr   s   @r)   r  r    s/    U$ "# +,@r+   r  c                   B     e Zd Z fdZe	 	 	 ddee   fd       Z xZS )
EsmEncoderc                 0   t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        t        j                  |j                  |j                        | _        d| _        y c c}w )Nr   F)rM   rN   r   r   
ModuleListrangenum_hidden_layersr  layerr   r   r   emb_layer_norm_aftergradient_checkpointing)rV   r   r   rW   s      r)   rN   zEsmEncoder.__init__  sm    ]]eFD\D\>]#^HV$4#^_
$&LL1C1CI^I^$_!&+# $_s   Br   c                     t        | j                        D ]  \  }} ||f|||d|} | j                  r| j                  |      }t        |      S )Nr   )last_hidden_state)	enumerater#  r$  r   )rV   r   r   r   r   r   ilayer_modules           r)   rh   zEsmEncoder.forward  sk      )4 	OA|(-&;'=	
 M	 $$ 55mDM1MRRr+   r   )	ri   rj   rk   rN   r   r   r   rh   rq   rr   s   @r)   r  r    s:    ,  "#S +,S Sr+   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )	EsmPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )rM   rN   r   rx   r   r   Tanhr{   r   s     r)   rN   zEsmPooler.__init__  s9    YYv1163E3EF
'')r+   r   rf   c                 \    |d d df   }| j                  |      }| j                  |      }|S Nr   )r   r{   )rV   r   first_token_tensorpooled_outputs       r)   rh   zEsmPooler.forward  s6     +1a40

#566r+   r  rr   s   @r)   r,  r,    s#    $
U\\ ell r+   r,  c                        e Zd ZU eed<   dZdZdZg dZdgZ	dZ
dZdZdZe eedd	      g eedd
	      gdZ ej&                          fd       Zd Z xZS )EsmPreTrainedModelr   esmTF)r  #EsmFoldTriangularSelfAttentionBlockr   zposition_embeddings.weightr   r  )index
layer_namer  )r   r   cross_attentionsc                 ^   t         |   |       t        |t              r t	        j
                  |j                         yt        |t              rZt	        j                  |j                  t        j                  |j                  j                  d         j                  d             yt        |t              rsddt        j                  d|j                  dt        j                         j#                         |j                  z  z  z  }t	        j                  |j$                  |       yy)	zInitialize the weightsr    r   r3   rJ   r   r   rK   N)rM   _init_weights
isinstance	EsmLMHeadinitzeros_r|   r   copy_r   r$   rO   r.   r   rG   r"   rP   rQ   rH   )rV   r   rH   rW   s      r)   r;  z EsmPreTrainedModel._init_weights*  s     	f%fi(KK$.JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh0eQ

AU[[(Y(_(_(adjdndn(nopHJJv1 1r+   c                      y r    rV   s    r)   get_output_embeddingsz(EsmPreTrainedModel.get_output_embeddings6  s     r+   )ri   rj   rk   r   rn   base_model_prefixsupports_gradient_checkpointingaccepts_loss_kwargs_no_split_modules"_keys_to_ignore_on_load_unexpected_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr  r   r   _can_record_outputsr$   no_gradr;  rD  rq   rr   s   @r)   r4  r4    s    &*#\*F)G&N"& "%&6aKXY+1AQR
 U]]_	2 	2r+   r4  c                   R    e Zd ZdZd fd	Zd Zd Zeee		 	 	 	 	 	 dde
j                  dz  de
j                  dz  de
j                  dz  d	e
j                  dz  d
e
j                  dz  de
j                  dz  dee   dee
j                     ez  fd                     Zd Zd Z xZS )EsmModela  

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    c                    t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        t        |j                  |j                  z  d      | _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        NT)ru   r|   )rM   rN   r   r   r   r  encoderr,  poolerrt   r"  r   contact_head	post_init)rV   r   add_pooling_layerrW   s      r)   rN   zEsmModel.__init__J  sq    
 	 '/!&)+<i'$40063M3MMTX

 	r+   c                 .    | j                   j                  S r   r   r   rC  s    r)   get_input_embeddingszEsmModel.get_input_embeddings^  s    ...r+   c                 &    || j                   _        y r   rY  )rV   r   s     r)   set_input_embeddingszEsmModel.set_input_embeddingsa  s    */'r+   Nr   r   r   r   r   r   r   rf   c           
      r   |du |duz  rt        d      || j                  |||      }| j                  ||||t        j                  |j
                  d   |j                        d      \  }} | j                  |f|||d|}|d   }	| j                  | j                  |	      nd}
t        |	|
	      S )
aV  
        input_ids (`torch.LongTensor` of shape `((batch_size, sequence_length))`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        position_ids (`torch.LongTensor` of shape `((batch_size, sequence_length))`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `((batch_size, sequence_length), hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nz:You must specify exactly one of input_ids or inputs_embeds)r   r   r   r   rY   )r   r   embedding_outputr   cache_positionpast_key_valuesr   r   )r'  pooler_output)
r   r   _create_attention_masksr$   rO   r.   rZ   rS  rT  r   )rV   r   r   r   r   r   r   r   encoder_outputssequence_outputr2  s              r)   rh   zEsmModel.forwardd  s    > -t";<YZZ  !OO#-) , M 261M1M)#9*"7 <<(;(;A(>}G[G[\  2N 2
.. '$,,
)"7#9	

 
 *!,8<8OO4UY;-'
 	
r+   c                     | j                   j                  rt        | j                   ||||      }nt        | j                   ||      }|t        | j                   |||      }||fS )N)r   r   r   r_  r`  )r   r   r   )r   r   r   r   )r   r   r   r
   )rV   r   r   r^  r   r_  r`  s          r)   rb  z EsmModel._create_attention_masks  sx     ;;!!/{{.-- /N 7{{.-N "-%>{{.5&;	&" 555r+   c                 H    | ||dd      j                   }t        j                  |d      }||j                  d      j                  d      j                  d      z  }||j                  d      j                  d      j                  d      z  }| j	                  ||      S )NT)r   return_dictoutput_attentionsr   r!   r   r      )r   r$   stackr   rU  )rV   r   r   attnss       r)   predict_contactszEsmModel.predict_contacts  s    VN`deppEq)
 	))!,66q9CCAFF))!,66q9CCAFF  //r+   )T)NNNNNN)ri   rj   rk   rl   rN   rZ  r\  r   r   r   r$   rm   r   r   rp   r   rh   rb  rl  rq   rr   s   @r)   rQ  rQ  <  s    
(/0   *..2,0-1596:?
<<$&?
 t+?
 llT)	?

 ||d*?
  %||d2?
 !&t 3?
 +,?
 
u||	K	K?
    ?
D 6D	0r+   rQ  c                   J    e Zd ZddiZ fdZd Zd Zee	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  dee   deez  fd              Zd Z xZS )EsmForMaskedLMzlm_head.decoder.weightz%esm.embeddings.word_embeddings.weightc                     t         |   |       |j                  rt        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzjIf you want to use `EsmForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.FrW  )
rM   rN   r   loggerwarningrQ  r5  r=  lm_headrV  r   s     r)   rN   zEsmForMaskedLM.__init__  sP     NN1
 Fe< (r+   c                 .    | j                   j                  S r   rs  decoderrC  s    r)   rD  z$EsmForMaskedLM.get_output_embeddings  s    ||###r+   c                 &    || j                   _        y r   ru  )rV   new_embeddingss     r)   set_output_embeddingsz$EsmForMaskedLM.set_output_embeddings  s    -r+   Nr   r   r   r   r   r   labelsr   rf   c           	      p    | j                   |f|||||d|}	|	d   }
| j                  |
      }d}|at               }|j                  |j                        } ||j                  d| j                  j                        |j                  d            }t        |||	j                  |	j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        )r   r   r   r   r   r   Nr    losslogitsr   r   )r5  rs  r   r]   rZ   r   r   r   r   r   r   )rV   r   r   r   r   r   r   rz  r   outputsrd  prediction_scoresmasked_lm_lossloss_fcts                 r)   rh   zEsmForMaskedLM.forward  s    ( $((
)%'"7#9
 
 "!* LL9')HYY0778F%&7&<&<RAWAW&XZ`ZeZefhZijN$!//))	
 	
r+   c                 <    | j                   j                  ||      S )N)r   )r5  rl  )rV   r   r   s      r)   rl  zEsmForMaskedLM.predict_contacts  s    xx(((OOr+   )NNNNNNN)ri   rj   rk   _tied_weights_keysrN   rD  ry  r   r   r$   
LongTensorrm   r   r   r   rp   r   rh   rl  rq   rr   s   @r)   rn  rn    s   24[\$.  .2.20426:>6:*.*
##d**
 t+*
 &&-	*

 ((4/*
  %0047*
 !&t 3*
   4'*
 +,*
 
	*
  *
XPr+   rn  c                   (     e Zd ZdZ fdZd Z xZS )r=  z&ESM Head for masked language modeling.c                    t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                  d      | _
        t        j                  t        j                  |j                              | _        y )Nr   F)r|   )rM   rN   r   rx   r   r   r   r   r   r   rv  	Parameterr$   zerosr|   r   s     r)   rN   zEsmLMHead.__init__$  s    YYv1163E3EF
,,v'9'9v?T?TUyy!3!3V5F5FUSLLV->->!?@	r+   c                     | j                  |      }t        |      }| j                  |      }| j                  |      | j                  z   }|S r   )r   r8   r   rv  r|   rV   featuresr   r&   s       r)   rh   zEsmLMHead.forward,  sD    JJx GOOA LLOdii'r+   ri   rj   rk   rl   rN   rh   rq   rr   s   @r)   r=  r=  !  s    0Ar+   r=  z
    ESM Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    )custom_introc                        e Zd Z fdZee	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  de
e   d	eez  fd
              Z xZS )EsmForSequenceClassificationc                     t         |   |       |j                  | _        || _        t	        |d      | _        t        |      | _        | j                          y NFrp  )	rM   rN   
num_labelsr   rQ  r5  EsmClassificationHead
classifierrV  r   s     r)   rN   z%EsmForSequenceClassification.__init__=  sH      ++Fe</7r+   Nr   r   r   r   rz  r   rf   c                     | j                   |f|||d|}|d   }| j                  |      }	d}
||j                  |	j                        }| j                  j
                  | j                  dk(  rd| j                  _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                  _        nd| j                  _        | j                  j
                  dk(  rIt               }| j                  dk(  r& ||	j                         |j                               }
n ||	|      }
n| j                  j
                  dk(  r=t               } ||	j                  d| j                        |j                  d            }
n,| j                  j
                  dk(  rt               } ||	|      }
t!        |
|	|j"                  |j$                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        r   r   r   r   Nr   ry   single_label_classificationmulti_label_classificationr    r|  )r5  r  r]   rZ   r   problem_typer  rL   r$   r   ro   r   r   r   r   r   r   r   r   rV   r   r   r   r   rz  r   r  rd  r~  r}  r  s               r)   rh   z$EsmForSequenceClassification.forwardG  s   $ $((
)%'	

 
 "!*1YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
r+   NNNNN)ri   rj   rk   rN   r   r   r$   r  rm   r   r   r   rp   r   rh   rq   rr   s   @r)   r  r  6  s      .2.20426*.8
##d*8
 t+8
 &&-	8

 ((4/8
   4'8
 +,8
 
)	)8
  8
r+   r  c                        e Zd Z fdZee	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  de
e   d	eez  fd
              Z xZS )EsmForTokenClassificationc                 0   t         |   |       |j                  | _        t        |d      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r  )rM   rN   r  rQ  r5  r   r   r   r   rx   r   r  rV  r   s     r)   rN   z"EsmForTokenClassification.__init__  si      ++Fe<zz&"<"<=))F$6$68I8IJr+   Nr   r   r   r   rz  r   rf   c                 z    | j                   |f|||d|}|d   }| j                  |      }| j                  |      }	d}
|Wt               }|j	                  |	j
                        } ||	j                  d| j                        |j                  d            }
t        |
|	|j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        r  r   Nr    r|  )r5  r   r  r   r]   rZ   r   r  r   r   r   r  s               r)   rh   z!EsmForTokenClassification.forward  s      $((
)%'	

 
 "!*,,71')HYYv}}-FFKKDOO<fkk"oND$!//))	
 	
r+   r  )ri   rj   rk   rN   r   r   r$   r  rm   r   r   r   rp   r   rh   rq   rr   s   @r)   r  r    s      .2.20426*.'
##d*'
 t+'
 &&-	'

 ((4/'
   4''
 +,'
 
&	&'
  '
r+   r  c                   (     e Zd ZdZ fdZd Z xZS )r  z-Head for sentence-level classification tasks.c                 &   t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _
        y r   )rM   rN   r   rx   r   r   r   r   r   r  out_projr   s     r)   rN   zEsmClassificationHead.__init__  s`    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHr+   c                     |d d dd d f   }| j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S r0  )r   r   r$   tanhr  r  s       r)   rh   zEsmClassificationHead.forward  sY    Q1WLLOJJqMJJqMLLOMM!r+   r  rr   s   @r)   r  r    s    7Ir+   r  c                     | j                  |      j                         }t        j                  |d      j	                  |      |z  }|j                         |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   r!   )r~   ro   r$   cumsumr[   r   )r   r   maskincremental_indicess       r)   r   r     sP     <<$((*D,,t3;;DADH##%33r+   )rn  r  r  rQ  r4  )Nr   )Irl   r5   collections.abcr   r$   r   torch.nnr   r   r    r	   r>  masking_utilsr
   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   r   configuration_esmr   
get_loggerri   rq  r*   r1   r8   r;   rE   ModulerG   rt   r   rm   rQ   r   r   r   r   r  r  r  r  r,  r4  rQ  rn  r=  r  r  r  r   __all__rB  r+   r)   <module>r     s     $   A A & J 9  G & R R 7 E ( 
		H	%(
.;#
	*
ehhoo *
Z Gryy  GF\=BII \=L !%II%<<% 
% <<	%
 LL4'% T\% % '(%8M)ryy M)`
BII 
299 8bii 
		 
4) 4nS SB		  # # #L W0! W0 W0t FP' FP FPR		 * E
#5 E
E
P 4
 2 4
 4
nBII &4 r+   